/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 * Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 * Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any.
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
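
/*
 * Illustrative only, not from the original file: a minimal sketch of
 * hooking the idle notifier chain. my_idle_event, my_idle_nb and
 * do_something_cheap are hypothetical names; notifier_block, NOTIFY_OK
 * and the IDLE_START/IDLE_END events are the real interface used by
 * enter_idle()/__exit_idle() below.
 *
 *	static int my_idle_event(struct notifier_block *nb,
 *				 unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			do_something_cheap();
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_event,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */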

void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine.
 */
static void default_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        /*
         * TS_POLLING-cleared state must be visible before we
         * test NEED_RESCHED:
         */
        smp_mb();
        local_irq_disable();
        if (!need_resched()) {
                ktime_t t0, t1;
                u64 t0n, t1n;

                t0 = ktime_get();
                t0n = ktime_to_ns(t0);
                safe_halt();    /* enables interrupts racelessly */
                local_irq_disable();
                t1 = ktime_get();
                t1n = ktime_to_ns(t1);
                sched_clock_idle_wakeup_event(t1n - t0n);
        }
        local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}
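
/*
 * The ktime bookkeeping above brackets the halted interval: t1n - t0n
 * is the time spent inside safe_halt(), and it is reported to
 * sched_clock_idle_wakeup_event() so the scheduler clock can account
 * for the period the CPU was asleep.
 */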

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();
        cpu_relax();
}

static void do_nothing(void *unused)
{
}

void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                                        !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
                /*
                 * We waited one second; if a CPU still has not entered
                 * the idle loop, it may be because it is already in
                 * idle and not waking up since it has nothing to do.
                 * Give all the remaining CPUs a kick.
                 */
                smp_call_function_mask(map, do_nothing, 0, 0);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
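
/*
 * cpu_idle_wait() is meant to be called after pm_idle is changed: each
 * CPU clears its cpu_idle_state flag at the top of the idle loop, so
 * once the map above drains, every online CPU has gone around the loop
 * again and none can still be inside the old idle handler.
 */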

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        tick_nohz_stop_sched_tick();

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        idle();
                        /*
                         * In many cases the interrupt that ended idle
                         * has already called exit_idle. But some idle
                         * loops can be woken up without interrupt.
                         */
                        __exit_idle();
                }

                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger a need_resched
 * check. We execute MONITOR against need_resched and enter an optimized
 * wait state through MWAIT. Whenever someone changes need_resched, we
 * are woken up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}
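
/*
 * A sketch of the calling convention, not a call that appears in this
 * file: a C-state driver passes the target state hint in eax, and can
 * set bit 0 of ecx so that interrupts break the MWAIT even while they
 * are masked, e.g.
 *
 *	mwait_idle_with_hints(cstate_hint, 1);
 *
 * (cstate_hint is a placeholder name.)
 */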

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
        } else {
                local_irq_enable();
        }
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => all CPUs support mwait.
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk(KERN_INFO "using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

static int __init idle_setup(char *str)
{
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
        else
                return -1;

        boot_option_idle_override = 1;
        return 0;
}
early_param("idle", idle_setup);
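
/*
 * Boot usage, as wired up by early_param() above: "idle=poll" on the
 * kernel command line installs poll_idle as pm_idle, "idle=mwait" sets
 * force_mwait, and any other value makes idle_setup() fail with -1.
 */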

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
                regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
                regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
                regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
                regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
                fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}

void flush_thread(void)
{
        struct task_struct *tsk = current;

        if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
                clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
                if (test_tsk_thread_flag(tsk, TIF_IA32)) {
                        clear_tsk_thread_flag(tsk, TIF_IA32);
                } else {
                        set_tsk_thread_flag(tsk, TIF_IA32);
                        current_thread_info()->status |= TS_COMPAT;
                }
        }
        clear_tsk_thread_flag(tsk, TIF_DEBUG);

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state.
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}
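
/*
 * The descriptor built above spans the whole 32-bit address space:
 * limit = 0xfffff with limit_in_pages = 1 means 2^20 pages of 4KB,
 * i.e. a 4GB segment starting at base_addr.
 */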

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->rsi, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
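
/*
 * Token pasting folds the register number into the field name:
 * loaddebug(next, 7) expands to set_debugreg(next->debugreg7, 7).
 */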

static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /*
                 * A segment register != 0 always requires a reload;
                 * also reload when it has changed. When the previous
                 * process used a 64-bit base, always reload to avoid
                 * an information leak.
                 */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /*
                         * Check if the user used a selector != 0; if
                         * yes, clear the 64-bit base, since an
                         * overloaded base is always mapped to the
                         * null selector.
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when the next process has a 64-bit base, use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
        (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build-time-only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement.
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p, tss);

        /*
         * If the task has used the FPU in the last 5 timeslices, just
         * do a full restore of the math state immediately to avoid the
         * trap; the chances of needing the FPU soon are obviously high
         * now.
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64-bit mode */
        clear_thread_flag(TIF_IA32);

        /*
         * TBD: this overwrites the user's setup. Should have two bits.
         * But 64-bit processes have always behaved this way, so it's
         * not too bad. The main problem is just that 32-bit children
         * are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}
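
/*
 * The walk above relies on the standard frame-pointer layout of an
 * x86-64 call frame: *fp holds the caller's saved frame pointer and
 * *(fp+8) the return address, so it climbs at most 16 frames of the
 * sleeping task's stack until it finds a return address outside the
 * scheduler functions.
 */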

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
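
/*
 * A user-space sketch, not code from this file: there is no libc
 * wrapper, so arch_prctl is usually reached via syscall(2), e.g.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *
 *	unsigned long base;
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *
 * For the GET cases, addr is an output pointer, which is why they end
 * in put_user() above; for the SET cases it is the new segment base.
 */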

/*
 * Capture the user-space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}
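
/*
 * Worked example: the stack top is lowered by a random offset of
 * 0..8191 bytes, then rounded down to a 16-byte boundary (the x86-64
 * ABI stack alignment), giving 512 possible aligned starting points.
 */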

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
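
/*
 * 0x02000000 is 32MB: the heap base is placed randomly within the 32MB
 * window above the unrandomized brk. If randomize_range() returns 0
 * (range too small or allocation failure), the ?: falls back to the
 * original mm->brk.
 */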