/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

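/*
 * Default clone flags for kernel threads: share the kernel's mm
 * (CLONE_VM) and keep the new thread invisible to ptrace
 * (CLONE_UNTRACED).
 */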
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

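/*
 * enter_idle/exit_idle broadcast IDLE_START and IDLE_END on the
 * notifier chain above, so other code can hook each CPU's idle entry
 * and exit.  The per-CPU "isidle" PDA flag keeps the two events
 * paired even when exit is reported more than once.
 */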
void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
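/*
 * Clearing TS_POLLING tells the scheduler that this CPU has to be
 * woken with a reschedule interrupt; while the flag is set, setting
 * need_resched alone is enough.  The barrier orders the flag update
 * against the need_resched test below.
 */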
static void default_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        smp_mb__after_clear_bit();
        local_irq_disable();
        if (!need_resched()) {
                /* Enables interrupts one instruction before HLT.
                   x86 special cases this so there is no race. */
                safe_halt();
        } else
                local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();
        cpu_relax();
}

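/*
 * Wait until every online CPU has left the idle handler it was
 * running; used after pm_idle is changed so that no CPU keeps
 * executing the old routine.
 */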
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                            !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        /*
                         * Idle routines should keep interrupts disabled
                         * from here on, until they go to idle.
                         * Otherwise, idle callbacks can misfire.
                         */
                        local_irq_disable();
                        enter_idle();
                        idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/*
 * This uses the new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate the IPI otherwise needed to trigger a need_resched
 * check.  We execute MONITOR against need_resched and enter an optimized
 * wait state through MWAIT.  Whenever someone changes need_resched, we
 * are woken up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
        } else {
                local_irq_enable();
        }
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip if setup has overridden idle.
                 * If one CPU supports mwait, all CPUs support mwait.
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

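/*
 * Parse the "idle=" boot option.  "idle=poll" installs the polling
 * loop above as pm_idle; any "idle=" setting also flags
 * boot_option_idle_override so other idle-selection code can respect
 * the user's choice.
 */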
static int __init idle_setup(char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
               current->pid, current->comm, print_tainted(),
               init_utsname()->release,
               (int)strcspn(init_utsname()->version, " "),
               init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
               regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}

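/*
 * Called on exec: complete a pending 32/64bit ABI switch and clear
 * the debug registers and TLS slots inherited from the old image.
 */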
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING) {
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
                if (t->flags & _TIF_IA32)
                        current_thread_info()->status |= TS_COMPAT;
        }
        t->flags &= ~_TIF_DEBUG;

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                               dead_task->comm,
                               dead_task->mm->context.ldt,
                               dead_task->mm->context.size);
                        BUG();
                }
        }
}

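/*
 * Install/read a 32bit TLS entry in the task's tls_array.  The
 * descriptor base is split across the base0/base1/base2 fields, so
 * reading it back has to reassemble the three pieces.
 */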
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
               (((u32)desc->base1) << 16) |
               (((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

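/*
 * Set up the child's kernel stack, registers and thread_struct for
 * fork/clone: the child resumes in ret_from_fork with rax == 0, the
 * fork return value seen by the child.
 */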
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                       IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* A segment selector != 0 always requires a reload.
                   Also reload when it has changed, and when the
                   previous process used a 64bit base, to avoid an
                   information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* Check if the user used a selector != 0; if
                         * yes clear the 64bit base, since an
                         * overloaded base is always mapped to the
                         * null selector.
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when next process has a 64bit base use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
                  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If the task has used fpu in the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}

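/*
 * Find out where a sleeping task is blocked, for /proc/<pid>/wchan:
 * walk the saved frame pointers on its kernel stack and return the
 * first return address outside the scheduler.
 */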
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}

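/*
 * Randomize the initial stack pointer by up to 8k (unless address
 * space randomization is disabled) and keep it 16-byte aligned, as
 * the ABI requires.
 */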
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}