blob: 49f7fac6229e5f4e920d58fcdec1fd5e49e8667b [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
Ashok Raj76e4f662005-06-25 14:55:00 -070011 *
12 * CPU hotplug support - ashok.raj@intel.com
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
Ashok Raj76e4f662005-06-25 14:55:00 -070021#include <linux/cpu.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/module.h>
31#include <linux/a.out.h>
32#include <linux/interrupt.h>
33#include <linux/delay.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
Andi Kleen95833c82006-01-11 22:44:36 +010037#include <linux/notifier.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080038#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039
40#include <asm/uaccess.h>
41#include <asm/pgtable.h>
42#include <asm/system.h>
43#include <asm/io.h>
44#include <asm/processor.h>
45#include <asm/i387.h>
46#include <asm/mmu_context.h>
47#include <asm/pda.h>
48#include <asm/prctl.h>
49#include <asm/kdebug.h>
50#include <asm/desc.h>
51#include <asm/proto.h>
52#include <asm/ia32.h>
Andi Kleen95833c82006-01-11 22:44:36 +010053#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
55asmlinkage extern void ret_from_fork(void);
56
57unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
58
Linus Torvalds1da177e2005-04-16 15:20:36 -070059unsigned long boot_option_idle_override = 0;
60EXPORT_SYMBOL(boot_option_idle_override);
61
62/*
63 * Powermanagement idle function, if any..
64 */
65void (*pm_idle)(void);
Andi Kleen2ee60e172006-06-26 13:59:44 +020066EXPORT_SYMBOL(pm_idle);
Linus Torvalds1da177e2005-04-16 15:20:36 -070067static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
68
Alan Sterne041c682006-03-27 01:16:30 -080069static ATOMIC_NOTIFIER_HEAD(idle_notifier);
Andi Kleen95833c82006-01-11 22:44:36 +010070
71void idle_notifier_register(struct notifier_block *n)
72{
Alan Sterne041c682006-03-27 01:16:30 -080073 atomic_notifier_chain_register(&idle_notifier, n);
Andi Kleen95833c82006-01-11 22:44:36 +010074}
75EXPORT_SYMBOL_GPL(idle_notifier_register);
76
/* Remove a previously registered idle notifier.
 * NOTE(review): exported non-GPL while the register side is GPL-only —
 * looks inconsistent; confirm intent before changing. */
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
82
/* Mark this CPU idle in its PDA, then tell IDLE_START listeners. */
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
88
/*
 * End an idle period.  The isidle flag makes this idempotent: both the
 * interrupt path (exit_idle) and the idle loop itself call it, and only
 * the first caller fires the IDLE_END notification.
 */
static void __exit_idle(void)
{
	if (read_pda(isidle) == 0)
		return;
	write_pda(isidle, 0);
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
96
97/* Called from interrupts to signify idle end */
98void exit_idle(void)
99{
Andi Kleena15da492006-09-26 10:52:40 +0200100 /* idle loop has pid 0 */
101 if (current->pid)
Andi Kleen95833c82006-01-11 22:44:36 +0100102 return;
103 __exit_idle();
104}
105
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	local_irq_enable();

	/*
	 * Stop advertising polling: once TS_POLLING is clear, wakers
	 * must use an IPI to get us out of halt.  The barrier orders
	 * the flag clear against the need_resched() re-check below.
	 */
	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched()) {
		local_irq_disable();
		/* Re-check with irqs off so a wakeup can't race the halt. */
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}
125
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	/*
	 * Busy-wait on TIF_NEED_RESCHED in this thread's flag word,
	 * with rep;nop between tests to ease pressure on the core.
	 */
	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}
144
/*
 * Wait until every online CPU has gone around its idle loop at least
 * once.  Each CPU's cpu_idle_state flag is set here and cleared by
 * cpu_idle(); we poll once per second until all flags are seen cleared.
 */
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	/* Pin ourselves to this CPU for the duration of the wait. */
	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	/* Our own idle loop isn't running; don't wait on ourselves. */
	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		/* Drop CPUs that went offline while we slept. */
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
173
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();	/* write back and invalidate caches before dying */
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/* Halt forever with interrupts off; this never returns. */
	local_irq_disable();
	while (1)
		halt();
}
#else
/* Without hotplug support an offlined CPU should never get here. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
197
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			/* Ack cpu_idle_wait(): show we went round the loop. */
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			/* Order the ack before (re)reading pm_idle. */
			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
234
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		/* Arm the monitor on our flag word, then re-check
		 * need_resched before actually going to sleep. */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}
254
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	local_irq_enable();
	for (;;) {
		if (need_resched())
			break;
		mwait_idle_with_hints(0, 0);
	}
}
262
Ashok Raje6982c62005-06-25 14:54:58 -0700263void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264{
265 static int printed;
266 if (cpu_has(c, X86_FEATURE_MWAIT)) {
267 /*
268 * Skip, if setup has overridden idle.
269 * One CPU supports mwait => All CPUs supports mwait
270 */
271 if (!pm_idle) {
272 if (!printed) {
273 printk("using mwait in idle threads.\n");
274 printed = 1;
275 }
276 pm_idle = mwait_idle;
277 }
278 }
279}
280
281static int __init idle_setup (char *str)
282{
283 if (!strncmp(str, "poll", 4)) {
284 printk("using polling idle threads.\n");
285 pm_idle = poll_idle;
286 }
287
288 boot_option_idle_override = 1;
289 return 1;
290}
291
292__setup("idle=", idle_setup);
293
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Read the live segment selectors straight from the CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* fs/gs base addresses live in MSRs, not in the selectors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
343
/* Dump registers plus a trace of the stack that sits just above them. */
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}
350
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		/* get_cpu() pins us to this CPU until put_cpu() below. */
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
373
/* Reset per-thread state on exec: ABI mode, debug registers, TLS, FPU. */
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	/* Complete a pending 32/64-bit ABI switch requested at exec. */
	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}
	t->flags &= ~_TIF_DEBUG;

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
399
400void release_thread(struct task_struct *dead_task)
401{
402 if (dead_task->mm) {
403 if (dead_task->mm->context.size) {
404 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
405 dead_task->comm,
406 dead_task->mm->context.ldt,
407 dead_task->mm->context.size);
408 BUG();
409 }
410 }
411}
412
/*
 * Install a 32-bit TLS descriptor (4GB limit, page granular, usable)
 * with the given base address into GDT TLS slot 'tls' of task 't'.
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
427
/* Reassemble the 32-bit base address from a task's TLS descriptor. */
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}
436
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Push any lazy FPU state back into the task struct first. */
	unlazy_fpu(tsk);
}
445
/*
 * Set up the kernel stack, registers and per-thread state of a new
 * task.  rsp is the new user stack pointer; ~0UL marks a kernel thread.
 * Returns 0 on success or -ENOMEM if the I/O bitmap copy fails.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* The child's pt_regs sit at the very top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;	/* the child returns 0 from the syscall */
	childregs->rsp = rsp;
	if (rsp == ~0UL)	/* kernel thread: run on its own stack */
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	/* Snapshot the parent's live segment selectors for the child. */
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	/* Duplicate the parent's I/O permission bitmap, if it has one. */
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure release the bitmap copy allocated above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
509
510/*
511 * This special macro can be used to load a debugging register
512 */
Jan Beulich2b514e72006-03-25 16:29:22 +0100513#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700514
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200515static inline void __switch_to_xtra(struct task_struct *prev_p,
516 struct task_struct *next_p,
517 struct tss_struct *tss)
518{
519 struct thread_struct *prev, *next;
520
521 prev = &prev_p->thread,
522 next = &next_p->thread;
523
524 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
525 loaddebug(next, 0);
526 loaddebug(next, 1);
527 loaddebug(next, 2);
528 loaddebug(next, 3);
529 /* no 4 and 5 */
530 loaddebug(next, 6);
531 loaddebug(next, 7);
532 }
533
534 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
535 /*
536 * Copy the relevant range of the IO bitmap.
537 * Normally this is 128 bytes or less:
538 */
539 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
540 max(prev->io_bitmap_max, next->io_bitmap_max));
541 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
542 /*
543 * Clear any possible leftover bits:
544 */
545 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
546 }
547}
548
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 *
 * NOTE: statement order below is load-bearing (segment saves before
 * unlazy_fpu, PDA updates after); do not reorder casually.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}
662
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		/* Successful exec: clear PT_DTRACE under the task lock. */
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
686
/* Switch the current task to the native 64-bit personality. */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
700
/* Plain fork(): SIGCHLD on exit, child starts at the parent's rsp. */
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
705
Andi Kleena88cde12005-11-05 17:25:54 +0100706asmlinkage long
707sys_clone(unsigned long clone_flags, unsigned long newsp,
708 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700709{
710 if (!newsp)
711 newsp = regs->rsp;
712 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
713}
714
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	/* CLONE_VFORK|CLONE_VM: share the mm and block until child execs/exits. */
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
730
731unsigned long get_wchan(struct task_struct *p)
732{
733 unsigned long stack;
734 u64 fp,rip;
735 int count = 0;
736
737 if (!p || p == current || p->state==TASK_RUNNING)
738 return 0;
Al Viro57eafdc2006-01-12 01:05:39 -0800739 stack = (unsigned long)task_stack_page(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
741 return 0;
742 fp = *(u64 *)(p->thread.rsp);
743 do {
Andi Kleena88cde12005-11-05 17:25:54 +0100744 if (fp < (unsigned long)stack ||
745 fp > (unsigned long)stack+THREAD_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746 return 0;
747 rip = *(u64 *)(fp+8);
748 if (!in_sched_functions(rip))
749 return rip;
750 fp = *(u64 *)fp;
751 } while (count++ < 16);
752 return 0;
753}
754
/*
 * Implement arch_prctl(): get/set the fs and gs base addresses of a
 * task.  Small (<4GB) bases are installed as GDT TLS descriptors
 * because selector reloads are cheaper than MSR writes on context
 * switch; large bases go through the FS/GS base MSRs.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;	/* apply to the live CPU state too? */
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* Only trust the MSR when no selector is loaded. */
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
850
/* arch_prctl() syscall entry: operate on the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
855
856/*
857 * Capture the user space registers if the task is not running (in user space)
858 */
859int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
860{
861 struct pt_regs *pp, ptregs;
862
Al Virobb049232006-01-12 01:05:38 -0800863 pp = task_pt_regs(tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700864
865 ptregs = *pp;
866 ptregs.cs &= 0xffff;
867 ptregs.ss &= 0xffff;
868
869 elf_core_copy_regs(regs, &ptregs);
870
871 return 1;
872}
873
874unsigned long arch_align_stack(unsigned long sp)
875{
Andi Kleenc16b63e2006-09-26 10:52:28 +0200876 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700877 sp -= get_random_int() % 8192;
878 return sp & ~0xf;
879}