blob: f6226055d53de3b84e2812785bc699ae17394beb [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
Ashok Raj76e4f662005-06-25 14:55:00 -070011 *
12 * CPU hotplug support - ashok.raj@intel.com
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
Ashok Raj76e4f662005-06-25 14:55:00 -070021#include <linux/cpu.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/module.h>
31#include <linux/a.out.h>
32#include <linux/interrupt.h>
33#include <linux/delay.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
Andi Kleen95833c82006-01-11 22:44:36 +010037#include <linux/notifier.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080038#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039
40#include <asm/uaccess.h>
41#include <asm/pgtable.h>
42#include <asm/system.h>
43#include <asm/io.h>
44#include <asm/processor.h>
45#include <asm/i387.h>
46#include <asm/mmu_context.h>
47#include <asm/pda.h>
48#include <asm/prctl.h>
49#include <asm/kdebug.h>
50#include <asm/desc.h>
51#include <asm/proto.h>
52#include <asm/ia32.h>
Andi Kleen95833c82006-01-11 22:44:36 +010053#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
/* Entry point (in entry.S) that a freshly forked task returns through. */
asmlinkage extern void ret_from_fork(void);

/* Default clone flags applied to kernel threads. */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* Set to 1 by idle_setup() when "idle=" is given on the command line,
   so CPU bringup won't override the user's choice of idle routine. */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);

/* Per-CPU handshake flag used by cpu_idle_wait(): set to 1 for every CPU,
   cleared by that CPU when it next passes through cpu_idle(). */
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

/* Notifier chain fired on idle entry (IDLE_START) and exit (IDLE_END). */
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
Andi Kleen95833c82006-01-11 22:44:36 +010070
/* Add @n to the idle notifier chain (notified on idle entry/exit). */
void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

/* Remove @n from the idle notifier chain. */
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
82
/* Mark this CPU idle in its PDA and notify IDLE_START listeners. */
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
88
/*
 * Clear the per-CPU "isidle" flag and fire IDLE_END — but only once per
 * idle period: the atomic test-and-clear means that when both an interrupt
 * and the idle loop call this, only the first delivers the notification.
 */
static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
95
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 — an interrupt that lands on any other task
	   is not ending an idle period, so do nothing. */
	if (current->pid)
		return;
	__exit_idle();
}
104
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	local_irq_enable();

	/* Stop advertising polling: once TS_POLLING is clear, waking this
	   CPU requires an IPI.  The barrier orders the clear against the
	   need_resched() test below. */
	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched()) {
		/* Re-check with interrupts off so a wakeup cannot slip in
		   between the test and the halt; safe_halt() re-enables
		   interrupts together with the hlt. */
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}
124
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	/* Busy-wait on TIF_NEED_RESCHED; "rep; nop" is the pause hint,
	   keeping the spin friendly to the sibling hyperthread. */
	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}
143
/*
 * Wait until every online CPU has gone around its idle loop at least once.
 * Used after changing pm_idle so no CPU can still be executing inside the
 * old idle routine when we return.
 */
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	/* Pin ourselves to this CPU for the duration of the wait. */
	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	/* Arm the handshake flag on every online CPU. */
	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	/* This CPU is running here, so it's trivially not in the old idle. */
	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			/* cpu_idle() clears the flag when the CPU cycles
			   through its idle loop. */
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		/* Drop CPUs that went offline while we were waiting. */
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
172
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	/* Write back and invalidate caches before this CPU stops. */
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/* Halt forever with interrupts off; never returns. */
	local_irq_disable();
	while (1)
		halt();
}
#else
/* Without CPU hotplug an offlined CPU should never reach the idle loop. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
196
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			/* Handshake with cpu_idle_wait(): signal that this
			   CPU has cycled through the idle loop. */
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			/* Order the store above before re-reading pm_idle,
			   so a concurrent idle-routine change is seen. */
			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
233
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		/* Arm the monitor on the flags word that holds
		   TIF_NEED_RESCHED; a write to it wakes the mwait below. */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		/* Re-check after arming to close the monitor/test race. */
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}
253
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	local_irq_enable();
	while (!need_resched())
		mwait_idle_with_hints(0,0);
}
261
Ashok Raje6982c62005-06-25 14:54:58 -0700262void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263{
264 static int printed;
265 if (cpu_has(c, X86_FEATURE_MWAIT)) {
266 /*
267 * Skip, if setup has overridden idle.
268 * One CPU supports mwait => All CPUs supports mwait
269 */
270 if (!pm_idle) {
271 if (!printed) {
272 printk("using mwait in idle threads.\n");
273 printed = 1;
274 }
275 pm_idle = mwait_idle;
276 }
277 }
278}
279
280static int __init idle_setup (char *str)
281{
282 if (!strncmp(str, "poll", 4)) {
283 printk("using polling idle threads.\n");
284 pm_idle = poll_idle;
285 }
286
287 boot_option_idle_override = 1;
288 return 1;
289}
290
291__setup("idle=", idle_setup);
292
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Segment selectors are not in pt_regs; read them live. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* The 64 bit fs/gs bases live in MSRs, not in the descriptors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
342
/* Dump full register state plus a kernel stack trace for @regs. */
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	/* Stack to walk begins just above the saved registers. */
	show_trace(NULL, regs, (void *)(regs + 1));
}
349
/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		/* get_cpu() pins us so init_tss stays this CPU's TSS
		   until the matching put_cpu() below. */
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}
372
/*
 * Reset per-thread state on exec: complete any pending 32/64 bit ABI
 * switch, then clear debug registers, TLS entries and FPU state.
 */
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	/* An exec of a different-ABI binary was requested; the XOR flips
	   TIF_IA32 while consuming _TIF_ABI_PENDING. */
	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}
	t->flags &= ~_TIF_DEBUG;

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
398
/*
 * Last-reference sanity check for a dead task: its LDT should already
 * have been torn down by now, otherwise something leaked.
 */
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}
411
/* Install a flat 32 bit segment (4GB limit, page granular) with base @addr
   into TLS slot @tls of task @t. */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	/* Encode the user_desc into the two descriptor words. */
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
426
427static inline u32 read_32bit_tls(struct task_struct *t, int tls)
428{
429 struct desc_struct *desc = (void *)t->thread.tls_array;
430 desc += tls;
431 return desc->base0 |
432 (((u32)desc->base1) << 16) |
433 (((u32)desc->base2) << 24);
434}
435
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush live FPU registers into the task struct so the child
	   copies a consistent image in copy_thread(). */
	unlazy_fpu(tsk);
}
444
/*
 * Set up the kernel stack, saved registers and thread state of new task
 * @p so it starts life in ret_from_fork with a copy of @regs (rax forced
 * to 0: the child's return value from fork).  Returns 0 or -ENOMEM.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* The child's pt_regs sit at the very top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	/* rsp == ~0UL marks a kernel thread: run on the new kernel stack. */
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	/* Snapshot the parent's live segment selectors for the child. */
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	/* Give the child its own copy of the I/O permission bitmap. */
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			/* 64 bit convention: TLS pointer arrives in r8. */
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure, release the bitmap allocated above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
508
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

/*
 * Slow path of the context switch: reload hardware debug registers
 * and/or the TSS I/O permission bitmap when either task needs it.
 */
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	/* note: comma operator — equivalent to two assignments */
	prev = &prev_p->thread,
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
547
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		/* same logic as FS above, using the kernel GS base MSR */
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}
661
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	/* Copy the pathname in from user space. */
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		/* A successful exec clears the single-step ptrace flag. */
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
685
/* Put the current task into native 64 bit mode (called on 64 bit exec). */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
699
/* fork(2): duplicate the current task; child signals SIGCHLD on exit. */
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
704
/* clone(2): a newsp of 0 means "keep using the caller's stack pointer". */
asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
713
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
729
/*
 * Find the kernel text address a sleeping task is blocked at by walking
 * its saved frame-pointer chain until we leave the scheduler functions.
 * Returns 0 for current/running/invalid tasks or after 16 frames.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp,rip;
	int count = 0;

	if (!p || p == current || p->state==TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	/* Saved rsp must lie within the task's own kernel stack. */
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		/* Stop if the frame pointer leaves the stack bounds. */
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		/* The return address sits just above the saved frame pointer. */
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
753
/*
 * Implement arch_prctl(): get/set the 64 bit FS/GS segment bases of @task.
 * Bases below 4GB go through a GDT TLS slot (faster to context-switch);
 * larger bases are written to the MSRs directly.
 * Returns 0, -EPERM for out-of-range bases, -EINVAL for unknown codes,
 * or the put_user/checking_wrmsrl result.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	/* Only when operating on ourselves may we touch live registers. */
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		/* @addr is the user buffer to store the base into. */
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* A non-zero live selector means the MSR holds the
			   current base; otherwise fall back to the saved one. */
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
849
/* arch_prctl(2) entry point: operate on the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
854
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	/* Work on a copy so the saved registers are not modified. */
	ptregs = *pp;
	/* Only the low 16 bits of cs/ss are meaningful selectors. */
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}
872
/*
 * Randomize the stack top by up to 8k (unless the task disabled VA
 * randomization), then align down to 16 bytes.
 */
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}