/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *      Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *      Andi Kleen.
 *
 *      CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power-management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

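/*
 * Usage sketch (illustrative only; the names below are hypothetical,
 * not from this file): a subsystem that wants idle entry/exit events
 * supplies a notifier_block and registers it:
 *
 *      static int my_idle_cb(struct notifier_block *nb,
 *                            unsigned long action, void *unused)
 *      {
 *              if (action == IDLE_START)
 *                      ;       // CPU is about to idle
 *              else if (action == IDLE_END)
 *                      ;       // CPU has left idle
 *              return NOTIFY_OK;
 *      }
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_cb,
 *      };
 *      idle_notifier_register(&my_idle_nb);
 */
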
void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
        local_irq_enable();

        current_thread_info()->status &= ~TS_POLLING;
        smp_mb__after_clear_bit();
        while (!need_resched()) {
                local_irq_disable();
                if (!need_resched())
                        safe_halt();
                else
                        local_irq_enable();
        }
        current_thread_info()->status |= TS_POLLING;
}

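/*
 * Note on default_idle() above: interrupts are disabled before the
 * final need_resched() re-check, and safe_halt() is "sti; hlt".  The
 * one-instruction interrupt shadow after sti means a wakeup interrupt
 * cannot fire between the check and the hlt, so a reschedule request
 * is never missed while going to sleep.
 */
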
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        local_irq_enable();

        asm volatile(
                "2:"
                "testl %0,%1;"
                "rep; nop;"
                "je 2b;"
                : :
                "i" (_TIF_NEED_RESCHED),
                "m" (current_thread_info()->flags));
}

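/*
 * The inline asm in poll_idle() above is roughly equivalent to this C
 * sketch (cpu_relax() expands to the same "rep; nop"/PAUSE hint):
 *
 *      while (!(current_thread_info()->flags & _TIF_NEED_RESCHED))
 *              cpu_relax();
 */
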
void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                                        !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

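/*
 * cpu_idle_wait() above is a handshake with every online CPU's idle
 * loop: it marks each CPU's cpu_idle_state, then sleeps until each
 * idle loop has observed and cleared its flag (see cpu_idle() below).
 * Callers use it after changing pm_idle to guarantee that no CPU is
 * still executing the old idle handler.
 */
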
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

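/*
 * In play_dead() above, wbinvd() writes back and invalidates this
 * CPU's caches so no dirty lines survive it going offline; only after
 * the mb() is CPU_DEAD published for the CPU performing the hot-unplug
 * to observe, and the dead CPU then parks in a halt loop with
 * interrupts off.
 */
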
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        enter_idle();
                        idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

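/*
 * The rmb() in cpu_idle() above pairs with the wmb() in
 * cpu_idle_wait(): the cpu_idle_state acknowledgement is ordered
 * before the pm_idle load, so a CPU cannot ack the handshake and then
 * still call a stale idle handler.
 */
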
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}

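/*
 * In mwait_idle_with_hints() above, MONITOR arms the hardware monitor
 * on the cache line holding this thread's flags word; MWAIT then
 * stalls until that line is written (e.g. TIF_NEED_RESCHED being set)
 * or an interrupt arrives.  The second need_resched() test closes the
 * window between arming the monitor and executing MWAIT.
 */
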
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        local_irq_enable();
        while (!need_resched())
                mwait_idle_with_hints(0, 0);
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip if setup has overridden idle.
                 * If one CPU supports mwait, all CPUs support mwait.
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

static int __init idle_setup(char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);

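/*
 * The idle= option is parsed from the kernel command line; booting
 * with "idle=poll" selects poll_idle.  Any idle= option also sets
 * boot_option_idle_override so later setup code won't replace the
 * user's choice.
 */
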
/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}

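/*
 * Note the memset to 0xff in exit_thread() above: in the TSS I/O
 * bitmap a set bit *denies* port access, so filling the previously
 * valid range (io_bitmap_max bytes) with ones revokes every port the
 * exiting thread may have been granted.
 */
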
void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING) {
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
                if (t->flags & _TIF_IA32)
                        current_thread_info()->status |= TS_COMPAT;
        }
        t->flags &= ~_TIF_DEBUG;

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                        dead_task->comm,
                                        dead_task->mm->context.ldt,
                                        dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}

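/*
 * read_32bit_tls() above reassembles the base address that an x86
 * segment descriptor scatters across three fields: base0 holds bits
 * 15:0, base1 bits 23:16 and base2 bits 31:24.
 */
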
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

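/*
 * On the CLONE_SETTLS path in copy_thread() above: for a native
 * 64-bit clone() the tls argument arrives in %r8 (the fifth syscall
 * argument register), hence ARCH_SET_FS is applied to childregs->r8;
 * 32-bit callers are handled by ia32_child_tls() instead.
 */
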
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                        max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);

        /* we're going to use this soon, after a few expensive things */
        if (next_p->fpu_counter > 5)
                prefetch(&next->i387.fxsave);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        tss->rsp0 = next->rsp0;

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but I guess that is ok.
         */
        asm volatile("mov %%es,%0" : "=m" (prev->es));
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        load_TLS(next, cpu);

        /*
         * Switch FS and GS.
         */
        {
                unsigned fsindex;
                asm volatile("movl %%fs,%0" : "=r" (fsindex));
                /* A segment register != 0 always requires a reload.
                   Also reload when it has changed.
                   When the previous process used a 64-bit base, always
                   reload to avoid an information leak. */
                if (unlikely(fsindex | next->fsindex | prev->fs)) {
                        loadsegment(fs, next->fsindex);
                        /* Check if the user used a selector != 0; if
                         * yes clear the 64-bit base, since an
                         * overloaded base is always mapped to the
                         * NULL selector.
                         */
                        if (fsindex)
                                prev->fs = 0;
                }
                /* when the next process has a 64-bit base, use it */
                if (next->fs)
                        wrmsrl(MSR_FS_BASE, next->fs);
                prev->fsindex = fsindex;
        }
        {
                unsigned gsindex;
                asm volatile("movl %%gs,%0" : "=r" (gsindex));
                if (unlikely(gsindex | next->gsindex | prev->gs)) {
                        load_gs_index(next->gsindex);
                        if (gsindex)
                                prev->gs = 0;
                }
                if (next->gs)
                        wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
                prev->gsindex = gsindex;
        }

        /* Must be after DS reload */
        unlazy_fpu(prev_p);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->userrsp = read_pda(oldrsp);
        write_pda(oldrsp, next->userrsp);
        write_pda(pcurrent, next_p);

        write_pda(kernelstack,
                (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
        write_pda(stack_canary, next_p->stack_canary);
        /*
         * Build time only check to make sure the stack_canary is at
         * offset 40 in the pda; this is a gcc ABI requirement
         */
        BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
            || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If the task has used the FPU in the last 5 timeslices, just
         * do a full restore of the math state immediately to avoid
         * the trap; the chances of needing the FPU soon are obviously
         * high now.
         */
        if (next_p->fpu_counter > 5)
                math_state_restore();
        return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64-bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: this overwrites the user's setup. We should really
           have two bits. But 64-bit processes have always behaved
           this way, so it's not too bad. The main problem is just
           that 32-bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack+THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp+8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

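/*
 * get_wchan() above walks the saved frame-pointer chain: within each
 * frame, *fp is the caller's saved rbp and *(fp + 8) is the return
 * address.  The walk gives up after 16 frames and returns the first
 * return address outside the scheduler (per in_sched_functions()),
 * i.e. the "wait channel" the task is blocked in.
 */
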
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

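/*
 * Userspace reaches do_arch_prctl() through the arch_prctl(2)
 * syscall.  An illustrative (not authoritative) call sequence:
 *
 *      #include <asm/prctl.h>
 *      unsigned long base;
 *      syscall(SYS_arch_prctl, ARCH_SET_GS, some_base);
 *      syscall(SYS_arch_prctl, ARCH_GET_GS, (unsigned long)&base);
 *
 * Bases that fit in 32 bits go through the GDT (a cheap segment
 * reload on context switch); larger bases require the FS/GS base
 * MSRs.
 */
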
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}

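/*
 * Randomize the initial user stack pointer downwards by up to 8 KiB
 * (unless randomization is disabled for this task or globally), then
 * round down to a 16-byte boundary as the x86-64 ABI requires.
 */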
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}