/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
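
/*
 * Illustrative sketch only (not part of this file): a subsystem that
 * wants callbacks around idle transitions could hook the chain like
 * this; my_idle_notify and my_idle_nb are hypothetical names:
 *
 *      static int my_idle_notify(struct notifier_block *nb,
 *                                unsigned long action, void *unused)
 *      {
 *              return NOTIFY_OK;       // act on IDLE_START / IDLE_END
 *      }
 *
 *      static struct notifier_block my_idle_nb = {
 *              .notifier_call = my_idle_notify,
 *      };
 *
 *      idle_notifier_register(&my_idle_nb);
 */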

void enter_idle(void)
{
        write_pda(isidle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (test_and_clear_bit_pda(0, isidle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
        current_thread_info()->status &= ~TS_POLLING;
        smp_mb__after_clear_bit();
        local_irq_disable();
        if (!need_resched()) {
                /* Enables interrupts one instruction before HLT.
                   x86 special cases this so there is no race. */
                safe_halt();
        } else
                local_irq_enable();
        current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
        cpu_relax();
}
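
/*
 * Note: poll_idle() can be a bare cpu_relax() because cpu_idle() keeps
 * TS_POLLING set around the idle call, so the scheduler knows this CPU
 * is watching need_resched and can skip the resched IPI; the
 * while (!need_resched()) loop in cpu_idle() does the actual polling.
 */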

void cpu_idle_wait(void)
{
        unsigned int cpu, this_cpu = get_cpu();
        cpumask_t map, tmp = current->cpus_allowed;

        set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
        put_cpu();

        cpus_clear(map);
        for_each_online_cpu(cpu) {
                per_cpu(cpu_idle_state, cpu) = 1;
                cpu_set(cpu, map);
        }

        __get_cpu_var(cpu_idle_state) = 0;

        wmb();
        do {
                ssleep(1);
                for_each_online_cpu(cpu) {
                        if (cpu_isset(cpu, map) &&
                                        !per_cpu(cpu_idle_state, cpu))
                                cpu_clear(cpu, map);
                }
                cpus_and(map, map, cpu_online_map);
        } while (!cpus_empty(map));

        set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
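
/*
 * cpu_idle_wait() supports changing pm_idle at runtime: it blocks until
 * every online CPU has made a fresh pass through the idle loop (each
 * pass re-reads pm_idle after the rmb() in cpu_idle() below), so the
 * caller knows no CPU is still entering a stale idle handler.
 */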

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
        idle_task_exit();
        wbinvd();
        mb();
        /* Ack it */
        __get_cpu_var(cpu_state) = CPU_DEAD;

        local_irq_disable();
        while (1)
                halt();
}
#else
static inline void play_dead(void)
{
        BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
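
/*
 * With CPU hotplug, play_dead() parks an offlined CPU: it flushes the
 * caches with wbinvd, publishes CPU_DEAD so the CPU tearing us down can
 * proceed, and then halts forever with interrupts off. Without hotplug
 * support it should never be reached, hence the BUG().
 */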

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
        current_thread_info()->status |= TS_POLLING;
        /* endless idle loop with no priority at all */
        while (1) {
                while (!need_resched()) {
                        void (*idle)(void);

                        if (__get_cpu_var(cpu_idle_state))
                                __get_cpu_var(cpu_idle_state) = 0;

                        rmb();
                        idle = pm_idle;
                        if (!idle)
                                idle = default_idle;
                        if (cpu_is_offline(smp_processor_id()))
                                play_dead();
                        enter_idle();
                        idle();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
                        __exit_idle();
                }

                preempt_enable_no_resched();
                schedule();
                preempt_disable();
        }
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(eax, ecx);
        }
}
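
/*
 * The hint encoding follows the MWAIT instruction itself rather than
 * anything defined in this file (assumption, per the instruction set
 * reference): eax carries the target C-state/sub-state hint and ecx
 * extension flags, e.g. ecx bit 0 requests that masked interrupts
 * still act as break events. mwait_idle() below passes (0, 0) for
 * plain C1.
 */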

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
        local_irq_enable();
        mwait_idle_with_hints(0, 0);
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
        static int printed;
        if (cpu_has(c, X86_FEATURE_MWAIT)) {
                /*
                 * Skip, if setup has overridden idle.
                 * One CPU supports mwait => all CPUs support mwait
                 */
                if (!pm_idle) {
                        if (!printed) {
                                printk("using mwait in idle threads.\n");
                                printed = 1;
                        }
                        pm_idle = mwait_idle;
                }
        }
}

static int __init idle_setup(char *str)
{
        if (!strncmp(str, "poll", 4)) {
                printk("using polling idle threads.\n");
                pm_idle = poll_idle;
        }

        boot_option_idle_override = 1;
        return 1;
}

__setup("idle=", idle_setup);
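
/*
 * Example: booting with "idle=poll" makes idle_setup() install
 * poll_idle as pm_idle; select_idle_routine() then leaves it in place
 * (pm_idle is non-NULL), and boot_option_idle_override records the
 * override for code outside this file.
 */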

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        printk("\n");
        print_modules();
        printk("Pid: %d, comm: %.20s %s %s %.*s\n",
                current->pid, current->comm, print_tainted(),
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
        printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
        printk_address(regs->rip);
        printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
                regs->eflags);
        printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
                regs->rax, regs->rbx, regs->rcx);
        printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
                regs->rdx, regs->rsi, regs->rdi);
        printk("RBP: %016lx R08: %016lx R09: %016lx\n",
                regs->rbp, regs->r8, regs->r9);
        printk("R10: %016lx R11: %016lx R12: %016lx\n",
                regs->r10, regs->r11, regs->r12);
        printk("R13: %016lx R14: %016lx R15: %016lx\n",
                regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        asm("movq %%cr0, %0": "=r" (cr0));
        asm("movq %%cr2, %0": "=r" (cr2));
        asm("movq %%cr3, %0": "=r" (cr3));
        asm("movq %%cr4, %0": "=r" (cr4));

        printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
                fs, fsindex, gs, gsindex, shadowgs);
        printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
        printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
        printk("CPU %d:", smp_processor_id());
        __show_regs(regs);
        show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
        struct task_struct *me = current;
        struct thread_struct *t = &me->thread;

        if (me->thread.io_bitmap_ptr) {
                struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

                kfree(t->io_bitmap_ptr);
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
        }
}

void flush_thread(void)
{
        struct task_struct *tsk = current;
        struct thread_info *t = current_thread_info();

        if (t->flags & _TIF_ABI_PENDING) {
                t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
                if (t->flags & _TIF_IA32)
                        current_thread_info()->status |= TS_COMPAT;
        }
        t->flags &= ~_TIF_DEBUG;

        tsk->thread.debugreg0 = 0;
        tsk->thread.debugreg1 = 0;
        tsk->thread.debugreg2 = 0;
        tsk->thread.debugreg3 = 0;
        tsk->thread.debugreg6 = 0;
        tsk->thread.debugreg7 = 0;
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
         */
        clear_fpu(tsk);
        clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct n_desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        desc->a = LDT_entry_a(&ud);
        desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        struct desc_struct *desc = (void *)t->thread.tls_array;
        desc += tls;
        return desc->base0 |
                (((u32)desc->base1) << 16) |
                (((u32)desc->base2) << 24);
}
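
/*
 * These two helpers back the small-base (<= 0xffffffff) cases of
 * do_arch_prctl() below: 32-bit FS/GS bases are kept as GDT TLS
 * descriptors, which are cheaper to switch than the MSR bases;
 * read_32bit_tls() reassembles the base address from the descriptor's
 * base0/base1/base2 fields.
 */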

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
        unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        childregs = ((struct pt_regs *)
                        (THREAD_SIZE + task_stack_page(p))) - 1;
        *childregs = *regs;

        childregs->rax = 0;
        childregs->rsp = rsp;
        if (rsp == ~0UL)
                childregs->rsp = (unsigned long)childregs;

        p->thread.rsp = (unsigned long) childregs;
        p->thread.rsp0 = (unsigned long) (childregs+1);
        p->thread.userrsp = me->thread.userrsp;

        set_tsk_thread_flag(p, TIF_FORK);

        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;

        asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
        asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
        asm("mov %%es,%0" : "=m" (p->thread.es));
        asm("mov %%ds,%0" : "=m" (p->thread.ds));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
                                IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = ia32_child_tls(p, childregs);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
        return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
                                    struct task_struct *next_p,
                                    struct tss_struct *tss)
{
        struct thread_struct *prev, *next;

        prev = &prev_p->thread;
        next = &next_p->thread;

        if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
                loaddebug(next, 0);
                loaddebug(next, 1);
                loaddebug(next, 2);
                loaddebug(next, 3);
                /* no 4 and 5 */
                loaddebug(next, 6);
                loaddebug(next, 7);
        }

        if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                        max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}
537
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538/*
539 * switch_to(x,y) should switch tasks from x to y.
540 *
541 * This could still be optimized:
542 * - fold all the options into a flag word and test it with a single test.
543 * - could test fs/gs bitsliced
Andi Kleen099f3182006-02-03 21:51:38 +0100544 *
545 * Kprobes not supported here. Set the probe on schedule instead.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700546 */
Andi Kleen099f3182006-02-03 21:51:38 +0100547__kprobes struct task_struct *
Andi Kleena88cde12005-11-05 17:25:54 +0100548__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549{
550 struct thread_struct *prev = &prev_p->thread,
551 *next = &next_p->thread;
552 int cpu = smp_processor_id();
553 struct tss_struct *tss = &per_cpu(init_tss, cpu);
554
Arjan van de Vene07e23e2006-09-26 10:52:36 +0200555 /* we're going to use this soon, after a few expensive things */
556 if (next_p->fpu_counter>5)
557 prefetch(&next->i387.fxsave);
558
Linus Torvalds1da177e2005-04-16 15:20:36 -0700559 /*
560 * Reload esp0, LDT and the page table pointer:
561 */
562 tss->rsp0 = next->rsp0;
563
564 /*
565 * Switch DS and ES.
566 * This won't pick up thread selector changes, but I guess that is ok.
567 */
H. J. Lufd51f662005-05-01 08:58:48 -0700568 asm volatile("mov %%es,%0" : "=m" (prev->es));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700569 if (unlikely(next->es | prev->es))
570 loadsegment(es, next->es);
571
H. J. Lufd51f662005-05-01 08:58:48 -0700572 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700573 if (unlikely(next->ds | prev->ds))
574 loadsegment(ds, next->ds);
575
576 load_TLS(next, cpu);
577
578 /*
579 * Switch FS and GS.
580 */
581 {
582 unsigned fsindex;
583 asm volatile("movl %%fs,%0" : "=r" (fsindex));
584 /* segment register != 0 always requires a reload.
585 also reload when it has changed.
586 when prev process used 64bit base always reload
587 to avoid an information leak. */
588 if (unlikely(fsindex | next->fsindex | prev->fs)) {
589 loadsegment(fs, next->fsindex);
590 /* check if the user used a selector != 0
591 * if yes clear 64bit base, since overloaded base
592 * is always mapped to the Null selector
593 */
594 if (fsindex)
595 prev->fs = 0;
596 }
597 /* when next process has a 64bit base use it */
598 if (next->fs)
599 wrmsrl(MSR_FS_BASE, next->fs);
600 prev->fsindex = fsindex;
601 }
602 {
603 unsigned gsindex;
604 asm volatile("movl %%gs,%0" : "=r" (gsindex));
605 if (unlikely(gsindex | next->gsindex | prev->gs)) {
606 load_gs_index(next->gsindex);
607 if (gsindex)
608 prev->gs = 0;
609 }
610 if (next->gs)
611 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
612 prev->gsindex = gsindex;
613 }
614
Andi Kleen0a5ace22006-10-05 18:47:22 +0200615 /* Must be after DS reload */
616 unlazy_fpu(prev_p);
617
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618 /*
Jan Beulich45948d72006-03-25 16:29:25 +0100619 * Switch the PDA and FPU contexts.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620 */
621 prev->userrsp = read_pda(oldrsp);
622 write_pda(oldrsp, next->userrsp);
623 write_pda(pcurrent, next_p);
Andi Kleen18bd0572006-04-20 02:36:45 +0200624
Andi Kleena88cde12005-11-05 17:25:54 +0100625 write_pda(kernelstack,
Andi Kleen7b0bda72006-09-26 10:52:39 +0200626 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
Arjan van de Ven0a425402006-09-26 10:52:38 +0200627#ifdef CONFIG_CC_STACKPROTECTOR
628 write_pda(stack_canary, next_p->stack_canary);
629 /*
630 * Build time only check to make sure the stack_canary is at
631 * offset 40 in the pda; this is a gcc ABI requirement
632 */
633 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
634#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700635
636 /*
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200637 * Now maybe reload the debug registers and handle I/O bitmaps
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 */
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200639 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
640 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
641 __switch_to_xtra(prev_p, next_p, tss);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642
Arjan van de Vene07e23e2006-09-26 10:52:36 +0200643 /* If the task has used fpu the last 5 timeslices, just do a full
644 * restore of the math state immediately to avoid the trap; the
645 * chances of needing FPU soon are obviously high now
646 */
647 if (next_p->fpu_counter>5)
648 math_state_restore();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649 return prev_p;
650}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
                char __user * __user *envp, struct pt_regs regs)
{
        long error;
        char *filename;

        filename = getname(name);
        error = PTR_ERR(filename);
        if (IS_ERR(filename))
                return error;
        error = do_execve(filename, argv, envp, &regs);
        if (error == 0) {
                task_lock(current);
                current->ptrace &= ~PT_DTRACE;
                task_unlock(current);
        }
        putname(filename);
        return error;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64bit mode */
        clear_thread_flag(TIF_IA32);

        /* TBD: overwrites user setup. Should have two bits.
           But 64bit processes have always behaved this way,
           so it's not too bad. The main problem is just that
           32bit children are affected again. */
        current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
        return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
          void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
        if (!newsp)
                newsp = regs->rsp;
        return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
        return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
                       NULL, NULL);
}

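/*
 * get_wchan() walks the sleeping task's frame-pointer chain: starting
 * from the saved rbp at p->thread.rsp it reads the return address at
 * fp+8 of each frame until one falls outside the scheduler
 * (!in_sched_functions()), giving up after 16 frames.
 */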
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, rip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.rsp < stack || p->thread.rsp > stack + THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.rsp);
        do {
                if (fp < (unsigned long)stack ||
                    fp > (unsigned long)stack + THREAD_SIZE)
                        return 0;
                rip = *(u64 *)(fp + 8);
                if (!in_sched_functions(rip))
                        return rip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /* handle small bases via the GDT because that's faster to
                   switch. */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /* set the selector to 0 to not confuse
                                   __switch_to */
                                asm volatile("movl %0,%%fs" :: "r" (0));
                                ret = checking_wrmsrl(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        asm("movl %%gs,%0" : "=r" (gsindex));
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}
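
/*
 * Userspace reaches this via the arch_prctl(2) syscall. A minimal
 * sketch (illustrative only; a libc may not provide a wrapper, in
 * which case syscall(2) is used instead):
 *
 *      unsigned long base = 0x100000, out;
 *      arch_prctl(ARCH_SET_GS, base);
 *      arch_prctl(ARCH_GET_GS, (unsigned long)&out);  // out == base
 */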

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
        struct pt_regs *pp, ptregs;

        pp = task_pt_regs(tsk);

        ptregs = *pp;
        ptregs.cs &= 0xffff;
        ptregs.ss &= 0xffff;

        elf_core_copy_regs(regs, &ptregs);

        return 1;
}

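/*
 * Randomize the initial stack pointer by up to 8k unless the task
 * disabled randomization (ADDR_NO_RANDOMIZE) or randomize_va_space is
 * off, then 16-byte align it as the x86-64 ABI expects.
 */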
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}