blob: fd6ecc2cee38573d8483d377c58749b511f689ec [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
Linus Torvalds1da177e2005-04-16 15:20:36 -07008#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/smp_lock.h>
19#include <linux/interrupt.h>
20#include <linux/init.h>
21#include <linux/tty.h>
22#include <linux/vt_kern.h> /* For unblank_screen() */
23#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070024#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070025#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070026#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010027#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070028#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
30#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070031#include <asm/pgalloc.h>
32#include <asm/smp.h>
33#include <asm/tlbflush.h>
34#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070035#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070036
Andi Kleen66c58152006-01-11 22:44:09 +010037/* Page fault error code bits */
38#define PF_PROT (1<<0) /* or no page found */
39#define PF_WRITE (1<<1)
40#define PF_USER (1<<2)
41#define PF_RSVD (1<<3)
42#define PF_INSTR (1<<4)
43
/* Atomic notifier chain run on every page fault (consumers: e.g. kprobes). */
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
46/* Hook to register for page fault notifications */
47int register_page_fault_notifier(struct notifier_block *nb)
48{
49 vmalloc_sync_all();
50 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
51}
Andi Kleen273819a2006-09-26 10:52:35 +020052EXPORT_SYMBOL_GPL(register_page_fault_notifier);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070053
54int unregister_page_fault_notifier(struct notifier_block *nb)
55{
56 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
57}
Andi Kleen273819a2006-09-26 10:52:35 +020058EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070059
Jan Beulich9b355892007-02-13 13:26:23 +010060static inline int notify_page_fault(struct pt_regs *regs, long err)
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070061{
62 struct die_args args = {
63 .regs = regs,
Jan Beulich9b355892007-02-13 13:26:23 +010064 .str = "page fault",
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070065 .err = err,
Jan Beulich9b355892007-02-13 13:26:23 +010066 .trapnr = 14,
67 .signr = SIGSEGV
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070068 };
Jan Beulich9b355892007-02-13 13:26:23 +010069 return atomic_notifier_call_chain(&notify_page_fault_chain,
70 DIE_PAGE_FAULT, &args);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070071}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070072
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore.
   Opcode checker based on code by Richard Brunner */
/*
 * Decide whether the faulting instruction at regs->rip is a (possibly
 * prefixed) PREFETCH instruction (opcodes 0F 0D / 0F 18), which some
 * CPUs erroneously fault on.  Returns non-zero if the fault should be
 * silently ignored.
 *
 * Note: @addr is currently unused here; the decision is made purely
 * from the instruction bytes and @error_code.
 */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;	/* keep scanning while we see valid prefixes */
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was a exec fault ignore */
	/* (instruction fetch faults can't be spurious prefetch faults) */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
	/* x86 instructions are at most 15 bytes long */
	max_instr = instr + 15;

	/* Don't scan kernel addresses on behalf of a user-mode fault. */
	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	/* Walk prefix bytes one at a time until the opcode proper. */
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		/* Safely read one instruction byte; bail on fault. */
		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes. In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			/* Read the second opcode byte (after 0x0F). */
			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			/* Anything else: a real opcode, not a prefetch. */
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
149
/*
 * Probe whether the kernel can read one word at @p without faulting.
 * Returns non-zero when the address is NOT safely readable.
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
155
/*
 * Print the page table walk (PGD/PUD/PMD/PTE raw values) for @address,
 * starting from the live CR3 of the current CPU.  Each level is probed
 * with bad_address() first, so this is safe to call on a corrupted
 * table: an unreadable entry prints "BAD", a non-present entry ends
 * the walk after printing that level.
 */
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Read the physical PGD base straight out of CR3. */
	asm("movq %%cr3,%0" : "=r" (pgd));

	/* Mask off CR3 control bits, convert to a virtual address. */
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
190
191static const char errata93_warning[] =
192KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
193KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
194KERN_ERR "******* Please consider a BIOS update.\n"
195KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
196
197/* Workaround for K8 erratum #93 & buggy BIOS.
198 BIOS SMM functions are required to use a specific workaround
199 to avoid corruption of the 64bit RIP register on C stepping K8.
200 A lot of BIOS that didn't get tested properly miss this.
201 The OS sees this as a page fault with the upper 32bits of RIP cleared.
202 Try to work around it here.
203 Note we only handle faults in kernel here. */
204
205static int is_errata93(struct pt_regs *regs, unsigned long address)
206{
207 static int warned;
208 if (address != regs->rip)
209 return 0;
210 if ((address >> 32) != 0)
211 return 0;
212 address |= 0xffffffffUL << 32;
213 if ((address >= (u64)_stext && address <= (u64)_etext) ||
214 (address >= MODULES_VADDR && address <= MODULES_END)) {
215 if (!warned) {
216 printk(errata93_warning);
217 warned = 1;
218 }
219 regs->rip = address;
220 return 1;
221 }
222 return 0;
223}
224
225int unhandled_signal(struct task_struct *tsk, int sig)
226{
Sukadev Bhattiproluf400e192006-09-29 02:00:07 -0700227 if (is_init(tsk))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228 return 1;
Andi Kleen5e5ec102005-08-19 06:56:04 +0200229 if (tsk->ptrace & PT_PTRACED)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700230 return 0;
231 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
232 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
233}
234
235static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
236 unsigned long error_code)
237{
Jan Beulich12091402005-09-12 18:49:24 +0200238 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100239 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200240
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
242 current->comm, address);
243 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100244 tsk = current;
245 tsk->thread.cr2 = address;
246 tsk->thread.trap_no = 14;
247 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248 __die("Bad pagetable", regs, error_code);
Jan Beulich12091402005-09-12 18:49:24 +0200249 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250 do_exit(SIGKILL);
251}
252
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 when the fault was resolved by copying the missing PGD
 * entry from the reference (init_mm) page table, -1 when the address
 * has no valid kernel mapping and the fault must be treated as a bad
 * access.  BUG()s on any mismatch below the PGD level, since those
 * lower tables are shared and must always agree.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	/* Kernel threads may have no mm; fall back to init_mm's PGD. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
304
/* Set by the "pagefaulttrace" boot option: printk every page fault. */
int page_fault_trace = 0;
/* When set, log user segfaults that have no handler installed. */
int exception_trace = 1;
307
308/*
309 * This routine handles page faults. It determines the address,
310 * and the problem, and then passes it off to one of the appropriate
311 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700312 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700313asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
314 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315{
316 struct task_struct *tsk;
317 struct mm_struct *mm;
318 struct vm_area_struct * vma;
319 unsigned long address;
320 const struct exception_table_entry *fixup;
321 int write;
Jan Beulich12091402005-09-12 18:49:24 +0200322 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323 siginfo_t info;
324
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100325 tsk = current;
326 mm = tsk->mm;
327 prefetchw(&mm->mmap_sem);
328
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 /* get the address */
330 __asm__("movq %%cr2,%0":"=r" (address));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 info.si_code = SEGV_MAPERR;
333
334
335 /*
336 * We fault-in kernel-space virtual memory on-demand. The
337 * 'reference' page table is init_mm.pgd.
338 *
339 * NOTE! We MUST NOT take any locks for this case. We may
340 * be in an interrupt or a critical region, and should
341 * only copy the information from the master page table,
342 * nothing more.
343 *
344 * This verifies that the fault happens in kernel space
345 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100346 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347 */
Suresh Siddha84929802005-06-21 17:14:32 -0700348 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100349 /*
350 * Don't check for the module range here: its PML4
351 * is always initialized because it's shared with the main
352 * kernel text. Only vmalloc may need PML4 syncups.
353 */
Andi Kleen66c58152006-01-11 22:44:09 +0100354 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100355 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100356 if (vmalloc_fault(address) >= 0)
357 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358 }
Jan Beulich9b355892007-02-13 13:26:23 +0100359 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
Jan Beulich8c914cb2006-03-25 16:29:40 +0100360 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700361 /*
362 * Don't take the mm semaphore here. If we fixup a prefetch
363 * fault we could otherwise deadlock.
364 */
365 goto bad_area_nosemaphore;
366 }
367
Jan Beulich9b355892007-02-13 13:26:23 +0100368 if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
Jan Beulich8c914cb2006-03-25 16:29:40 +0100369 return;
370
371 if (likely(regs->eflags & X86_EFLAGS_IF))
372 local_irq_enable();
373
374 if (unlikely(page_fault_trace))
375 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
376 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
377
Andi Kleen66c58152006-01-11 22:44:09 +0100378 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 pgtable_bad(address, regs, error_code);
380
381 /*
382 * If we're in an interrupt or have no user
383 * context, we must not take the fault..
384 */
385 if (unlikely(in_atomic() || !mm))
386 goto bad_area_nosemaphore;
387
388 again:
389 /* When running in the kernel we expect faults to occur only to
390 * addresses in user space. All other faults represent errors in the
391 * kernel and should generate an OOPS. Unfortunatly, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200392 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393 * we will deadlock attempting to validate the fault against the
394 * address space. Luckily the kernel only validly references user
395 * space from well defined areas of code, which are listed in the
396 * exceptions table.
397 *
398 * As the vast majority of faults will be valid we will only perform
399 * the source reference check when there is a possibilty of a deadlock.
400 * Attempt to lock the address space, if we cannot we then validate the
401 * source. If this is invalid we can skip the address space check,
402 * thus avoiding the deadlock.
403 */
404 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100405 if ((error_code & PF_USER) == 0 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700406 !search_exception_tables(regs->rip))
407 goto bad_area_nosemaphore;
408 down_read(&mm->mmap_sem);
409 }
410
411 vma = find_vma(mm, address);
412 if (!vma)
413 goto bad_area;
414 if (likely(vma->vm_start <= address))
415 goto good_area;
416 if (!(vma->vm_flags & VM_GROWSDOWN))
417 goto bad_area;
418 if (error_code & 4) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200419 /* Allow userspace just enough access below the stack pointer
420 * to let the 'enter' instruction work.
421 */
422 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700423 goto bad_area;
424 }
425 if (expand_stack(vma, address))
426 goto bad_area;
427/*
428 * Ok, we have a good vm_area for this memory access, so
429 * we can handle it..
430 */
431good_area:
432 info.si_code = SEGV_ACCERR;
433 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100434 switch (error_code & (PF_PROT|PF_WRITE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435 default: /* 3: write, present */
436 /* fall through */
Andi Kleen66c58152006-01-11 22:44:09 +0100437 case PF_WRITE: /* write, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700438 if (!(vma->vm_flags & VM_WRITE))
439 goto bad_area;
440 write++;
441 break;
Andi Kleen66c58152006-01-11 22:44:09 +0100442 case PF_PROT: /* read, present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700443 goto bad_area;
Andi Kleen66c58152006-01-11 22:44:09 +0100444 case 0: /* read, not present */
Jason Barondf67b3d2006-09-29 01:58:58 -0700445 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 goto bad_area;
447 }
448
449 /*
450 * If for any reason at all we couldn't handle the fault,
451 * make sure we exit gracefully rather than endlessly redo
452 * the fault.
453 */
454 switch (handle_mm_fault(mm, vma, address, write)) {
Alexander Nyberg96800212005-08-04 16:14:57 +0200455 case VM_FAULT_MINOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700456 tsk->min_flt++;
457 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200458 case VM_FAULT_MAJOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 tsk->maj_flt++;
460 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200461 case VM_FAULT_SIGBUS:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462 goto do_sigbus;
463 default:
464 goto out_of_memory;
465 }
466
467 up_read(&mm->mmap_sem);
468 return;
469
470/*
471 * Something tried to access memory that isn't in our memory map..
472 * Fix it, but check if it's kernel or user first..
473 */
474bad_area:
475 up_read(&mm->mmap_sem);
476
477bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100479 if (error_code & PF_USER) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700480 if (is_prefetch(regs, address, error_code))
481 return;
482
483 /* Work around K8 erratum #100 K8 in compat mode
484 occasionally jumps to illegal addresses >4GB. We
485 catch this here in the page fault handler because
486 these addresses are not reachable. Just detect this
487 case and return. Any code segment in LDT is
488 compatibility mode. */
489 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
490 (address >> 32))
491 return;
492
493 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
494 printk(
495 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
496 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
497 tsk->comm, tsk->pid, address, regs->rip,
498 regs->rsp, error_code);
499 }
500
501 tsk->thread.cr2 = address;
502 /* Kernel addresses are always protection faults */
503 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
504 tsk->thread.trap_no = 14;
505 info.si_signo = SIGSEGV;
506 info.si_errno = 0;
507 /* info.si_code has been set above */
508 info.si_addr = (void __user *)address;
509 force_sig_info(SIGSEGV, &info, tsk);
510 return;
511 }
512
513no_context:
514
515 /* Are we prepared to handle this kernel fault? */
516 fixup = search_exception_tables(regs->rip);
517 if (fixup) {
518 regs->rip = fixup->fixup;
519 return;
520 }
521
522 /*
523 * Hall of shame of CPU/BIOS bugs.
524 */
525
526 if (is_prefetch(regs, address, error_code))
527 return;
528
529 if (is_errata93(regs, address))
530 return;
531
532/*
533 * Oops. The kernel tried to access some bad page. We'll have to
534 * terminate things with extreme prejudice.
535 */
536
Jan Beulich12091402005-09-12 18:49:24 +0200537 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538
539 if (address < PAGE_SIZE)
540 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
541 else
542 printk(KERN_ALERT "Unable to handle kernel paging request");
543 printk(" at %016lx RIP: \n" KERN_ALERT,address);
544 printk_address(regs->rip);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700545 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100546 tsk->thread.cr2 = address;
547 tsk->thread.trap_no = 14;
548 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549 __die("Oops", regs, error_code);
550 /* Executive summary in case the body of the oops scrolled away */
551 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich12091402005-09-12 18:49:24 +0200552 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700553 do_exit(SIGKILL);
554
555/*
556 * We ran out of memory, or some other thing happened to us that made
557 * us unable to handle the page fault gracefully.
558 */
559out_of_memory:
560 up_read(&mm->mmap_sem);
Sukadev Bhattiproluf400e192006-09-29 02:00:07 -0700561 if (is_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700562 yield();
563 goto again;
564 }
565 printk("VM: killing process %s\n", tsk->comm);
566 if (error_code & 4)
567 do_exit(SIGKILL);
568 goto no_context;
569
570do_sigbus:
571 up_read(&mm->mmap_sem);
572
573 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100574 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700575 goto no_context;
576
577 tsk->thread.cr2 = address;
578 tsk->thread.error_code = error_code;
579 tsk->thread.trap_no = 14;
580 info.si_signo = SIGBUS;
581 info.si_errno = 0;
582 info.si_code = BUS_ADRERR;
583 info.si_addr = (void __user *)address;
584 force_sig_info(SIGBUS, &info, tsk);
585 return;
586}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100587
/*
 * pgd_lock protects pgd_list, which is walked by vmalloc_sync_all()
 * below to propagate kernel PGD entries into every process's page
 * directory.  NOTE(review): entries are presumably added/removed where
 * pgds are allocated/freed, outside this file — confirm against
 * pgd_ctor/pgd_dtor.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100590
/*
 * Copy every populated kernel PGD entry covering the vmalloc range
 * into the page directory of every process on pgd_list, so that no
 * process can subsequently take a vmalloc fault for those ranges
 * (used by register_page_fault_notifier() above).
 */
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		/* Skip PGD slots already synced by an earlier call. */
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			/* Nothing mapped at the reference level: nothing to copy. */
			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			/* Copy the entry into every process's pgd page. */
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					/* Already populated: must match the reference. */
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		/* Advance the cached scan start past fully-synced slots. */
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}
628
/*
 * Boot-option handler for "pagefaulttrace": turn on per-fault logging
 * in do_page_fault().  Returns 1 to mark the option as consumed.
 */
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);