blob: cf7e99895b91e81f046db10ccb9485a395b174d2 [file] [log] [blame]
/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
5
Linus Torvalds1da177e2005-04-16 15:20:36 -07006#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070021#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070023#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010024#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <asm/pgalloc.h>
29#include <asm/smp.h>
30#include <asm/tlbflush.h>
31#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
/*
 * Page fault error code bits, as tested against the error_code argument
 * of do_page_fault():
 *
 *   bit 0 ==  0: no page found         1: protection fault
 *   bit 1 ==  0: read access           1: write access
 *   bit 2 ==  0: kernel-mode access    1: user-mode access
 *   bit 3 ==                           1: use of reserved bit detected
 *   bit 4 ==                           1: fault was an instruction fetch
 */
#define PF_PROT		(1 << 0)	/* protection fault (page was present) */
#define PF_WRITE	(1 << 1)	/* access was a write */
#define PF_USER		(1 << 2)	/* access came from user mode */
#define PF_RSVD		(1 << 3)	/* reserved bit set in a paging entry */
#define PF_INSTR	(1 << 4)	/* access was an instruction fetch */
47
/*
 * Give kprobes a chance to claim a page fault.
 *
 * Only faults taken outside user mode are offered to kprobes.  Returns 1
 * when a kprobe is running on this CPU and its fault handler accepted
 * trap 14 (the page fault vector), 0 otherwise.  Without CONFIG_KPROBES
 * this always returns 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled;

	if (user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(), so pin the CPU. */
	preempt_disable();
	handled = (kprobe_running() && kprobe_fault_handler(regs, 14)) ? 1 : 0;
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070066
Harvey Harrison1dc85be2008-01-30 13:32:35 +010067/*
68 * X86_32
69 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
70 * Check that here and ignore it.
71 *
72 * X86_64
73 * Sometimes the CPU reports invalid exceptions on prefetch.
74 * Check that here and ignore it.
75 *
76 * Opcode checker based on code by Richard Brunner
77 */
78static int is_prefetch(struct pt_regs *regs, unsigned long addr,
79 unsigned long error_code)
Harvey Harrison33cb5242008-01-30 13:32:19 +010080{
Andi Kleenab2bf0c2006-12-07 02:14:06 +010081 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070082 int scan_more = 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +010083 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -070084 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070085
Harvey Harrison1dc85be2008-01-30 13:32:35 +010086#ifdef CONFIG_X86_32
87 unsigned long limit;
88 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
89 boot_cpu_data.x86 >= 6)) {
90 /* Catch an obscure case of prefetch inside an NX page. */
91 if (nx_enabled && (error_code & PF_INSTR))
92 return 0;
93 } else {
94 return 0;
95 }
96 instr = (unsigned char *)get_segment_eip(regs, &limit);
97#else
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +010099 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100 return 0;
Andi Kleendd2994f2006-09-26 10:52:33 +0200101 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100102#endif
103
Andi Kleenf1290ec2005-04-16 15:24:59 -0700104 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100106#ifdef CONFIG_X86_64
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700107 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108 return 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100109#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110
Harvey Harrison33cb5242008-01-30 13:32:19 +0100111 while (scan_more && instr < max_instr) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112 unsigned char opcode;
113 unsigned char instr_hi;
114 unsigned char instr_lo;
115
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100116#ifdef CONFIG_X86_32
117 if (instr > (unsigned char *)limit)
118 break;
119#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100120 if (probe_kernel_address(instr, opcode))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100121 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122
Harvey Harrison33cb5242008-01-30 13:32:19 +0100123 instr_hi = opcode & 0xf0;
124 instr_lo = opcode & 0x0f;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700125 instr++;
126
Harvey Harrison33cb5242008-01-30 13:32:19 +0100127 switch (instr_hi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700128 case 0x20:
129 case 0x30:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100130 /*
131 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
132 * In X86_64 long mode, the CPU will signal invalid
133 * opcode if some of these prefixes are present so
134 * X86_64 will never get here anyway
135 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136 scan_more = ((instr_lo & 7) == 0x6);
137 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100138#ifdef CONFIG_X86_64
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139 case 0x40:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100140 /*
141 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
142 * Need to figure out under what instruction mode the
143 * instruction was issued. Could check the LDT for lm,
144 * but for now it's good enough to assume that long
145 * mode only uses well known segments or kernel.
146 */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700147 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100149#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150 case 0x60:
151 /* 0x64 thru 0x67 are valid prefixes in all modes. */
152 scan_more = (instr_lo & 0xC) == 0x4;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100153 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 case 0xF0:
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100155 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156 scan_more = !instr_lo || (instr_lo>>1) == 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100157 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158 case 0x00:
159 /* Prefetch instruction is 0x0F0D or 0x0F18 */
160 scan_more = 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100161#ifdef CONFIG_X86_32
162 if (instr > (unsigned char *)limit)
163 break;
164#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100165 if (probe_kernel_address(instr, opcode))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700166 break;
167 prefetch = (instr_lo == 0xF) &&
168 (opcode == 0x0D || opcode == 0x18);
Harvey Harrison33cb5242008-01-30 13:32:19 +0100169 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700170 default:
171 scan_more = 0;
172 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100173 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700174 }
175 return prefetch;
176}
177
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100178static void force_sig_info_fault(int si_signo, int si_code,
179 unsigned long address, struct task_struct *tsk)
180{
181 siginfo_t info;
182
183 info.si_signo = si_signo;
184 info.si_errno = 0;
185 info.si_code = si_code;
186 info.si_addr = (void __user *)address;
187 force_sig_info(si_signo, &info, tsk);
188}
189
/*
 * Probe whether one unsigned long can be read at @p.
 *
 * Returns the result of probe_kernel_address(): non-zero when the read
 * fails, zero when it succeeds.  The value read is discarded.
 */
static int bad_address(void *p)
{
	unsigned long *word = (unsigned long *)p;
	unsigned long scratch;

	return probe_kernel_address(word, scratch);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195
196void dump_pagetable(unsigned long address)
197{
198 pgd_t *pgd;
199 pud_t *pud;
200 pmd_t *pmd;
201 pte_t *pte;
202
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200203 pgd = (pgd_t *)read_cr3();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700204
Harvey Harrison33cb5242008-01-30 13:32:19 +0100205 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700207 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100208 printk("PGD %lx ", pgd_val(*pgd));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100209 if (!pgd_present(*pgd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700210
Andi Kleend2ae5b52006-06-26 13:57:56 +0200211 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212 if (bad_address(pud)) goto bad;
213 printk("PUD %lx ", pud_val(*pud));
214 if (!pud_present(*pud)) goto ret;
215
216 pmd = pmd_offset(pud, address);
217 if (bad_address(pmd)) goto bad;
218 printk("PMD %lx ", pmd_val(*pmd));
Jan Beulichb1992df2007-10-19 20:35:03 +0200219 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220
221 pte = pte_offset_kernel(pmd, address);
222 if (bad_address(pte)) goto bad;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100223 printk("PTE %lx", pte_val(*pte));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224ret:
225 printk("\n");
226 return;
227bad:
228 printk("BAD\n");
229}
230
#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround to avoid
 * corruption of the 64bit RIP register on C stepping K8.  A lot of BIOS
 * that didn't get tested properly miss this.  The OS sees this as a page
 * fault with the upper 32 bits of RIP cleared.  Try to work around it
 * here.  Note we only handle faults in kernel here.
 *
 * The fault qualifies when the faulting address equals regs->ip, has its
 * upper 32 bits clear, and restoring those bits to all ones gives an
 * address inside kernel text or the module area.  In that case regs->ip
 * is repaired, a warning is printed once, and 1 is returned.  Otherwise
 * regs is left alone and 0 is returned.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	unsigned long fixed_ip;
	int in_kernel_text;
	int in_modules;

	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;

	fixed_ip = address | (0xffffffffUL << 32);
	in_kernel_text = fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext;
	in_modules = fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END;
	if (!in_kernel_text && !in_modules)
		return 0;

	if (!warned) {
		printk(errata93_warning);
		warned = 1;
	}
	regs->ip = fixed_ip;
	return 1;
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266
Linus Torvalds1da177e2005-04-16 15:20:36 -0700267static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
268 unsigned long error_code)
269{
Jan Beulich12091402005-09-12 18:49:24 +0200270 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100271 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200272
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
274 current->comm, address);
275 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100276 tsk = current;
277 tsk->thread.cr2 = address;
278 tsk->thread.trap_no = 14;
279 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100280 if (__die("Bad pagetable", regs, error_code))
281 regs = NULL;
282 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283}
284
285/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100286 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700287 *
288 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 */
290static int vmalloc_fault(unsigned long address)
291{
292 pgd_t *pgd, *pgd_ref;
293 pud_t *pud, *pud_ref;
294 pmd_t *pmd, *pmd_ref;
295 pte_t *pte, *pte_ref;
296
297 /* Copy kernel mappings over when needed. This can also
298 happen within a race in page table update. In the later
299 case just flush. */
300
301 pgd = pgd_offset(current->mm ?: &init_mm, address);
302 pgd_ref = pgd_offset_k(address);
303 if (pgd_none(*pgd_ref))
304 return -1;
305 if (pgd_none(*pgd))
306 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100307 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700308 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309
310 /* Below here mismatches are bugs because these lower tables
311 are shared */
312
313 pud = pud_offset(pgd, address);
314 pud_ref = pud_offset(pgd_ref, address);
315 if (pud_none(*pud_ref))
316 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700317 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318 BUG();
319 pmd = pmd_offset(pud, address);
320 pmd_ref = pmd_offset(pud_ref, address);
321 if (pmd_none(*pmd_ref))
322 return -1;
323 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
324 BUG();
325 pte_ref = pte_offset_kernel(pmd_ref, address);
326 if (!pte_present(*pte_ref))
327 return -1;
328 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700329 /* Don't use pte_page here, because the mappings can point
330 outside mem_map, and the NUMA hash lookup cannot handle
331 that. */
332 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700334 return 0;
335}
336
/*
 * When non-zero, do_page_fault() logs a (rate-limited) "segfault at ..."
 * line for user-mode faults whose SIGSEGV is unhandled by the task.
 */
int show_unhandled_signals = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338
339/*
340 * This routine handles page faults. It determines the address,
341 * and the problem, and then passes it off to one of the appropriate
342 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700344asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
345 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346{
347 struct task_struct *tsk;
348 struct mm_struct *mm;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100349 struct vm_area_struct *vma;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 unsigned long address;
Nick Piggin83c54072007-07-19 01:47:05 -0700351 int write, fault;
Jan Beulich12091402005-09-12 18:49:24 +0200352 unsigned long flags;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100353 int si_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354
Peter Zijlstra143a5d32007-10-25 14:01:10 +0200355 /*
356 * We can fault from pretty much anywhere, with unknown IRQ state.
357 */
358 trace_hardirqs_fixup();
359
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100360 tsk = current;
361 mm = tsk->mm;
362 prefetchw(&mm->mmap_sem);
363
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364 /* get the address */
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200365 address = read_cr2();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100367 si_code = SEGV_MAPERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368
369
370 /*
371 * We fault-in kernel-space virtual memory on-demand. The
372 * 'reference' page table is init_mm.pgd.
373 *
374 * NOTE! We MUST NOT take any locks for this case. We may
375 * be in an interrupt or a critical region, and should
376 * only copy the information from the master page table,
377 * nothing more.
378 *
379 * This verifies that the fault happens in kernel space
380 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100381 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382 */
Suresh Siddha84929802005-06-21 17:14:32 -0700383 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100384 /*
385 * Don't check for the module range here: its PML4
386 * is always initialized because it's shared with the main
387 * kernel text. Only vmalloc may need PML4 syncups.
388 */
Andi Kleen66c58152006-01-11 22:44:09 +0100389 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100390 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100391 if (vmalloc_fault(address) >= 0)
392 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393 }
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700394 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100395 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 /*
397 * Don't take the mm semaphore here. If we fixup a prefetch
398 * fault we could otherwise deadlock.
399 */
400 goto bad_area_nosemaphore;
401 }
402
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700403 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100404 return;
405
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100406 if (likely(regs->flags & X86_EFLAGS_IF))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100407 local_irq_enable();
408
Andi Kleen66c58152006-01-11 22:44:09 +0100409 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700410 pgtable_bad(address, regs, error_code);
411
412 /*
Harvey Harrison33cb5242008-01-30 13:32:19 +0100413 * If we're in an interrupt, have no user context or are running in an
414 * atomic region then we must not take the fault.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415 */
416 if (unlikely(in_atomic() || !mm))
417 goto bad_area_nosemaphore;
418
Linus Torvaldsdbe3ed12007-09-19 11:37:14 -0700419 /*
420 * User-mode registers count as a user access even for any
421 * potential system fault or CPU buglet.
422 */
423 if (user_mode_vm(regs))
424 error_code |= PF_USER;
425
Linus Torvalds1da177e2005-04-16 15:20:36 -0700426 again:
427 /* When running in the kernel we expect faults to occur only to
428 * addresses in user space. All other faults represent errors in the
Simon Arlott676b1852007-10-20 01:25:36 +0200429 * kernel and should generate an OOPS. Unfortunately, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200430 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431 * we will deadlock attempting to validate the fault against the
432 * address space. Luckily the kernel only validly references user
433 * space from well defined areas of code, which are listed in the
434 * exceptions table.
435 *
436 * As the vast majority of faults will be valid we will only perform
Simon Arlott676b1852007-10-20 01:25:36 +0200437 * the source reference check when there is a possibility of a deadlock.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700438 * Attempt to lock the address space, if we cannot we then validate the
439 * source. If this is invalid we can skip the address space check,
440 * thus avoiding the deadlock.
441 */
442 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100443 if ((error_code & PF_USER) == 0 &&
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100444 !search_exception_tables(regs->ip))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 goto bad_area_nosemaphore;
446 down_read(&mm->mmap_sem);
447 }
448
449 vma = find_vma(mm, address);
450 if (!vma)
451 goto bad_area;
452 if (likely(vma->vm_start <= address))
453 goto good_area;
454 if (!(vma->vm_flags & VM_GROWSDOWN))
455 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100456 if (error_code & PF_USER) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200457 /* Allow userspace just enough access below the stack pointer
458 * to let the 'enter' instruction work.
459 */
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100460 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700461 goto bad_area;
462 }
463 if (expand_stack(vma, address))
464 goto bad_area;
465/*
466 * Ok, we have a good vm_area for this memory access, so
467 * we can handle it..
468 */
469good_area:
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100470 si_code = SEGV_ACCERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100472 switch (error_code & (PF_PROT|PF_WRITE)) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100473 default: /* 3: write, present */
474 /* fall through */
475 case PF_WRITE: /* write, not present */
476 if (!(vma->vm_flags & VM_WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700477 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100478 write++;
479 break;
480 case PF_PROT: /* read, present */
481 goto bad_area;
482 case 0: /* read, not present */
483 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
484 goto bad_area;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 }
486
487 /*
488 * If for any reason at all we couldn't handle the fault,
489 * make sure we exit gracefully rather than endlessly redo
490 * the fault.
491 */
Nick Piggin83c54072007-07-19 01:47:05 -0700492 fault = handle_mm_fault(mm, vma, address, write);
493 if (unlikely(fault & VM_FAULT_ERROR)) {
494 if (fault & VM_FAULT_OOM)
495 goto out_of_memory;
496 else if (fault & VM_FAULT_SIGBUS)
497 goto do_sigbus;
498 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 }
Nick Piggin83c54072007-07-19 01:47:05 -0700500 if (fault & VM_FAULT_MAJOR)
501 tsk->maj_flt++;
502 else
503 tsk->min_flt++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700504 up_read(&mm->mmap_sem);
505 return;
506
507/*
508 * Something tried to access memory that isn't in our memory map..
509 * Fix it, but check if it's kernel or user first..
510 */
511bad_area:
512 up_read(&mm->mmap_sem);
513
514bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100516 if (error_code & PF_USER) {
Steven Rostedte5e3c842007-06-06 23:34:04 -0400517
518 /*
519 * It's possible to have interrupts off here.
520 */
521 local_irq_enable();
522
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523 if (is_prefetch(regs, address, error_code))
524 return;
525
526 /* Work around K8 erratum #100 K8 in compat mode
527 occasionally jumps to illegal addresses >4GB. We
528 catch this here in the page fault handler because
529 these addresses are not reachable. Just detect this
530 case and return. Any code segment in LDT is
531 compatibility mode. */
532 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
533 (address >> 32))
534 return;
535
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200536 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
537 printk_ratelimit()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538 printk(
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100539 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100541 tsk->comm, tsk->pid, address, regs->ip,
542 regs->sp, error_code);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700543 }
Harvey Harrison33cb5242008-01-30 13:32:19 +0100544
Linus Torvalds1da177e2005-04-16 15:20:36 -0700545 tsk->thread.cr2 = address;
546 /* Kernel addresses are always protection faults */
547 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
548 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100549
550 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700551 return;
552 }
553
554no_context:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555 /* Are we prepared to handle this kernel fault? */
Harvey Harrison33cb5242008-01-30 13:32:19 +0100556 if (fixup_exception(regs))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700558
Harvey Harrison33cb5242008-01-30 13:32:19 +0100559 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560 * Hall of shame of CPU/BIOS bugs.
561 */
562
Harvey Harrison33cb5242008-01-30 13:32:19 +0100563 if (is_prefetch(regs, address, error_code))
564 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700565
566 if (is_errata93(regs, address))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100567 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700568
569/*
570 * Oops. The kernel tried to access some bad page. We'll have to
571 * terminate things with extreme prejudice.
572 */
573
Jan Beulich12091402005-09-12 18:49:24 +0200574 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700575
576 if (address < PAGE_SIZE)
577 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
578 else
579 printk(KERN_ALERT "Unable to handle kernel paging request");
Harvey Harrison33cb5242008-01-30 13:32:19 +0100580 printk(" at %016lx RIP: \n" KERN_ALERT, address);
Arjan van de Venbc850d62008-01-30 13:33:07 +0100581 printk_address(regs->ip, regs->bp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700582 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100583 tsk->thread.cr2 = address;
584 tsk->thread.trap_no = 14;
585 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100586 if (__die("Oops", regs, error_code))
587 regs = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700588 /* Executive summary in case the body of the oops scrolled away */
589 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich22f59912008-01-30 13:31:23 +0100590 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700591
592/*
593 * We ran out of memory, or some other thing happened to us that made
594 * us unable to handle the page fault gracefully.
595 */
596out_of_memory:
597 up_read(&mm->mmap_sem);
Serge E. Hallynb460cbc2007-10-18 23:39:52 -0700598 if (is_global_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700599 yield();
600 goto again;
601 }
602 printk("VM: killing process %s\n", tsk->comm);
Harvey Harrison318aa292008-01-30 13:32:59 +0100603 if (error_code & PF_USER)
Will Schmidt021daae2007-07-21 17:11:17 +0200604 do_group_exit(SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 goto no_context;
606
607do_sigbus:
608 up_read(&mm->mmap_sem);
609
610 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100611 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612 goto no_context;
613
614 tsk->thread.cr2 = address;
615 tsk->thread.error_code = error_code;
616 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100617 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618 return;
619}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100620
/*
 * pgd_list links, through page->lru, the pages that hold top-level page
 * tables; vmalloc_sync_all() walks it with pgd_lock held.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100623
624void vmalloc_sync_all(void)
625{
Harvey Harrison33cb5242008-01-30 13:32:19 +0100626 /* Note that races in the updates of insync and start aren't
Jan Beulich8c914cb2006-03-25 16:29:40 +0100627 problematic:
628 insync can only get set bits added, and updates to start are only
629 improving performance (without affecting correctness if undone). */
630 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
631 static unsigned long start = VMALLOC_START & PGDIR_MASK;
632 unsigned long address;
633
634 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
635 if (!test_bit(pgd_index(address), insync)) {
636 const pgd_t *pgd_ref = pgd_offset_k(address);
637 struct page *page;
638
639 if (pgd_none(*pgd_ref))
640 continue;
641 spin_lock(&pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200642 list_for_each_entry(page, &pgd_list, lru) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100643 pgd_t *pgd;
644 pgd = (pgd_t *)page_address(page) + pgd_index(address);
645 if (pgd_none(*pgd))
646 set_pgd(pgd, *pgd_ref);
647 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700648 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100649 }
650 spin_unlock(&pgd_lock);
651 set_bit(pgd_index(address), insync);
652 }
653 if (address == start)
654 start = address + PGDIR_SIZE;
655 }
656 /* Check that there is no need to do the same for the modules area. */
657 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100658 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
Jan Beulich8c914cb2006-03-25 16:29:40 +0100659 (__START_KERNEL & PGDIR_MASK)));
660}