blob: df13487aa83a7b481b1f05a8682db504c139801c [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4 */
5
Linus Torvalds1da177e2005-04-16 15:20:36 -07006#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070021#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070023#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010024#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <asm/pgalloc.h>
29#include <asm/smp.h>
30#include <asm/tlbflush.h>
31#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
/*
 * Bits of the page fault error code handed to do_page_fault():
 *
 *   bit 0	0: no page found		1: protection fault
 *   bit 1	0: read access			1: write access
 *   bit 2	0: kernel-mode access		1: user-mode access
 *   bit 3					1: use of reserved bit detected
 *   bit 4					1: fault was an instruction fetch
 */
#define PF_PROT		(1 << 0)
#define PF_WRITE	(1 << 1)
#define PF_USER		(1 << 2)
#define PF_RSVD		(1 << 3)
#define PF_INSTR	(1 << 4)
47
/*
 * Give kprobes a chance to claim a page fault taken in kernel mode.
 *
 * Returns 1 when a kprobe is currently running on this CPU and its fault
 * handler accepted the fault (trap number 14), 0 otherwise.  Faults taken
 * in user mode, and kernels built without CONFIG_KPROBES, always yield 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled;

	if (user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(), so pin the CPU. */
	preempt_disable();
	handled = kprobe_running() && kprobe_fault_handler(regs, 14);
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070066
Harvey Harrison1dc85be2008-01-30 13:32:35 +010067/*
68 * X86_32
69 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
70 * Check that here and ignore it.
71 *
72 * X86_64
73 * Sometimes the CPU reports invalid exceptions on prefetch.
74 * Check that here and ignore it.
75 *
76 * Opcode checker based on code by Richard Brunner
77 */
78static int is_prefetch(struct pt_regs *regs, unsigned long addr,
79 unsigned long error_code)
Harvey Harrison33cb5242008-01-30 13:32:19 +010080{
Andi Kleenab2bf0c2006-12-07 02:14:06 +010081 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070082 int scan_more = 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +010083 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -070084 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070085
Harvey Harrison1dc85be2008-01-30 13:32:35 +010086#ifdef CONFIG_X86_32
Harvey Harrison1dc85be2008-01-30 13:32:35 +010087 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
88 boot_cpu_data.x86 >= 6)) {
89 /* Catch an obscure case of prefetch inside an NX page. */
90 if (nx_enabled && (error_code & PF_INSTR))
91 return 0;
92 } else {
93 return 0;
94 }
Harvey Harrison1dc85be2008-01-30 13:32:35 +010095#else
Linus Torvalds1da177e2005-04-16 15:20:36 -070096 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +010097 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 return 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +010099#endif
100
Harvey Harrisonf2857ce2008-01-30 13:33:12 +0100101 instr = (unsigned char *)convert_ip_to_linear(current, regs);
Andi Kleenf1290ec2005-04-16 15:24:59 -0700102 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700104 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105 return 0;
106
Harvey Harrison33cb5242008-01-30 13:32:19 +0100107 while (scan_more && instr < max_instr) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108 unsigned char opcode;
109 unsigned char instr_hi;
110 unsigned char instr_lo;
111
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100112 if (probe_kernel_address(instr, opcode))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100113 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
Harvey Harrison33cb5242008-01-30 13:32:19 +0100115 instr_hi = opcode & 0xf0;
116 instr_lo = opcode & 0x0f;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117 instr++;
118
Harvey Harrison33cb5242008-01-30 13:32:19 +0100119 switch (instr_hi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120 case 0x20:
121 case 0x30:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100122 /*
123 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
124 * In X86_64 long mode, the CPU will signal invalid
125 * opcode if some of these prefixes are present so
126 * X86_64 will never get here anyway
127 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700128 scan_more = ((instr_lo & 7) == 0x6);
129 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100130#ifdef CONFIG_X86_64
Linus Torvalds1da177e2005-04-16 15:20:36 -0700131 case 0x40:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100132 /*
133 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
134 * Need to figure out under what instruction mode the
135 * instruction was issued. Could check the LDT for lm,
136 * but for now it's good enough to assume that long
137 * mode only uses well known segments or kernel.
138 */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700139 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100141#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142 case 0x60:
143 /* 0x64 thru 0x67 are valid prefixes in all modes. */
144 scan_more = (instr_lo & 0xC) == 0x4;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100145 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700146 case 0xF0:
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100147 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 scan_more = !instr_lo || (instr_lo>>1) == 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100149 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150 case 0x00:
151 /* Prefetch instruction is 0x0F0D or 0x0F18 */
152 scan_more = 0;
Harvey Harrisonf2857ce2008-01-30 13:33:12 +0100153
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100154 if (probe_kernel_address(instr, opcode))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155 break;
156 prefetch = (instr_lo == 0xF) &&
157 (opcode == 0x0D || opcode == 0x18);
Harvey Harrison33cb5242008-01-30 13:32:19 +0100158 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 default:
160 scan_more = 0;
161 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100162 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163 }
164 return prefetch;
165}
166
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100167static void force_sig_info_fault(int si_signo, int si_code,
168 unsigned long address, struct task_struct *tsk)
169{
170 siginfo_t info;
171
172 info.si_signo = si_signo;
173 info.si_errno = 0;
174 info.si_code = si_code;
175 info.si_addr = (void __user *)address;
176 force_sig_info(si_signo, &info, tsk);
177}
178
/*
 * Probe whether an unsigned long can be read from @p.
 * Returns the result of probe_kernel_address(): zero when the read
 * succeeded, non-zero when it faulted.
 */
static int bad_address(void *p)
{
	unsigned long scratch;
	int faulted = probe_kernel_address((unsigned long *)p, scratch);

	return faulted;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184
185void dump_pagetable(unsigned long address)
186{
187 pgd_t *pgd;
188 pud_t *pud;
189 pmd_t *pmd;
190 pte_t *pte;
191
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200192 pgd = (pgd_t *)read_cr3();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700193
Harvey Harrison33cb5242008-01-30 13:32:19 +0100194 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100197 printk("PGD %lx ", pgd_val(*pgd));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100198 if (!pgd_present(*pgd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700199
Andi Kleend2ae5b52006-06-26 13:57:56 +0200200 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700201 if (bad_address(pud)) goto bad;
202 printk("PUD %lx ", pud_val(*pud));
203 if (!pud_present(*pud)) goto ret;
204
205 pmd = pmd_offset(pud, address);
206 if (bad_address(pmd)) goto bad;
207 printk("PMD %lx ", pmd_val(*pmd));
Jan Beulichb1992df2007-10-19 20:35:03 +0200208 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209
210 pte = pte_offset_kernel(pmd, address);
211 if (bad_address(pte)) goto bad;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100212 printk("PTE %lx", pte_val(*pte));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213ret:
214 printk("\n");
215 return;
216bad:
217 printk("BAD\n");
218}
219
#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround to avoid
 * corruption of the 64bit RIP register on C stepping K8.  A lot of BIOS
 * that didn't get tested properly miss this.  The OS sees this as a page
 * fault with the upper 32bits of RIP cleared.  Try to work around it here.
 *
 * The fault is treated as the erratum when the faulting address equals
 * regs->ip, its upper 32 bits are zero, and setting those bits to all
 * ones yields an address inside kernel text or the modules area.  In that
 * case regs->ip is repaired, a one-time warning is printed and 1 is
 * returned; otherwise 0 is returned.
 *
 * Note we only handle faults in kernel here.
 * Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	unsigned long fixed_ip;
	int in_kernel_text;
	int in_modules;

	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	/* Restore the upper half that the erratum cleared. */
	fixed_ip = address | (0xffffffffUL << 32);

	in_kernel_text = fixed_ip >= (u64)_stext && fixed_ip <= (u64)_etext;
	in_modules = fixed_ip >= MODULES_VADDR && fixed_ip <= MODULES_END;

	if (in_kernel_text || in_modules) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = fixed_ip;
		return 1;
	}
#endif
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258
void do_invalid_op(struct pt_regs *, unsigned long);

/*
 * Pentium F0 0F C7 C8 bug workaround.
 *
 * On CPUs flagged with f00f_bug, a fault whose address lies in IDT slot 6
 * (offset from idt_descr.address divided by 8) is forwarded to
 * do_invalid_op() and 1 is returned.  In every other case, including
 * kernels built without CONFIG_X86_F00F_BUG, 0 is returned.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (boot_cpu_data.f00f_bug) {
		/* Index of the IDT entry the faulting address falls into. */
		unsigned long slot = (address - idt_descr.address) >> 3;

		if (slot == 6) {
			do_invalid_op(regs, 0);
			return 1;
		}
	}
#endif
	return 0;
}
279
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
281 unsigned long error_code)
282{
Jan Beulich12091402005-09-12 18:49:24 +0200283 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100284 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200285
Linus Torvalds1da177e2005-04-16 15:20:36 -0700286 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
287 current->comm, address);
288 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100289 tsk = current;
290 tsk->thread.cr2 = address;
291 tsk->thread.trap_no = 14;
292 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100293 if (__die("Bad pagetable", regs, error_code))
294 regs = NULL;
295 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296}
297
298/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100299 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700300 *
301 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302 */
303static int vmalloc_fault(unsigned long address)
304{
Harvey Harrisonfdfe8aa2008-01-30 13:33:13 +0100305#ifdef CONFIG_X86_32
306 unsigned long pgd_paddr;
307 pmd_t *pmd_k;
308 pte_t *pte_k;
309 /*
310 * Synchronize this task's top level page-table
311 * with the 'reference' page table.
312 *
313 * Do _not_ use "current" here. We might be inside
314 * an interrupt in the middle of a task switch..
315 */
316 pgd_paddr = read_cr3();
317 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
318 if (!pmd_k)
319 return -1;
320 pte_k = pte_offset_kernel(pmd_k, address);
321 if (!pte_present(*pte_k))
322 return -1;
323 return 0;
324#else
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325 pgd_t *pgd, *pgd_ref;
326 pud_t *pud, *pud_ref;
327 pmd_t *pmd, *pmd_ref;
328 pte_t *pte, *pte_ref;
329
330 /* Copy kernel mappings over when needed. This can also
331 happen within a race in page table update. In the later
332 case just flush. */
333
334 pgd = pgd_offset(current->mm ?: &init_mm, address);
335 pgd_ref = pgd_offset_k(address);
336 if (pgd_none(*pgd_ref))
337 return -1;
338 if (pgd_none(*pgd))
339 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100340 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700341 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700342
343 /* Below here mismatches are bugs because these lower tables
344 are shared */
345
346 pud = pud_offset(pgd, address);
347 pud_ref = pud_offset(pgd_ref, address);
348 if (pud_none(*pud_ref))
349 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700350 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 BUG();
352 pmd = pmd_offset(pud, address);
353 pmd_ref = pmd_offset(pud_ref, address);
354 if (pmd_none(*pmd_ref))
355 return -1;
356 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
357 BUG();
358 pte_ref = pte_offset_kernel(pmd_ref, address);
359 if (!pte_present(*pte_ref))
360 return -1;
361 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700362 /* Don't use pte_page here, because the mappings can point
363 outside mem_map, and the NUMA hash lookup cannot handle
364 that. */
365 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367 return 0;
Harvey Harrisonfdfe8aa2008-01-30 13:33:13 +0100368#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369}
370
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200371int show_unhandled_signals = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700372
373/*
374 * This routine handles page faults. It determines the address,
375 * and the problem, and then passes it off to one of the appropriate
376 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700377 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700378asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
379 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380{
381 struct task_struct *tsk;
382 struct mm_struct *mm;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100383 struct vm_area_struct *vma;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700384 unsigned long address;
Nick Piggin83c54072007-07-19 01:47:05 -0700385 int write, fault;
Jan Beulich12091402005-09-12 18:49:24 +0200386 unsigned long flags;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100387 int si_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700388
Peter Zijlstra143a5d32007-10-25 14:01:10 +0200389 /*
390 * We can fault from pretty much anywhere, with unknown IRQ state.
391 */
392 trace_hardirqs_fixup();
393
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100394 tsk = current;
395 mm = tsk->mm;
396 prefetchw(&mm->mmap_sem);
397
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398 /* get the address */
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200399 address = read_cr2();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100401 si_code = SEGV_MAPERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402
Harvey Harrison608566b2008-01-30 13:33:12 +0100403 if (notify_page_fault(regs))
404 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405
406 /*
407 * We fault-in kernel-space virtual memory on-demand. The
408 * 'reference' page table is init_mm.pgd.
409 *
410 * NOTE! We MUST NOT take any locks for this case. We may
411 * be in an interrupt or a critical region, and should
412 * only copy the information from the master page table,
413 * nothing more.
414 *
415 * This verifies that the fault happens in kernel space
416 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100417 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418 */
Suresh Siddha84929802005-06-21 17:14:32 -0700419 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100420 /*
421 * Don't check for the module range here: its PML4
422 * is always initialized because it's shared with the main
423 * kernel text. Only vmalloc may need PML4 syncups.
424 */
Andi Kleen66c58152006-01-11 22:44:09 +0100425 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100426 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100427 if (vmalloc_fault(address) >= 0)
428 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429 }
430 /*
431 * Don't take the mm semaphore here. If we fixup a prefetch
432 * fault we could otherwise deadlock.
433 */
434 goto bad_area_nosemaphore;
435 }
436
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100437 if (likely(regs->flags & X86_EFLAGS_IF))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100438 local_irq_enable();
439
Andi Kleen66c58152006-01-11 22:44:09 +0100440 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441 pgtable_bad(address, regs, error_code);
442
443 /*
Harvey Harrison33cb5242008-01-30 13:32:19 +0100444 * If we're in an interrupt, have no user context or are running in an
445 * atomic region then we must not take the fault.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 */
447 if (unlikely(in_atomic() || !mm))
448 goto bad_area_nosemaphore;
449
Linus Torvaldsdbe3ed12007-09-19 11:37:14 -0700450 /*
451 * User-mode registers count as a user access even for any
452 * potential system fault or CPU buglet.
453 */
454 if (user_mode_vm(regs))
455 error_code |= PF_USER;
456
Linus Torvalds1da177e2005-04-16 15:20:36 -0700457 again:
458 /* When running in the kernel we expect faults to occur only to
459 * addresses in user space. All other faults represent errors in the
Simon Arlott676b1852007-10-20 01:25:36 +0200460 * kernel and should generate an OOPS. Unfortunately, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200461 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462 * we will deadlock attempting to validate the fault against the
463 * address space. Luckily the kernel only validly references user
464 * space from well defined areas of code, which are listed in the
465 * exceptions table.
466 *
467 * As the vast majority of faults will be valid we will only perform
Simon Arlott676b1852007-10-20 01:25:36 +0200468 * the source reference check when there is a possibility of a deadlock.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 * Attempt to lock the address space, if we cannot we then validate the
470 * source. If this is invalid we can skip the address space check,
471 * thus avoiding the deadlock.
472 */
473 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100474 if ((error_code & PF_USER) == 0 &&
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100475 !search_exception_tables(regs->ip))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700476 goto bad_area_nosemaphore;
477 down_read(&mm->mmap_sem);
478 }
479
480 vma = find_vma(mm, address);
481 if (!vma)
482 goto bad_area;
483 if (likely(vma->vm_start <= address))
484 goto good_area;
485 if (!(vma->vm_flags & VM_GROWSDOWN))
486 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100487 if (error_code & PF_USER) {
Harvey Harrison6f4d3682008-01-30 13:33:13 +0100488 /*
489 * Accessing the stack below %sp is always a bug.
490 * The large cushion allows instructions like enter
491 * and pusha to work. ("enter $65535,$31" pushes
492 * 32 pointers and then decrements %sp by 65535.)
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200493 */
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100494 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 goto bad_area;
496 }
497 if (expand_stack(vma, address))
498 goto bad_area;
499/*
500 * Ok, we have a good vm_area for this memory access, so
501 * we can handle it..
502 */
503good_area:
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100504 si_code = SEGV_ACCERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100506 switch (error_code & (PF_PROT|PF_WRITE)) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100507 default: /* 3: write, present */
508 /* fall through */
509 case PF_WRITE: /* write, not present */
510 if (!(vma->vm_flags & VM_WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700511 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100512 write++;
513 break;
514 case PF_PROT: /* read, present */
515 goto bad_area;
516 case 0: /* read, not present */
517 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
518 goto bad_area;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700519 }
520
521 /*
522 * If for any reason at all we couldn't handle the fault,
523 * make sure we exit gracefully rather than endlessly redo
524 * the fault.
525 */
Nick Piggin83c54072007-07-19 01:47:05 -0700526 fault = handle_mm_fault(mm, vma, address, write);
527 if (unlikely(fault & VM_FAULT_ERROR)) {
528 if (fault & VM_FAULT_OOM)
529 goto out_of_memory;
530 else if (fault & VM_FAULT_SIGBUS)
531 goto do_sigbus;
532 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700533 }
Nick Piggin83c54072007-07-19 01:47:05 -0700534 if (fault & VM_FAULT_MAJOR)
535 tsk->maj_flt++;
536 else
537 tsk->min_flt++;
Harvey Harrisond729ab32008-01-30 13:33:23 +0100538
539#ifdef CONFIG_X86_32
540 /*
541 * Did it hit the DOS screen memory VA from vm86 mode?
542 */
543 if (v8086_mode(regs)) {
544 unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
545 if (bit < 32)
546 tsk->thread.screen_bitmap |= 1 << bit;
547 }
548#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549 up_read(&mm->mmap_sem);
550 return;
551
552/*
553 * Something tried to access memory that isn't in our memory map..
554 * Fix it, but check if it's kernel or user first..
555 */
556bad_area:
557 up_read(&mm->mmap_sem);
558
559bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100561 if (error_code & PF_USER) {
Steven Rostedte5e3c842007-06-06 23:34:04 -0400562
563 /*
564 * It's possible to have interrupts off here.
565 */
566 local_irq_enable();
567
Linus Torvalds1da177e2005-04-16 15:20:36 -0700568 if (is_prefetch(regs, address, error_code))
569 return;
570
571 /* Work around K8 erratum #100 K8 in compat mode
572 occasionally jumps to illegal addresses >4GB. We
573 catch this here in the page fault handler because
574 these addresses are not reachable. Just detect this
575 case and return. Any code segment in LDT is
576 compatibility mode. */
577 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
578 (address >> 32))
579 return;
580
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200581 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
582 printk_ratelimit()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700583 printk(
Harvey Harrison6f4d3682008-01-30 13:33:13 +0100584#ifdef CONFIG_X86_32
Harvey Harrisonedcd8112008-01-30 13:33:16 +0100585 "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
Harvey Harrison6f4d3682008-01-30 13:33:13 +0100586#else
Andi Kleen03252912008-01-30 13:33:18 +0100587 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
Harvey Harrison6f4d3682008-01-30 13:33:13 +0100588#endif
589 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
590 tsk->comm, task_pid_nr(tsk), address, regs->ip,
591 regs->sp, error_code);
Andi Kleen03252912008-01-30 13:33:18 +0100592 print_vma_addr(" in ", regs->ip);
593 printk("\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594 }
Harvey Harrison33cb5242008-01-30 13:32:19 +0100595
Linus Torvalds1da177e2005-04-16 15:20:36 -0700596 tsk->thread.cr2 = address;
597 /* Kernel addresses are always protection faults */
598 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
599 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100600
601 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700602 return;
603 }
604
Harvey Harrison29caf2f2008-01-30 13:34:09 +0100605 if (is_f00f_bug(regs, address))
606 return;
607
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608no_context:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609 /* Are we prepared to handle this kernel fault? */
Harvey Harrison33cb5242008-01-30 13:32:19 +0100610 if (fixup_exception(regs))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612
Harvey Harrison33cb5242008-01-30 13:32:19 +0100613 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700614 * Hall of shame of CPU/BIOS bugs.
615 */
616
Harvey Harrison33cb5242008-01-30 13:32:19 +0100617 if (is_prefetch(regs, address, error_code))
618 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619
620 if (is_errata93(regs, address))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100621 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622
623/*
624 * Oops. The kernel tried to access some bad page. We'll have to
625 * terminate things with extreme prejudice.
626 */
627
Jan Beulich12091402005-09-12 18:49:24 +0200628 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629
630 if (address < PAGE_SIZE)
631 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
632 else
633 printk(KERN_ALERT "Unable to handle kernel paging request");
Harvey Harrison33cb5242008-01-30 13:32:19 +0100634 printk(" at %016lx RIP: \n" KERN_ALERT, address);
Harvey Harrison518edc92008-01-30 13:33:24 +0100635 printk_address(regs->ip, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100637 tsk->thread.cr2 = address;
638 tsk->thread.trap_no = 14;
639 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100640 if (__die("Oops", regs, error_code))
641 regs = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642 /* Executive summary in case the body of the oops scrolled away */
643 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich22f59912008-01-30 13:31:23 +0100644 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645
646/*
647 * We ran out of memory, or some other thing happened to us that made
648 * us unable to handle the page fault gracefully.
649 */
650out_of_memory:
651 up_read(&mm->mmap_sem);
Serge E. Hallynb460cbc2007-10-18 23:39:52 -0700652 if (is_global_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653 yield();
654 goto again;
655 }
656 printk("VM: killing process %s\n", tsk->comm);
Harvey Harrison318aa292008-01-30 13:32:59 +0100657 if (error_code & PF_USER)
Will Schmidt021daae2007-07-21 17:11:17 +0200658 do_group_exit(SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 goto no_context;
660
661do_sigbus:
662 up_read(&mm->mmap_sem);
663
664 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100665 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700666 goto no_context;
667
668 tsk->thread.cr2 = address;
669 tsk->thread.error_code = error_code;
670 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100671 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672 return;
673}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100674
Jan Beulich8c914cb2006-03-25 16:29:40 +0100675DEFINE_SPINLOCK(pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200676LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100677
678void vmalloc_sync_all(void)
679{
Harvey Harrison6f4d3682008-01-30 13:33:13 +0100680 /*
681 * Note that races in the updates of insync and start aren't
682 * problematic: insync can only get set bits added, and updates to
683 * start are only improving performance (without affecting correctness
684 * if undone).
685 */
Jan Beulich8c914cb2006-03-25 16:29:40 +0100686 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
687 static unsigned long start = VMALLOC_START & PGDIR_MASK;
688 unsigned long address;
689
690 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
691 if (!test_bit(pgd_index(address), insync)) {
692 const pgd_t *pgd_ref = pgd_offset_k(address);
693 struct page *page;
694
695 if (pgd_none(*pgd_ref))
696 continue;
697 spin_lock(&pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200698 list_for_each_entry(page, &pgd_list, lru) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100699 pgd_t *pgd;
700 pgd = (pgd_t *)page_address(page) + pgd_index(address);
701 if (pgd_none(*pgd))
702 set_pgd(pgd, *pgd_ref);
703 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700704 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100705 }
706 spin_unlock(&pgd_lock);
707 set_bit(pgd_index(address), insync);
708 }
709 if (address == start)
710 start = address + PGDIR_SIZE;
711 }
712 /* Check that there is no need to do the same for the modules area. */
713 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100714 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
Jan Beulich8c914cb2006-03-25 16:29:40 +0100715 (__START_KERNEL & PGDIR_MASK)));
716}