/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
Randy Dunlapa9415642006-01-11 12:17:48 -080018#include <linux/capability.h>
Andi Kleen91c6d402005-07-28 21:15:39 -070019#include <linux/cpu.h>
20#include <linux/percpu.h>
Andi Kleen8c566ef2005-09-12 18:49:24 +020021#include <linux/ctype.h>
Andi Kleena98f0dd2007-02-13 13:26:23 +010022#include <linux/kmod.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070023#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include <asm/processor.h>
25#include <asm/msr.h>
26#include <asm/mce.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include <asm/uaccess.h>
Andi Kleen0a9c3ee2006-01-11 22:46:54 +010028#include <asm/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
30#define MISC_MCELOG_MINOR 227
Shaohua Li73ca5352006-01-11 22:43:06 +010031#define NR_BANKS 6
Linus Torvalds1da177e2005-04-16 15:20:36 -070032
Andi Kleen553f2652006-04-07 19:49:57 +020033atomic_t mce_entry;
34
Linus Torvalds1da177e2005-04-16 15:20:36 -070035static int mce_dont_init;
36
37/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
38 3: never panic or exit (for testing only) */
39static int tolerant = 1;
40static int banks;
41static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
42static unsigned long console_logged;
43static int notify_user;
Andi Kleen94ad8472005-04-16 15:25:09 -070044static int rip_msr;
Andi Kleene5835382005-11-05 17:25:54 +010045static int mce_bootlog = 1;
Andi Kleena98f0dd2007-02-13 13:26:23 +010046static atomic_t mce_events;
47
48static char trigger[128];
49static char *trigger_argv[2] = { trigger, NULL };
Linus Torvalds1da177e2005-04-16 15:20:36 -070050
51/*
52 * Lockless MCE logging infrastructure.
53 * This avoids deadlocks on printk locks without having to break locks. Also
54 * separate MCEs from kernel messages to avoid bogus bug reports.
55 */
56
57struct mce_log mcelog = {
58 MCE_LOG_SIGNATURE,
59 MCE_LOG_LEN,
60};
61
62void mce_log(struct mce *mce)
63{
64 unsigned next, entry;
Andi Kleena98f0dd2007-02-13 13:26:23 +010065 atomic_inc(&mce_events);
Linus Torvalds1da177e2005-04-16 15:20:36 -070066 mce->finished = 0;
Mike Waychison76441432005-09-30 00:01:27 +020067 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070068 for (;;) {
69 entry = rcu_dereference(mcelog.next);
Mike Waychison76441432005-09-30 00:01:27 +020070 /* The rmb forces the compiler to reload next in each
71 iteration */
72 rmb();
Andi Kleen673242c2005-09-12 18:49:24 +020073 for (;;) {
74 /* When the buffer fills up discard new entries. Assume
75 that the earlier errors are the more interesting. */
76 if (entry >= MCE_LOG_LEN) {
77 set_bit(MCE_OVERFLOW, &mcelog.flags);
78 return;
79 }
80 /* Old left over entry. Skip. */
81 if (mcelog.entry[entry].finished) {
82 entry++;
83 continue;
84 }
Mike Waychison76441432005-09-30 00:01:27 +020085 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -070086 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070087 smp_rmb();
88 next = entry + 1;
89 if (cmpxchg(&mcelog.next, entry, next) == entry)
90 break;
91 }
92 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
Mike Waychison76441432005-09-30 00:01:27 +020093 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070094 mcelog.entry[entry].finished = 1;
Mike Waychison76441432005-09-30 00:01:27 +020095 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070096
97 if (!test_and_set_bit(0, &console_logged))
98 notify_user = 1;
99}
100
101static void print_mce(struct mce *m)
102{
103 printk(KERN_EMERG "\n"
Andi Kleen48551702006-01-11 22:44:48 +0100104 KERN_EMERG "HARDWARE ERROR\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105 KERN_EMERG
106 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
107 m->cpu, m->mcgstatus, m->bank, m->status);
108 if (m->rip) {
109 printk(KERN_EMERG
110 "RIP%s %02x:<%016Lx> ",
111 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
112 m->cs, m->rip);
113 if (m->cs == __KERNEL_CS)
114 print_symbol("{%s}", m->rip);
115 printk("\n");
116 }
117 printk(KERN_EMERG "TSC %Lx ", m->tsc);
118 if (m->addr)
119 printk("ADDR %Lx ", m->addr);
120 if (m->misc)
121 printk("MISC %Lx ", m->misc);
122 printk("\n");
Andi Kleen48551702006-01-11 22:44:48 +0100123 printk(KERN_EMERG "This is not a software problem!\n");
124 printk(KERN_EMERG
125 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126}
127
128static void mce_panic(char *msg, struct mce *backup, unsigned long start)
129{
130 int i;
131 oops_begin();
132 for (i = 0; i < MCE_LOG_LEN; i++) {
133 unsigned long tsc = mcelog.entry[i].tsc;
134 if (time_before(tsc, start))
135 continue;
136 print_mce(&mcelog.entry[i]);
137 if (backup && mcelog.entry[i].tsc == backup->tsc)
138 backup = NULL;
139 }
140 if (backup)
141 print_mce(backup);
142 if (tolerant >= 3)
143 printk("Fake panic: %s\n", msg);
144 else
145 panic(msg);
146}
147
148static int mce_available(struct cpuinfo_x86 *c)
149{
Akinobu Mita3d1712c2006-03-24 03:15:11 -0800150 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151}
152
Andi Kleen94ad8472005-04-16 15:25:09 -0700153static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
154{
155 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
156 m->rip = regs->rip;
157 m->cs = regs->cs;
158 } else {
159 m->rip = 0;
160 m->cs = 0;
161 }
162 if (rip_msr) {
163 /* Assume the RIP in the MSR is exact. Is this true? */
164 m->mcgstatus |= MCG_STATUS_EIPV;
165 rdmsrl(rip_msr, m->rip);
166 m->cs = 0;
167 }
168}
169
Andi Kleena98f0dd2007-02-13 13:26:23 +0100170static void do_mce_trigger(void)
171{
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
Jeremy Fitzhardinge86313c42007-07-17 18:37:03 -0700177 call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
Andi Kleena98f0dd2007-02-13 13:26:23 +0100178 }
179}
180
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181/*
182 * The actual machine check handler
183 */
184
185void do_machine_check(struct pt_regs * regs, long error_code)
186{
187 struct mce m, panicm;
188 int nowayout = (tolerant < 1);
189 int kill_it = 0;
190 u64 mcestart = 0;
191 int i;
192 int panicm_found = 0;
193
Andi Kleen553f2652006-04-07 19:49:57 +0200194 atomic_inc(&mce_entry);
195
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 if (regs)
Jan Beulich6e3f3612006-01-11 22:42:14 +0100197 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198 if (!banks)
Andi Kleen553f2652006-04-07 19:49:57 +0200199 goto out2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200
201 memset(&m, 0, sizeof(struct mce));
Andi Kleen151f8cc2006-09-26 10:52:37 +0200202 m.cpu = smp_processor_id();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
204 if (!(m.mcgstatus & MCG_STATUS_RIPV))
205 kill_it = 1;
206
207 rdtscll(mcestart);
208 barrier();
209
210 for (i = 0; i < banks; i++) {
211 if (!bank[i])
212 continue;
213
214 m.misc = 0;
215 m.addr = 0;
216 m.bank = i;
217 m.tsc = 0;
218
219 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
220 if ((m.status & MCI_STATUS_VAL) == 0)
221 continue;
222
223 if (m.status & MCI_STATUS_EN) {
224 /* In theory _OVER could be a nowayout too, but
225 assume any overflowed errors were no fatal. */
226 nowayout |= !!(m.status & MCI_STATUS_PCC);
227 kill_it |= !!(m.status & MCI_STATUS_UC);
228 }
229
230 if (m.status & MCI_STATUS_MISCV)
231 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
232 if (m.status & MCI_STATUS_ADDRV)
233 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
234
Andi Kleen94ad8472005-04-16 15:25:09 -0700235 mce_get_rip(&m, regs);
Andi Kleend5172f22005-08-07 09:42:07 -0700236 if (error_code >= 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 rdtscll(m.tsc);
238 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
Andi Kleend5172f22005-08-07 09:42:07 -0700239 if (error_code != -2)
240 mce_log(&m);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241
242 /* Did this bank cause the exception? */
243 /* Assume that the bank with uncorrectable errors did it,
244 and that there is only a single one. */
245 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
246 panicm = m;
247 panicm_found = 1;
248 }
249
Randy Dunlap9f158332005-09-13 01:25:16 -0700250 add_taint(TAINT_MACHINE_CHECK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251 }
252
253 /* Never do anything final in the polling timer */
Andi Kleena98f0dd2007-02-13 13:26:23 +0100254 if (!regs) {
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258 goto out;
Andi Kleena98f0dd2007-02-13 13:26:23 +0100259 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
261 /* If we didn't find an uncorrectable error, pick
262 the last one (shouldn't happen, just being safe). */
263 if (!panicm_found)
264 panicm = m;
265 if (nowayout)
266 mce_panic("Machine check", &panicm, mcestart);
267 if (kill_it) {
268 int user_space = 0;
269
270 if (m.mcgstatus & MCG_STATUS_RIPV)
271 user_space = panicm.rip && (panicm.cs & 3);
272
273 /* When the machine was in user space and the CPU didn't get
274 confused it's normally not necessary to panic, unless you
275 are paranoid (tolerant == 0)
276
277 RED-PEN could be more tolerant for MCEs in idle,
278 but most likely they occur at boot anyways, where
279 it is best to just halt the machine. */
280 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
281 (unsigned)current->pid <= 1)
282 mce_panic("Uncorrected machine check", &panicm, mcestart);
283
284 /* do_exit takes an awful lot of locks and has as
285 slight risk of deadlocking. If you don't want that
286 don't set tolerant >= 2 */
287 if (tolerant < 3)
288 do_exit(SIGBUS);
289 }
290
291 out:
292 /* Last thing done in the machine check exception to clear state. */
293 wrmsrl(MSR_IA32_MCG_STATUS, 0);
Andi Kleen553f2652006-04-07 19:49:57 +0200294 out2:
295 atomic_dec(&mce_entry);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296}
297
Dmitriy Zavin15d5f832006-09-26 10:52:42 +0200298#ifdef CONFIG_X86_MCE_INTEL
299/***
300 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
301 * @cpu: The CPU on which the event occured.
302 * @status: Event status information
303 *
304 * This function should be called by the thermal interrupt after the
305 * event has been processed and the decision was made to log the event
306 * further.
307 *
308 * The status parameter will be saved to the 'status' field of 'struct mce'
309 * and historically has been the register value of the
310 * MSR_IA32_THERMAL_STATUS (Intel) msr.
311 */
312void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
313{
314 struct mce m;
315
316 memset(&m, 0, sizeof(m));
317 m.cpu = cpu;
318 m.bank = MCE_THERMAL_BANK;
319 m.status = status;
320 rdtscll(m.tsc);
321 mce_log(&m);
322}
323#endif /* CONFIG_X86_MCE_INTEL */
324
/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

/* Longest polling period, in seconds (5 minutes). */
static int check_interval = 5 * 60;
/* Delay until the next poll, in jiffies. */
static int next_interval;

static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
336static void mcheck_check_cpu(void *info)
337{
338 if (mce_available(&current_cpu_data))
339 do_machine_check(NULL, 0);
340}
341
David Howells65f27f32006-11-22 14:55:48 +0000342static void mcheck_timer(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343{
344 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700345
346 /*
347 * It's ok to read stale data here for notify_user and
348 * console_logged as we'll simply get the updated versions
349 * on the next mcheck_timer execution and atomic operations
350 * on console_logged act as synchronization for notify_user
351 * writes.
352 */
353 if (notify_user && console_logged) {
Tim Hockin8a336b02007-05-02 19:27:19 +0200354 static unsigned long last_print;
355 unsigned long now = jiffies;
356
357 /* if we logged an MCE, reduce the polling interval */
358 next_interval = max(next_interval/2, HZ/100);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 notify_user = 0;
360 clear_bit(0, &console_logged);
Tim Hockin8a336b02007-05-02 19:27:19 +0200361 if (time_after_eq(now, last_print + (check_interval*HZ))) {
362 last_print = now;
363 printk(KERN_INFO "Machine check events logged\n");
364 }
365 } else {
366 next_interval = min(next_interval*2, check_interval*HZ);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367 }
Tim Hockin8a336b02007-05-02 19:27:19 +0200368
369 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370}
371
372
373static __init int periodic_mcheck_init(void)
374{
Tim Hockin8a336b02007-05-02 19:27:19 +0200375 next_interval = check_interval * HZ;
376 if (next_interval)
377 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378 return 0;
379}
380__initcall(periodic_mcheck_init);
381
382
383/*
384 * Initialize Machine Checks for a CPU.
385 */
386static void mce_init(void *dummy)
387{
388 u64 cap;
389 int i;
390
391 rdmsrl(MSR_IA32_MCG_CAP, cap);
392 banks = cap & 0xff;
393 if (banks > NR_BANKS) {
394 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
395 banks = NR_BANKS;
396 }
Andi Kleen94ad8472005-04-16 15:25:09 -0700397 /* Use accurate RIP reporting if available. */
398 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
399 rip_msr = MSR_IA32_MCG_EIP;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400
401 /* Log the machine checks left over from the previous reset.
402 This also clears all registers */
Andi Kleend5172f22005-08-07 09:42:07 -0700403 do_machine_check(NULL, mce_bootlog ? -1 : -2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404
405 set_in_cr4(X86_CR4_MCE);
406
407 if (cap & MCG_CTL_P)
408 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
409
410 for (i = 0; i < banks; i++) {
411 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
412 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
413 }
414}
415
416/* Add per CPU specific workarounds here */
Ashok Raje6982c62005-06-25 14:54:58 -0700417static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418{
419 /* This should be disabled by the BIOS, but isn't always */
420 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
421 /* disable GART TBL walk error reporting, which trips off
422 incorrectly with the IOMMU & 3ware & Cerberus. */
423 clear_bit(10, &bank[4]);
Andi Kleene5835382005-11-05 17:25:54 +0100424 /* Lots of broken BIOS around that don't clear them
425 by default and leave crap in there. Don't log. */
426 mce_bootlog = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427 }
Andi Kleene5835382005-11-05 17:25:54 +0100428
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429}
430
Ashok Raje6982c62005-06-25 14:54:58 -0700431static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432{
433 switch (c->x86_vendor) {
434 case X86_VENDOR_INTEL:
435 mce_intel_feature_init(c);
436 break;
Jacob Shin89b831e2005-11-05 17:25:53 +0100437 case X86_VENDOR_AMD:
438 mce_amd_feature_init(c);
439 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 default:
441 break;
442 }
443}
444
445/*
446 * Called for each booted CPU to set up machine checks.
447 * Must be called with preempt off.
448 */
Ashok Raje6982c62005-06-25 14:54:58 -0700449void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450{
Ashok Raj7ded5682006-02-03 21:51:23 +0100451 static cpumask_t mce_cpus = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452
453 mce_cpu_quirks(c);
454
455 if (mce_dont_init ||
456 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
457 !mce_available(c))
458 return;
459
460 mce_init(NULL);
461 mce_cpu_features(c);
462}
463
/*
 * Character device to read and clear the MCE log.
 */

/* Protects open_count and open_exclu. */
static DEFINE_SPINLOCK(mce_state_lock);
/* Number of times the device is currently open. */
static int open_count;
/* Non-zero while the device is held open with O_EXCL. */
static int open_exclu;
471
472static int mce_open(struct inode *inode, struct file *file)
473{
474 spin_lock(&mce_state_lock);
475
476 if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
477 spin_unlock(&mce_state_lock);
478 return -EBUSY;
479 }
480
481 if (file->f_flags & O_EXCL)
482 open_exclu = 1;
483 open_count++;
484
485 spin_unlock(&mce_state_lock);
486
487 return 0;
488}
489
490static int mce_release(struct inode *inode, struct file *file)
491{
492 spin_lock(&mce_state_lock);
493
494 open_count--;
495 open_exclu = 0;
496
497 spin_unlock(&mce_state_lock);
498
499 return 0;
500}
501
/*
 * Per-CPU callback: store the current TSC in the slot of the executing
 * CPU. @data points to an array of unsigned long indexed by CPU number.
 */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
507
508static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
509{
Andi Kleenf0de53b2005-04-16 15:25:10 -0700510 unsigned long *cpu_tsc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700511 static DECLARE_MUTEX(mce_read_sem);
512 unsigned next;
513 char __user *buf = ubuf;
514 int i, err;
515
Andi Kleenf0de53b2005-04-16 15:25:10 -0700516 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
517 if (!cpu_tsc)
518 return -ENOMEM;
519
Linus Torvalds1da177e2005-04-16 15:20:36 -0700520 down(&mce_read_sem);
521 next = rcu_dereference(mcelog.next);
522
523 /* Only supports full reads right now */
524 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
525 up(&mce_read_sem);
Andi Kleenf0de53b2005-04-16 15:25:10 -0700526 kfree(cpu_tsc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700527 return -EINVAL;
528 }
529
530 err = 0;
Andi Kleen673242c2005-09-12 18:49:24 +0200531 for (i = 0; i < next; i++) {
532 unsigned long start = jiffies;
533 while (!mcelog.entry[i].finished) {
Joshua Wise4f84e4b2007-06-23 17:16:45 -0700534 if (time_after_eq(jiffies, start + 2)) {
Andi Kleen673242c2005-09-12 18:49:24 +0200535 memset(mcelog.entry + i,0, sizeof(struct mce));
Joshua Wise4f84e4b2007-06-23 17:16:45 -0700536 goto timeout;
Andi Kleen673242c2005-09-12 18:49:24 +0200537 }
538 cpu_relax();
539 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540 smp_rmb();
541 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
542 buf += sizeof(struct mce);
Joshua Wise4f84e4b2007-06-23 17:16:45 -0700543 timeout:
544 ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700545 }
546
547 memset(mcelog.entry, 0, next * sizeof(struct mce));
548 mcelog.next = 0;
549
Paul E. McKenneyb2b18662005-06-25 14:55:38 -0700550 synchronize_sched();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700551
552 /* Collect entries that were still getting written before the synchronize. */
553
554 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
555 for (i = next; i < MCE_LOG_LEN; i++) {
556 if (mcelog.entry[i].finished &&
557 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
558 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
559 smp_rmb();
560 buf += sizeof(struct mce);
561 memset(&mcelog.entry[i], 0, sizeof(struct mce));
562 }
563 }
564 up(&mce_read_sem);
Andi Kleenf0de53b2005-04-16 15:25:10 -0700565 kfree(cpu_tsc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700566 return err ? -EFAULT : buf - ubuf;
567}
568
569static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
570{
571 int __user *p = (int __user *)arg;
572 if (!capable(CAP_SYS_ADMIN))
573 return -EPERM;
574 switch (cmd) {
575 case MCE_GET_RECORD_LEN:
576 return put_user(sizeof(struct mce), p);
577 case MCE_GET_LOG_LEN:
578 return put_user(MCE_LOG_LEN, p);
579 case MCE_GETCLEAR_FLAGS: {
580 unsigned flags;
581 do {
582 flags = mcelog.flags;
583 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
584 return put_user(flags, p);
585 }
586 default:
587 return -ENOTTY;
588 }
589}
590
Arjan van de Ven5dfe4c92007-02-12 00:55:31 -0800591static const struct file_operations mce_chrdev_ops = {
Tim Hockinf528e7b2007-07-21 17:10:35 +0200592 .open = mce_open,
593 .release = mce_release,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594 .read = mce_read,
595 .ioctl = mce_ioctl,
596};
597
598static struct miscdevice mce_log_device = {
599 MISC_MCELOG_MINOR,
600 "mcelog",
601 &mce_chrdev_ops,
602};
603
604/*
605 * Old style boot options parsing. Only for compatibility.
606 */
607
608static int __init mcheck_disable(char *str)
609{
610 mce_dont_init = 1;
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800611 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612}
613
614/* mce=off disables machine check. Note you can reenable it later
Andi Kleend5172f22005-08-07 09:42:07 -0700615 using sysfs.
Andi Kleen8c566ef2005-09-12 18:49:24 +0200616 mce=TOLERANCELEVEL (number, see above)
Andi Kleene5835382005-11-05 17:25:54 +0100617 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
618 mce=nobootlog Don't log MCEs from before booting. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619static int __init mcheck_enable(char *str)
620{
Andi Kleend5172f22005-08-07 09:42:07 -0700621 if (*str == '=')
622 str++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 if (!strcmp(str, "off"))
624 mce_dont_init = 1;
Andi Kleene5835382005-11-05 17:25:54 +0100625 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
626 mce_bootlog = str[0] == 'b';
Andi Kleen8c566ef2005-09-12 18:49:24 +0200627 else if (isdigit(str[0]))
628 get_option(&str, &tolerant);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629 else
630 printk("mce= argument %s ignored. Please use /sys", str);
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800631 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632}
633
634__setup("nomce", mcheck_disable);
635__setup("mce", mcheck_enable);
636
637/*
638 * Sysfs support
639 */
640
Andi Kleen413588c2005-09-12 18:49:24 +0200641/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
642 Only one CPU is active at this time, the others get readded later using
643 CPU hotplug. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700644static int mce_resume(struct sys_device *dev)
645{
Andi Kleen413588c2005-09-12 18:49:24 +0200646 mce_init(NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 return 0;
648}
649
650/* Reinit MCEs after user configuration changes */
651static void mce_restart(void)
652{
Tim Hockin8a336b02007-05-02 19:27:19 +0200653 if (next_interval)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700654 cancel_delayed_work(&mcheck_work);
655 /* Timer race is harmless here */
656 on_each_cpu(mce_init, NULL, 1, 1);
Tim Hockin8a336b02007-05-02 19:27:19 +0200657 next_interval = check_interval * HZ;
658 if (next_interval)
659 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700660}
661
662static struct sysdev_class mce_sysclass = {
663 .resume = mce_resume,
664 set_kset_name("machinecheck"),
665};
666
Jacob Shinfff2e892006-06-26 13:58:50 +0200667DEFINE_PER_CPU(struct sys_device, device_mce);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700668
/*
 * Why are there no generic functions for this?
 *
 * ACCESSOR(name, var, start) generates show_<name>() and set_<name>()
 * plus the sysdev attribute attr_<name> (mode 0644) for the variable
 * @var. The show side prints @var in hex; the set side parses a number
 * with simple_strtoul() (base auto-detected), returns -EINVAL when no
 * digits were consumed, stores the value and then runs the @start
 * statement.
 */
#define ACCESSOR(name, var, start)					\
	static ssize_t show_ ## name(struct sys_device *s, char *buf)	\
	{								\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    const char *buf, size_t siz)	\
	{								\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf)						\
			return -EINVAL;					\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
683
Andi Kleena98f0dd2007-02-13 13:26:23 +0100684/* TBD should generate these dynamically based on number of available banks */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700685ACCESSOR(bank0ctl,bank[0],mce_restart())
686ACCESSOR(bank1ctl,bank[1],mce_restart())
687ACCESSOR(bank2ctl,bank[2],mce_restart())
688ACCESSOR(bank3ctl,bank[3],mce_restart())
689ACCESSOR(bank4ctl,bank[4],mce_restart())
Shaohua Li73ca5352006-01-11 22:43:06 +0100690ACCESSOR(bank5ctl,bank[5],mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100691
692static ssize_t show_trigger(struct sys_device *s, char *buf)
693{
694 strcpy(buf, trigger);
695 strcat(buf, "\n");
696 return strlen(trigger) + 1;
697}
698
699static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
700{
701 char *p;
702 int len;
703 strncpy(trigger, buf, sizeof(trigger));
704 trigger[sizeof(trigger)-1] = 0;
705 len = strlen(trigger);
706 p = strchr(trigger, '\n');
707 if (*p) *p = 0;
708 return len;
709}
710
711static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700712ACCESSOR(tolerant,tolerant,)
713ACCESSOR(check_interval,check_interval,mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100714static struct sysdev_attribute *mce_attributes[] = {
715 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
716 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
717 &attr_tolerant, &attr_check_interval, &attr_trigger,
718 NULL
719};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720
Andi Kleen91c6d402005-07-28 21:15:39 -0700721/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
722static __cpuinit int mce_create_device(unsigned int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723{
724 int err;
Shaohua Li73ca5352006-01-11 22:43:06 +0100725 int i;
Andi Kleen91c6d402005-07-28 21:15:39 -0700726 if (!mce_available(&cpu_data[cpu]))
727 return -EIO;
728
729 per_cpu(device_mce,cpu).id = cpu;
730 per_cpu(device_mce,cpu).cls = &mce_sysclass;
731
732 err = sysdev_register(&per_cpu(device_mce,cpu));
733
734 if (!err) {
Andi Kleena98f0dd2007-02-13 13:26:23 +0100735 for (i = 0; mce_attributes[i]; i++)
Shaohua Li73ca5352006-01-11 22:43:06 +0100736 sysdev_create_file(&per_cpu(device_mce,cpu),
Andi Kleena98f0dd2007-02-13 13:26:23 +0100737 mce_attributes[i]);
Andi Kleen91c6d402005-07-28 21:15:39 -0700738 }
739 return err;
740}
741
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700742static void mce_remove_device(unsigned int cpu)
Andi Kleen91c6d402005-07-28 21:15:39 -0700743{
Shaohua Li73ca5352006-01-11 22:43:06 +0100744 int i;
745
Andi Kleena98f0dd2007-02-13 13:26:23 +0100746 for (i = 0; mce_attributes[i]; i++)
Shaohua Li73ca5352006-01-11 22:43:06 +0100747 sysdev_remove_file(&per_cpu(device_mce,cpu),
Andi Kleena98f0dd2007-02-13 13:26:23 +0100748 mce_attributes[i]);
Andi Kleen91c6d402005-07-28 21:15:39 -0700749 sysdev_unregister(&per_cpu(device_mce,cpu));
Rafael J. Wysockid4c45712006-12-07 02:14:12 +0100750 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
Andi Kleen91c6d402005-07-28 21:15:39 -0700751}
Andi Kleen91c6d402005-07-28 21:15:39 -0700752
753/* Get notified when a cpu comes on/off. Be hotplug friendly. */
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700754static int
Andi Kleen91c6d402005-07-28 21:15:39 -0700755mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
756{
757 unsigned int cpu = (unsigned long)hcpu;
758
759 switch (action) {
760 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700761 case CPU_ONLINE_FROZEN:
Andi Kleen91c6d402005-07-28 21:15:39 -0700762 mce_create_device(cpu);
763 break;
Andi Kleen91c6d402005-07-28 21:15:39 -0700764 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700765 case CPU_DEAD_FROZEN:
Andi Kleen91c6d402005-07-28 21:15:39 -0700766 mce_remove_device(cpu);
767 break;
Andi Kleen91c6d402005-07-28 21:15:39 -0700768 }
769 return NOTIFY_OK;
770}
771
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700772static struct notifier_block mce_cpu_notifier = {
Andi Kleen91c6d402005-07-28 21:15:39 -0700773 .notifier_call = mce_cpu_callback,
774};
775
776static __init int mce_init_device(void)
777{
778 int err;
779 int i = 0;
780
Linus Torvalds1da177e2005-04-16 15:20:36 -0700781 if (!mce_available(&boot_cpu_data))
782 return -EIO;
783 err = sysdev_class_register(&mce_sysclass);
Andi Kleen91c6d402005-07-28 21:15:39 -0700784
785 for_each_online_cpu(i) {
786 mce_create_device(i);
787 }
788
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700789 register_hotcpu_notifier(&mce_cpu_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790 misc_register(&mce_log_device);
791 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700792}
Andi Kleen91c6d402005-07-28 21:15:39 -0700793
Linus Torvalds1da177e2005-04-16 15:20:36 -0700794device_initcall(mce_init_device);