blob: aa1d1599179433b1e116cd834dfebebe2ab3c67f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
Randy Dunlapa9415642006-01-11 12:17:48 -080018#include <linux/capability.h>
Andi Kleen91c6d402005-07-28 21:15:39 -070019#include <linux/cpu.h>
20#include <linux/percpu.h>
Andi Kleen8c566ef2005-09-12 18:49:24 +020021#include <linux/ctype.h>
Andi Kleena98f0dd2007-02-13 13:26:23 +010022#include <linux/kmod.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070023#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include <asm/processor.h>
25#include <asm/msr.h>
26#include <asm/mce.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include <asm/uaccess.h>
Andi Kleen0a9c3ee2006-01-11 22:46:54 +010028#include <asm/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
#define MISC_MCELOG_MINOR 227	/* fixed minor for /dev/mcelog */
#define NR_BANKS 6		/* max MCA banks this driver manages */

/* Nesting counter; nonzero while any CPU is inside do_machine_check(). */
atomic_t mce_entry;

/* Set by "nomce"/"mce=off"; prevents per-CPU MCE init in mcheck_init(). */
static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;			/* number of banks reported by MCG_CAP (clamped to NR_BANKS) */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };	/* per-bank MCi_CTL values; all events enabled by default */
static unsigned long console_logged;	/* bit 0 set when a new event awaits the poll timer */
static int notify_user;			/* poll timer should print "events logged" */
static int rip_msr;			/* MSR holding the precise RIP, if the CPU has one */
static int mce_bootlog = 1;		/* log MCEs left over from before boot (mce=bootlog) */
static atomic_t mce_events;		/* total events logged; consumed by do_mce_trigger() */

/* Usermode helper program invoked on new events, set via sysfs "trigger". */
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
61
/*
 * Append one machine check record to the global mcelog ring.
 * Lockless: safe to call from NMI/exception context.  A slot is claimed
 * by advancing mcelog.next with cmpxchg; the record only becomes visible
 * to readers once its ->finished flag is set after the copy.
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	/* Mark the record in-flight before it is published. */
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry from the top if another CPU won. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	/* Publish: data must be globally visible before finished is set. */
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	/* Wake the poll timer's printk path exactly once per batch. */
	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}
100
/* Dump one MCE record to the console at KERN_EMERG level. */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		/* Flag the RIP as inexact unless the CPU says EIPV is valid. */
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		/* Only kernel addresses can be resolved to symbols here. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
127
/*
 * Panic after an uncorrectable machine check.  First replays every log
 * entry whose TSC is at or after @start (this event's records), then the
 * @backup record itself unless it was already seen in the log.  With
 * tolerant >= 3 only a fake panic message is printed (test mode).
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		/* Skip records from before this machine check started. */
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		/* Avoid printing the backup record twice. */
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}
147
148static int mce_available(struct cpuinfo_x86 *c)
149{
Akinobu Mita3d1712c2006-03-24 03:15:11 -0800150 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151}
152
/*
 * Fill in m->rip/m->cs for a machine check.  Uses the trapping pt_regs
 * when MCG_STATUS says the saved RIP is valid; otherwise leaves them 0.
 * If the CPU provides a dedicated RIP MSR (rip_msr set in mce_init),
 * that value overrides the pt_regs one.
 */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}
169
/*
 * Run the user-configured trigger program if any events were logged
 * since the last invocation.  Called from normal interrupt context only
 * (the polling path), never from the MCE exception itself.
 */
static void do_mce_trigger(void)
{
	static atomic_t mce_logged;	/* event count at last trigger run */
	int events = atomic_read(&mce_events);
	if (events != atomic_read(&mce_logged) && trigger[0]) {
		/* Small race window, but should be harmless. */
		atomic_set(&mce_logged, events);
		call_usermodehelper(trigger, trigger_argv, NULL, -1);
	}
}
180
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181/*
182 * The actual machine check handler
183 */
184
/*
 * The actual machine check handler.
 *
 * Called from the #MC exception (regs != NULL) and from the polling
 * timer (regs == NULL, via mcheck_check_cpu).  error_code doubles as a
 * mode flag for the boot-time scan done by mce_init():
 *   >= 0 : real exception/poll; timestamp each record with the TSC
 *   -1   : boot-leftover scan, log records but without a TSC
 *   -2   : boot-leftover scan with logging suppressed (mce=nobootlog)
 * Reads and clears every enabled bank's status MSRs; panics or kills
 * the current task depending on severity and the "tolerant" setting.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);	/* tolerant==0: always panic */
	int kill_it = 0;		/* deliver SIGBUS / exit at the end */
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	atomic_inc(&mce_entry);

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* If the saved RIP is invalid we cannot safely return to the task. */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/* Timestamp before scanning, so mce_panic can find this event's records. */
	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		/* Skip banks administratively disabled via sysfs. */
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were no fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		/* Ack the bank so the hardware can report new errors. */
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs) {
		/* Normal interrupt context here. Call trigger for any new
		   events. */
		do_mce_trigger();
		goto out;
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has as
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
297
Dmitriy Zavin15d5f832006-09-26 10:52:42 +0200298#ifdef CONFIG_X86_MCE_INTEL
299/***
300 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
301 * @cpu: The CPU on which the event occured.
302 * @status: Event status information
303 *
304 * This function should be called by the thermal interrupt after the
305 * event has been processed and the decision was made to log the event
306 * further.
307 *
308 * The status parameter will be saved to the 'status' field of 'struct mce'
309 * and historically has been the register value of the
310 * MSR_IA32_THERMAL_STATUS (Intel) msr.
311 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	struct mce m;

	/* Synthesize a record in the pseudo "thermal bank" and log it. */
	memset(&m, 0, sizeof(m));
	m.cpu = cpu;
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	/* Timestamp so mcelog readers can order it against real MCEs. */
	rdtscll(m.tsc);
	mce_log(&m);
}
323#endif /* CONFIG_X86_MCE_INTEL */
324
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325/*
Tim Hockin8a336b02007-05-02 19:27:19 +0200326 * Periodic polling timer for "silent" machine check errors. If the
327 * poller finds an MCE, poll 2x faster. When the poller finds no more
328 * errors, poll 2x slower (up to check_interval seconds).
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 */
330
static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
/* Forward declaration: the work function reschedules mcheck_work itself. */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
336static void mcheck_check_cpu(void *info)
337{
338 if (mce_available(&current_cpu_data))
339 do_machine_check(NULL, 0);
340}
341
/*
 * Periodic poll work: scan all CPUs, report any new events once, and
 * adapt the polling interval — halve it (down to HZ/100) after finding
 * events, double it (up to check_interval seconds) when idle.
 */
static void mcheck_timer(struct work_struct *work)
{
	on_each_cpu(mcheck_check_cpu, NULL, 1, 1);

	/*
	 * It's ok to read stale data here for notify_user and
	 * console_logged as we'll simply get the updated versions
	 * on the next mcheck_timer execution and atomic operations
	 * on console_logged act as synchronization for notify_user
	 * writes.
	 */
	if (notify_user && console_logged) {
		static unsigned long last_print;	/* rate-limits the printk below */
		unsigned long now = jiffies;

		/* if we logged an MCE, reduce the polling interval */
		next_interval = max(next_interval/2, HZ/100);
		notify_user = 0;
		clear_bit(0, &console_logged);
		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}
	} else {
		next_interval = min(next_interval*2, check_interval*HZ);
	}

	schedule_delayed_work(&mcheck_work, next_interval);
}
371
372
/* Kick off the periodic poller at boot; a zero check_interval disables it. */
static __init int periodic_mcheck_init(void)
{
	next_interval = check_interval * HZ;
	if (next_interval)
		schedule_delayed_work(&mcheck_work, next_interval);
	return 0;
}
__initcall(periodic_mcheck_init);
381
382
383/*
384 * Initialize Machine Checks for a CPU.
385 */
/*
 * Initialize Machine Checks for a CPU: read MCG_CAP to size the bank
 * array, drain/clear leftover errors, enable MCE in CR4, and program
 * the per-bank control MSRs.  Runs on each CPU (also via on_each_cpu
 * from mce_restart).  @dummy is unused (on_each_cpu signature).
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;	/* low byte of MCG_CAP = bank count */
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	/* Enable all features in the global control register, if present. */
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}
415
416/* Add per CPU specific workarounds here */
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOS around that don't clear them
		   by default and leave crap in there. Don't log. */
		mce_bootlog = 0;
	}

}
430
Ashok Raje6982c62005-06-25 14:54:58 -0700431static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432{
433 switch (c->x86_vendor) {
434 case X86_VENDOR_INTEL:
435 mce_intel_feature_init(c);
436 break;
Jacob Shin89b831e2005-11-05 17:25:53 +0100437 case X86_VENDOR_AMD:
438 mce_amd_feature_init(c);
439 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 default:
441 break;
442 }
443}
444
445/*
446 * Called for each booted CPU to set up machine checks.
447 * Must be called with preempt off.
448 */
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	/* Tracks CPUs already initialized, so re-entry is a no-op. */
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	/* Quirks run even when MCE ends up disabled below. */
	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
463
464/*
465 * Character device to read and clear the MCE log.
466 */
467
/* Per-CPU helper for mce_read: record this CPU's current TSC into the
   caller-supplied array, indexed by CPU number. */
static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}
473
/*
 * Read-and-clear for /dev/mcelog.  Only whole-buffer reads are
 * supported.  Drains in two phases: first all entries below the current
 * 'next' index, then — after an RCU grace period — entries that racing
 * writers were still completing, identified by a TSC older than each
 * CPU's current TSC.  Returns bytes copied, -EINVAL for partial reads,
 * -EFAULT on copy failure, -ENOMEM if the TSC scratch array fails.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);	/* serializes readers */
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		/* A writer claimed this slot but hasn't finished; give it
		   ~2 jiffies, then discard the half-written entry. */
		while (!mcelog.entry[i].finished) {
			if (time_after_eq(jiffies, start + 2)) {
				memset(mcelog.entry + i,0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	/* Wait for in-flight writers that saw the old 'next'. */
	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
534
/*
 * ioctl for /dev/mcelog: report record/log sizes and atomically
 * fetch-and-clear the overflow flags.  Requires CAP_SYS_ADMIN.
 */
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		/* cmpxchg loop: clear flags without losing concurrent sets. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
556
/* File operations for the /dev/mcelog character device. */
static const struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

/* Misc device registration: minor 227, name "mcelog". */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
567
568/*
569 * Old style boot options parsing. Only for compatibility.
570 */
571
/* "nomce" boot parameter: disable machine check initialization. */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}
577
/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	/* Accept both "mce=opt" and "mce =opt" forms. */
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
		mce_bootlog = str[0] == 'b';	/* 'b'ootlog enables, 'n'obootlog disables */
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
600
601/*
602 * Sysfs support
603 */
604
/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}
613
614/* Reinit MCEs after user configuration changes */
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	/* Stop the poller while the banks are reprogrammed. */
	if (next_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	/* Restart polling with the (possibly updated) check_interval. */
	next_interval = check_interval * HZ;
	if (next_interval)
		schedule_delayed_work(&mcheck_work, next_interval);
}
625
/* sysfs class /sys/devices/system/machinecheck with resume support. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* One sysdev per CPU (machinecheck/machinecheckN). */
DEFINE_PER_CPU(struct sys_device, device_mce);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632
/* Why are there no generic functions for this? */
/* Generates a sysfs show/set pair plus the attribute for an unsigned
   long variable; 'start' is a statement run after a successful set
   (typically mce_restart()). */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
647
/* TBD should generate these dynamically based on number of available banks */
/* Per-bank control attributes; writing any of them reinitializes MCE. */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100655
656static ssize_t show_trigger(struct sys_device *s, char *buf)
657{
658 strcpy(buf, trigger);
659 strcat(buf, "\n");
660 return strlen(trigger) + 1;
661}
662
663static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
664{
665 char *p;
666 int len;
667 strncpy(trigger, buf, sizeof(trigger));
668 trigger[sizeof(trigger)-1] = 0;
669 len = strlen(trigger);
670 p = strchr(trigger, '\n');
671 if (*p) *p = 0;
672 return len;
673}
674
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
/* tolerant has no side effect on write; check_interval restarts the poller. */
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
/* NULL-terminated attribute list installed per CPU by mce_create_device. */
static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant, &attr_check_interval, &attr_trigger,
	NULL
};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700684
Andi Kleen91c6d402005-07-28 21:15:39 -0700685/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
/* Registers machinecheck<cpu> and creates its attribute files.
   Returns 0 on success, -EIO when the CPU lacks MCE support, or the
   sysdev_register error. */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));

	if (!err) {
		for (i = 0; mce_attributes[i]; i++)
			sysdev_create_file(&per_cpu(device_mce,cpu),
				mce_attributes[i]);
	}
	return err;
}
705
/* Tear down the per-CPU sysdev created by mce_create_device. */
static void mce_remove_device(unsigned int cpu)
{
	int i;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce,cpu),
			mce_attributes[i]);
	sysdev_unregister(&per_cpu(device_mce,cpu));
	/* Reset the embedded kobject so the device can be re-registered
	   when the CPU comes back online. */
	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}
Andi Kleen91c6d402005-07-28 21:15:39 -0700716
717/* Get notified when a cpu comes on/off. Be hotplug friendly. */
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
/* Creates/removes the per-CPU sysfs device on online/dead transitions. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		mce_remove_device(cpu);
		break;
	}
	return NOTIFY_OK;
}
735
/* Hotplug notifier registered by mce_init_device. */
static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
739
/* Register the sysfs class, per-CPU devices, the hotplug notifier and
   /dev/mcelog.  Returns -EIO when the boot CPU lacks MCE support. */
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700758device_initcall(mce_init_device);