blob: 07bbfe7aa7f70efa86fceb4adb87139296efad5c [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Machine check handler.
3 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 * Rest from unknown author(s).
5 * 2004 Andi Kleen. Rewrote most of it.
6 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
Randy Dunlapa9415642006-01-11 12:17:48 -080018#include <linux/capability.h>
Andi Kleen91c6d402005-07-28 21:15:39 -070019#include <linux/cpu.h>
20#include <linux/percpu.h>
Tim Hockine02e68d2007-07-21 17:10:36 +020021#include <linux/poll.h>
22#include <linux/thread_info.h>
Andi Kleen8c566ef2005-09-12 18:49:24 +020023#include <linux/ctype.h>
Andi Kleena98f0dd2007-02-13 13:26:23 +010024#include <linux/kmod.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026#include <asm/processor.h>
27#include <asm/msr.h>
28#include <asm/mce.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070029#include <asm/uaccess.h>
Andi Kleen0a9c3ee2006-01-11 22:46:54 +010030#include <asm/smp.h>
Tim Hockine02e68d2007-07-21 17:10:36 +020031#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032
/* Minor number of the /dev/mcelog misc character device. */
#define MISC_MCELOG_MINOR 227
/* Maximum number of MCE reporting banks this driver handles. */
#define NR_BANKS 6

/* Count of CPUs currently executing the machine check handler. */
atomic_t mce_entry;

/* Set by "nomce"/"mce=off"; skips all MCE initialization. */
static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;			/* number of banks actually in use */
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long notify_user;	/* bit 0: unreported events pending */
static int rip_msr;			/* MSR with exact RIP, if supported */
static int mce_bootlog = 1;		/* log MCEs left over from before boot */
static atomic_t mce_events;		/* total events passed to mce_log() */

/* Userspace helper run on new events; settable via the sysfs "trigger" file. */
static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

/* Readers of /dev/mcelog sleep/poll here; woken when new events arrive. */
static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
59
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

/* Global MCE record ring, exported to userspace through /dev/mcelog. */
struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};
70
/*
 * Append one machine check record to the global mcelog buffer.
 * Lockless so it is safe from the exception handler: concurrent writers
 * race with cmpxchg on mcelog.next to claim a free slot.  When the buffer
 * is full the new record is dropped and MCE_OVERFLOW is flagged (earlier
 * errors are assumed to be the more interesting ones).
 */
void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	/* finished==0 marks the slot as in-flight until the copy completes */
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim the slot; retry if another CPU got there first. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish the record only after the copy is globally visible. */
	mcelog.entry[entry].finished = 1;
	wmb();

	/* Tell mce_notify_user() there is something to report. */
	set_bit(0, &notify_user);
}
105
/*
 * Dump one MCE record to the console at KERN_EMERG level.
 * NOTE(review): the exact output format appears to be consumed by the
 * userspace mcelog tool (see the "--ascii" hint below) -- confirm before
 * changing any of these strings.
 */
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		/* Only kernel text addresses can be resolved to symbols. */
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	"Run through mcelog --ascii to decode and contact your hardware vendor\n");
}
132
/*
 * Fatal machine check: dump every logged MCE whose TSC is at or after
 * @start, plus the @backup record (the one that triggered the panic) if
 * it was not already in the log, then panic with @msg.
 */
static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		/* Skip events older than the start of this machine check. */
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		/* Backup was already printed as part of the log? Drop it. */
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}
150
151static int mce_available(struct cpuinfo_x86 *c)
152{
Akinobu Mita3d1712c2006-03-24 03:15:11 -0800153 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154}
155
Andi Kleen94ad8472005-04-16 15:25:09 -0700156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
157{
158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
159 m->rip = regs->rip;
160 m->cs = regs->cs;
161 } else {
162 m->rip = 0;
163 m->cs = 0;
164 }
165 if (rip_msr) {
166 /* Assume the RIP in the MSR is exact. Is this true? */
167 m->mcgstatus |= MCG_STATUS_EIPV;
168 rdmsrl(rip_msr, m->rip);
169 m->cs = 0;
170 }
171}
172
/*
 * The actual machine check handler
 */

/*
 * Entry point for the #MC exception (regs != NULL) and for the periodic
 * poller (regs == NULL, which never takes any final action).
 * @error_code: exception error code; also reused at boot time:
 *	-1 means log and clear left-over MCEs, -2 clear without logging.
 */
void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;

	atomic_inc(&mce_entry);

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	/* Scan every enabled bank for a valid error record. */
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		/* error_code < 0 only during boot-time log draining */
		if (error_code >= 0)
			rdtscll(m.tsc);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * do_exit() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			do_exit(SIGBUS);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				&panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

 out:
	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
319
#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * Called by the thermal interrupt after the event has been processed
 * and the decision was made to log it.  @status is saved into the
 * 'status' field of the record and historically holds the
 * MSR_IA32_THERMAL_STATUS (Intel) register value.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
	/* Designated initializer zeroes all remaining fields. */
	struct mce m = {
		.cpu = cpu,
		.bank = MCE_THERMAL_BANK,
		.status = status,
	};

	rdtscll(m.tsc);
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
346
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
/* Forward declaration: the work function reschedules itself. */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700357
358static void mcheck_check_cpu(void *info)
359{
360 if (mce_available(&current_cpu_data))
361 do_machine_check(NULL, 0);
362}
363
David Howells65f27f32006-11-22 14:55:48 +0000364static void mcheck_timer(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365{
366 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367
368 /*
Tim Hockine02e68d2007-07-21 17:10:36 +0200369 * Alert userspace if needed. If we logged an MCE, reduce the
370 * polling interval, otherwise increase the polling interval.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700371 */
Tim Hockine02e68d2007-07-21 17:10:36 +0200372 if (mce_notify_user()) {
Tim Hockin8a336b02007-05-02 19:27:19 +0200373 next_interval = max(next_interval/2, HZ/100);
Tim Hockin8a336b02007-05-02 19:27:19 +0200374 } else {
Venki Pallipadi22293e52007-07-21 17:10:44 +0200375 next_interval = min(next_interval*2,
376 (int)round_jiffies_relative(check_interval*HZ));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700377 }
Tim Hockin8a336b02007-05-02 19:27:19 +0200378
379 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380}
381
/*
 * This is only called from process context.  This is where we do
 * anything we need to alert userspace about new MCEs.  This is called
 * directly from the poller and also from entry.S and idle, thanks to
 * TIF_MCE_NOTIFY.
 *
 * Returns 1 if there were new events to report, 0 otherwise.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		/* Rate-limits the console message to one per poll interval. */
		static unsigned long last_print;
		unsigned long now = jiffies;

		/* Wake poll()ers and readers of /dev/mcelog. */
		wake_up_interruptible(&mce_wait);
		/* Run the user-configured trigger program, if any. */
		if (trigger[0])
			call_usermodehelper(trigger, trigger_argv, NULL,
						UMH_NO_WAIT);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}
409
410/* see if the idle task needs to notify userspace */
411static int
412mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
413{
414 /* IDLE_END should be safe - interrupts are back on */
415 if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
416 mce_notify_user();
417
418 return NOTIFY_OK;
419}
420
421static struct notifier_block mce_idle_notifier = {
422 .notifier_call = mce_idle_callback,
423};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700424
/* Kick off the background poller and hook into the idle notifier. */
static __init int periodic_mcheck_init(void)
{
	next_interval = check_interval * HZ;
	/* check_interval == 0 disables polling entirely. */
	if (next_interval)
		schedule_delayed_work(&mcheck_work,
				      round_jiffies_relative(next_interval));
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);
435
436
437/*
438 * Initialize Machine Checks for a CPU.
439 */
440static void mce_init(void *dummy)
441{
442 u64 cap;
443 int i;
444
445 rdmsrl(MSR_IA32_MCG_CAP, cap);
446 banks = cap & 0xff;
447 if (banks > NR_BANKS) {
448 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
449 banks = NR_BANKS;
450 }
Andi Kleen94ad8472005-04-16 15:25:09 -0700451 /* Use accurate RIP reporting if available. */
452 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
453 rip_msr = MSR_IA32_MCG_EIP;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454
455 /* Log the machine checks left over from the previous reset.
456 This also clears all registers */
Andi Kleend5172f22005-08-07 09:42:07 -0700457 do_machine_check(NULL, mce_bootlog ? -1 : -2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458
459 set_in_cr4(X86_CR4_MCE);
460
461 if (cap & MCG_CTL_P)
462 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
463
464 for (i = 0; i < banks; i++) {
465 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
466 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
467 }
468}
469
/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOS around that don't clear them
		   by default and leave crap in there. Don't log. */
		mce_bootlog = 0;
	}

}
484
Ashok Raje6982c62005-06-25 14:54:58 -0700485static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486{
487 switch (c->x86_vendor) {
488 case X86_VENDOR_INTEL:
489 mce_intel_feature_init(c);
490 break;
Jacob Shin89b831e2005-11-05 17:25:53 +0100491 case X86_VENDOR_AMD:
492 mce_amd_feature_init(c);
493 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494 default:
495 break;
496 }
497}
498
/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	/* Tracks which CPUs have already been initialized. */
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	/* Skip if disabled, already done for this CPU, or unsupported. */
	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}
517
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

/* Open /dev/mcelog; O_EXCL grants exclusive access to the device. */
static int mce_open(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	/* Refuse if someone holds it exclusively, or we want exclusivity
	   and it is already open. */
	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);

	/* The log is not seekable. */
	return nonseekable_open(inode, file);
}
543
544static int mce_release(struct inode *inode, struct file *file)
545{
546 spin_lock(&mce_state_lock);
547
548 open_count--;
549 open_exclu = 0;
550
551 spin_unlock(&mce_state_lock);
552
553 return 0;
554}
555
/* IPI callback: record this CPU's TSC into the shared array. */
static void collect_tscs(void *data)
{
	unsigned long *tscs = data;

	rdtscll(tscs[smp_processor_id()]);
}
561
/*
 * Read (and consume) all finished records from the log.  Only full-buffer
 * reads are supported.  Serialized against other readers by mce_read_sem;
 * races with in-flight writers are handled with a short per-entry timeout
 * and a post-synchronize_sched() sweep for late arrivals.
 */
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		/* A writer claimed this slot but has not finished copying:
		   wait briefly, then give up and zap the entry. */
		while (!mcelog.entry[i].finished) {
			if (time_after_eq(jiffies, start + 2)) {
				memset(mcelog.entry + i,0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	/* Wait for any writer that saw the old mcelog.next to finish. */
	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		/* Only take entries logged before the TSC snapshot above. */
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}
622
Tim Hockine02e68d2007-07-21 17:10:36 +0200623static unsigned int mce_poll(struct file *file, poll_table *wait)
624{
625 poll_wait(file, &mce_wait, wait);
626 if (rcu_dereference(mcelog.next))
627 return POLLIN | POLLRDNORM;
628 return 0;
629}
630
/*
 * ioctl interface: report record/log sizes and atomically fetch+clear
 * the flags word (used by mcelog to detect overflow).
 * Requires CAP_SYS_ADMIN.
 */
static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		/* Read and clear atomically w.r.t. concurrent setters. */
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}
652
/* File operations for /dev/mcelog. */
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.ioctl = mce_ioctl,
};

/* Misc character device with a fixed minor so userspace can rely on it. */
static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};
666
/* Saved CR4 so restart_mce() only re-enables MCE if it was on before. */
static unsigned long old_cr4 __initdata;

/* Temporarily disable machine checks (boot-time only). */
void __init stop_mce(void)
{
	old_cr4 = read_cr4();
	clear_in_cr4(X86_CR4_MCE);
}

/* Re-enable machine checks if stop_mce() found them enabled. */
void __init restart_mce(void)
{
	if (old_cr4 & X86_CR4_MCE)
		set_in_cr4(X86_CR4_MCE);
}
680
Linus Torvalds1da177e2005-04-16 15:20:36 -0700681/*
682 * Old style boot options parsing. Only for compatibility.
683 */
684
685static int __init mcheck_disable(char *str)
686{
687 mce_dont_init = 1;
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800688 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700689}
690
Simon Arlott676b1852007-10-20 01:25:36 +0200691/* mce=off disables machine check. Note you can re-enable it later
Andi Kleend5172f22005-08-07 09:42:07 -0700692 using sysfs.
Andi Kleen8c566ef2005-09-12 18:49:24 +0200693 mce=TOLERANCELEVEL (number, see above)
Andi Kleene5835382005-11-05 17:25:54 +0100694 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
695 mce=nobootlog Don't log MCEs from before booting. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700696static int __init mcheck_enable(char *str)
697{
698 if (!strcmp(str, "off"))
699 mce_dont_init = 1;
Andi Kleene5835382005-11-05 17:25:54 +0100700 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
701 mce_bootlog = str[0] == 'b';
Andi Kleen8c566ef2005-09-12 18:49:24 +0200702 else if (isdigit(str[0]))
703 get_option(&str, &tolerant);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700704 else
705 printk("mce= argument %s ignored. Please use /sys", str);
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800706 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700707}
708
709__setup("nomce", mcheck_disable);
Andi Kleen909dd322007-10-17 18:04:38 +0200710__setup("mce=", mcheck_enable);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711
/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}
724
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (next_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	next_interval = check_interval * HZ;
	/* check_interval == 0 leaves the poller disabled. */
	if (next_interval)
		schedule_delayed_work(&mcheck_work,
				      round_jiffies_relative(next_interval));
}
737
/* Sysdev class (/sys/devices/system/machinecheck) with resume hook. */
static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

/* One sysdev per CPU, registered by mce_create_device(). */
DEFINE_PER_CPU(struct sys_device, device_mce);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744
/* Why are there no generic functions for this? */
/* Generates show_<name>/set_<name> sysfs handlers for an unsigned long
   variable; 'start' (e.g. mce_restart()) runs after a successful write. */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100767
/* Show the configured trigger program, newline terminated. */
static ssize_t show_trigger(struct sys_device *s, char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}
774
775static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
776{
777 char *p;
778 int len;
779 strncpy(trigger, buf, sizeof(trigger));
780 trigger[sizeof(trigger)-1] = 0;
781 len = strlen(trigger);
782 p = strchr(trigger, '\n');
783 if (*p) *p = 0;
784 return len;
785}
786
static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
/* All per-CPU sysfs attributes; NULL-terminated for the create loop. */
static struct sysdev_attribute *mce_attributes[] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
	&attr_tolerant, &attr_check_interval, &attr_trigger,
	NULL
};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700796
/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&cpu_data(cpu)))
		return -EIO;

	/* kobject may hold stale state from a previous hotplug cycle. */
	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce,cpu).id = cpu;
	per_cpu(device_mce,cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce,cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce,cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}

	return 0;
error:
	/* Unwind the attribute files created so far, then the device. */
	while (i--) {
		sysdev_remove_file(&per_cpu(device_mce,cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce,cpu));

	return err;
}
831
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700832static void mce_remove_device(unsigned int cpu)
Andi Kleen91c6d402005-07-28 21:15:39 -0700833{
Shaohua Li73ca5352006-01-11 22:43:06 +0100834 int i;
835
Andi Kleena98f0dd2007-02-13 13:26:23 +0100836 for (i = 0; mce_attributes[i]; i++)
Shaohua Li73ca5352006-01-11 22:43:06 +0100837 sysdev_remove_file(&per_cpu(device_mce,cpu),
Andi Kleena98f0dd2007-02-13 13:26:23 +0100838 mce_attributes[i]);
Andi Kleen91c6d402005-07-28 21:15:39 -0700839 sysdev_unregister(&per_cpu(device_mce,cpu));
840}
Andi Kleen91c6d402005-07-28 21:15:39 -0700841
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	int err = 0;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		err = mce_create_device(cpu);
		break;
	/* Tear down on a failed bring-up as well as on CPU death. */
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		mce_remove_device(cpu);
		break;
	}
	return err ? NOTIFY_BAD : NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
867
/* Register the sysdev class, per-CPU devices, hotplug hook and chardev. */
static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	/* NOTE(review): misc_register() return value is ignored -- a
	   failure would silently leave /dev/mcelog missing. */
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890device_initcall(mce_init_device);