3
3
* K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
4
4
* Rest from unknown author(s).
5
5
* 2004 Andi Kleen. Rewrote most of it.
6
+ * Copyright 2008 Intel Corporation
7
+ * Author: Andi Kleen
6
8
*/
7
9
8
10
#include <linux/init.h>
@@ -189,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
189
191
}
190
192
191
193
/*
192
- * The actual machine check handler
194
+ * Poll for corrected events or events that happened before reset.
195
+ * Those are just logged through /dev/mcelog.
196
+ *
197
+ * This is executed in standard interrupt context.
198
+ */
199
+ void machine_check_poll (enum mcp_flags flags )
200
+ {
201
+ struct mce m ;
202
+ int i ;
203
+
204
+ mce_setup (& m );
205
+
206
+ rdmsrl (MSR_IA32_MCG_STATUS , m .mcgstatus );
207
+ for (i = 0 ; i < banks ; i ++ ) {
208
+ if (!bank [i ])
209
+ continue ;
210
+
211
+ m .misc = 0 ;
212
+ m .addr = 0 ;
213
+ m .bank = i ;
214
+ m .tsc = 0 ;
215
+
216
+ barrier ();
217
+ rdmsrl (MSR_IA32_MC0_STATUS + i * 4 , m .status );
218
+ if (!(m .status & MCI_STATUS_VAL ))
219
+ continue ;
220
+
221
+ /*
222
+ * Uncorrected events are handled by the exception handler
223
+ * when it is enabled. But when the exception is disabled log
224
+ * everything.
225
+ *
226
+ * TBD do the same check for MCI_STATUS_EN here?
227
+ */
228
+ if ((m .status & MCI_STATUS_UC ) && !(flags & MCP_UC ))
229
+ continue ;
230
+
231
+ if (m .status & MCI_STATUS_MISCV )
232
+ rdmsrl (MSR_IA32_MC0_MISC + i * 4 , m .misc );
233
+ if (m .status & MCI_STATUS_ADDRV )
234
+ rdmsrl (MSR_IA32_MC0_ADDR + i * 4 , m .addr );
235
+
236
+ if (!(flags & MCP_TIMESTAMP ))
237
+ m .tsc = 0 ;
238
+ /*
239
+ * Don't get the IP here because it's unlikely to
240
+ * have anything to do with the actual error location.
241
+ */
242
+
243
+ mce_log (& m );
244
+ add_taint (TAINT_MACHINE_CHECK );
245
+
246
+ /*
247
+ * Clear state for this bank.
248
+ */
249
+ wrmsrl (MSR_IA32_MC0_STATUS + 4 * i , 0 );
250
+ }
251
+
252
+ /*
253
+ * Don't clear MCG_STATUS here because it's only defined for
254
+ * exceptions.
255
+ */
256
+ }
257
+
258
+ /*
259
+ * The actual machine check handler. This only handles real
260
+ * exceptions when something got corrupted coming in through int 18.
261
+ *
262
+ * This is executed in NMI context not subject to normal locking rules. This
263
+ * implies that most kernel services cannot be safely used. Don't even
264
+ * think about putting a printk in there!
193
265
*/
194
266
void do_machine_check (struct pt_regs * regs , long error_code )
195
267
{
@@ -207,13 +279,14 @@ void do_machine_check(struct pt_regs * regs, long error_code)
207
279
* error.
208
280
*/
209
281
int kill_it = 0 ;
282
+ DECLARE_BITMAP (toclear , MAX_NR_BANKS );
210
283
211
284
atomic_inc (& mce_entry );
212
285
213
- if ((regs
214
- && notify_die (DIE_NMI , "machine check" , regs , error_code ,
286
+ if (notify_die (DIE_NMI , "machine check" , regs , error_code ,
215
287
18 , SIGKILL ) == NOTIFY_STOP )
216
- || !banks )
288
+ goto out2 ;
289
+ if (!banks )
217
290
goto out2 ;
218
291
219
292
mce_setup (& m );
@@ -227,6 +300,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
227
300
barrier ();
228
301
229
302
for (i = 0 ; i < banks ; i ++ ) {
303
+ __clear_bit (i , toclear );
230
304
if (!bank [i ])
231
305
continue ;
232
306
@@ -238,6 +312,20 @@ void do_machine_check(struct pt_regs * regs, long error_code)
238
312
if ((m .status & MCI_STATUS_VAL ) == 0 )
239
313
continue ;
240
314
315
+ /*
316
+ * Non uncorrected errors are handled by machine_check_poll
317
+ * Leave them alone.
318
+ */
319
+ if ((m .status & MCI_STATUS_UC ) == 0 )
320
+ continue ;
321
+
322
+ /*
323
+ * Set taint even when machine check was not enabled.
324
+ */
325
+ add_taint (TAINT_MACHINE_CHECK );
326
+
327
+ __set_bit (i , toclear );
328
+
241
329
if (m .status & MCI_STATUS_EN ) {
242
330
/* if PCC was set, there's no way out */
243
331
no_way_out |= !!(m .status & MCI_STATUS_PCC );
@@ -251,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
251
339
no_way_out = 1 ;
252
340
kill_it = 1 ;
253
341
}
342
+ } else {
343
+ /*
344
+ * Machine check event was not enabled. Clear, but
345
+ * ignore.
346
+ */
347
+ continue ;
254
348
}
255
349
256
350
if (m .status & MCI_STATUS_MISCV )
@@ -259,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
259
353
rdmsrl (MSR_IA32_MC0_ADDR + i * 4 , m .addr );
260
354
261
355
mce_get_rip (& m , regs );
262
- if (error_code < 0 )
263
- m .tsc = 0 ;
264
- if (error_code != -2 )
265
- mce_log (& m );
356
+ mce_log (& m );
266
357
267
358
/* Did this bank cause the exception? */
268
359
/* Assume that the bank with uncorrectable errors did it,
@@ -271,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
271
362
panicm = m ;
272
363
panicm_found = 1 ;
273
364
}
274
-
275
- add_taint (TAINT_MACHINE_CHECK );
276
365
}
277
366
278
- /* Never do anything final in the polling timer */
279
- if (!regs )
280
- goto out ;
281
-
282
367
/* If we didn't find an uncorrectable error, pick
283
368
the last one (shouldn't happen, just being safe). */
284
369
if (!panicm_found )
@@ -325,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
325
410
/* notify userspace ASAP */
326
411
set_thread_flag (TIF_MCE_NOTIFY );
327
412
328
- out :
329
413
/* the last thing we do is clear state */
330
- for (i = 0 ; i < banks ; i ++ )
331
- wrmsrl (MSR_IA32_MC0_STATUS + 4 * i , 0 );
414
+ for (i = 0 ; i < banks ; i ++ ) {
415
+ if (test_bit (i , toclear ))
416
+ wrmsrl (MSR_IA32_MC0_STATUS + 4 * i , 0 );
417
+ }
332
418
wrmsrl (MSR_IA32_MCG_STATUS , 0 );
333
419
out2 :
334
420
atomic_dec (& mce_entry );
@@ -377,7 +463,7 @@ static void mcheck_timer(unsigned long data)
377
463
WARN_ON (smp_processor_id () != data );
378
464
379
465
if (mce_available (& current_cpu_data ))
380
- do_machine_check ( NULL , 0 );
466
+ machine_check_poll ( MCP_TIMESTAMP );
381
467
382
468
/*
383
469
* Alert userspace if needed. If we logged an MCE, reduce the
@@ -494,9 +580,10 @@ static void mce_init(void *dummy)
494
580
u64 cap ;
495
581
int i ;
496
582
497
- /* Log the machine checks left over from the previous reset.
498
- This also clears all registers */
499
- do_machine_check (NULL , mce_bootlog ? -1 : -2 );
583
+ /*
584
+ * Log the machine checks left over from the previous reset.
585
+ */
586
+ machine_check_poll (MCP_UC );
500
587
501
588
set_in_cr4 (X86_CR4_MCE );
502
589
0 commit comments