Skip to content

Commit 70c459d

Browse files
Zildj1an authored and suryasaimadhu committed
x86/mce: Simplify AMD severity grading logic
The MCE handler needs to understand the severity of machine errors in order to act accordingly. Simplify the AMD grading logic so that it closely follows the descriptions in the public PPR documents. This will make it easier to add more fine-grained grading of errors in the future.

[ bp: Touchups. ]

Signed-off-by: Carlos Bilbao <carlos.bilbao@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Link: https://lore.kernel.org/r/20220405183212.354606-2-carlos.bilbao@amd.com
1 parent e5f2862 commit 70c459d

1 file changed

Lines changed: 36 additions & 65 deletions

File tree

arch/x86/kernel/cpu/mce/severity.c

Lines changed: 36 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -301,85 +301,56 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
301301
}
302302
}
303303

304-
static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
304+
/* See AMD PPR(s) section Machine Check Error Handling. */
305+
static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
305306
{
306-
u64 mcx_cfg;
307+
int ret;
307308

308309
/*
309-
* We need to look at the following bits:
310-
* - "succor" bit (data poisoning support), and
311-
* - TCC bit (Task Context Corrupt)
312-
* in MCi_STATUS to determine error severity.
310+
* Default return value: Action required, the error must be handled
311+
* immediately.
313312
*/
314-
if (!mce_flags.succor)
315-
return MCE_PANIC_SEVERITY;
316-
317-
mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
318-
319-
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
320-
if ((mcx_cfg & MCI_CONFIG_MCAX) &&
321-
(m->status & MCI_STATUS_TCC) &&
322-
(err_ctx == IN_KERNEL))
323-
return MCE_PANIC_SEVERITY;
324-
325-
/* ...otherwise invoke hwpoison handler. */
326-
return MCE_AR_SEVERITY;
327-
}
328-
329-
/*
330-
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
331-
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
332-
*/
333-
static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
334-
{
335-
enum context ctx = error_context(m, regs);
313+
ret = MCE_AR_SEVERITY;
336314

337315
/* Processor Context Corrupt, no need to fumble too much, die! */
338-
if (m->status & MCI_STATUS_PCC)
339-
return MCE_PANIC_SEVERITY;
340-
341-
if (m->status & MCI_STATUS_UC) {
342-
343-
if (ctx == IN_KERNEL)
344-
return MCE_PANIC_SEVERITY;
316+
if (m->status & MCI_STATUS_PCC) {
317+
ret = MCE_PANIC_SEVERITY;
318+
goto out;
319+
}
345320

346-
/*
347-
* On older systems where overflow_recov flag is not present, we
348-
* should simply panic if an error overflow occurs. If
349-
* overflow_recov flag is present and set, then software can try
350-
* to at least kill process to prolong system operation.
351-
*/
352-
if (mce_flags.overflow_recov) {
353-
if (mce_flags.smca)
354-
return mce_severity_amd_smca(m, ctx);
355-
356-
/* kill current process */
357-
return MCE_AR_SEVERITY;
358-
} else {
359-
/* at least one error was not logged */
360-
if (m->status & MCI_STATUS_OVER)
361-
return MCE_PANIC_SEVERITY;
362-
}
363-
364-
/*
365-
* For any other case, return MCE_UC_SEVERITY so that we log the
366-
* error and exit #MC handler.
367-
*/
368-
return MCE_UC_SEVERITY;
321+
if (m->status & MCI_STATUS_DEFERRED) {
322+
ret = MCE_DEFERRED_SEVERITY;
323+
goto out;
369324
}
370325

371326
/*
372-
* deferred error: poll handler catches these and adds to mce_ring so
373-
* memory-failure can take recovery actions.
327+
* If the UC bit is not set, the system either corrected or deferred
328+
* the error. No action will be required after logging the error.
374329
*/
375-
if (m->status & MCI_STATUS_DEFERRED)
376-
return MCE_DEFERRED_SEVERITY;
330+
if (!(m->status & MCI_STATUS_UC)) {
331+
ret = MCE_KEEP_SEVERITY;
332+
goto out;
333+
}
377334

378335
/*
379-
* corrected error: poll handler catches these and passes responsibility
380-
* of decoding the error to EDAC
336+
* On MCA overflow, without the MCA overflow recovery feature the
337+
* system will not be able to recover, panic.
381338
*/
382-
return MCE_KEEP_SEVERITY;
339+
if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
340+
ret = MCE_PANIC_SEVERITY;
341+
goto out;
342+
}
343+
344+
if (!mce_flags.succor) {
345+
ret = MCE_PANIC_SEVERITY;
346+
goto out;
347+
}
348+
349+
if (error_context(m, regs) == IN_KERNEL)
350+
ret = MCE_PANIC_SEVERITY;
351+
352+
out:
353+
return ret;
383354
}
384355

385356
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)

0 commit comments

Comments
 (0)