Skip to content

Commit 7cb735d

Browse files
yghannambp3tk0v
authored and committed
x86/mce: Unify AMD DFR handler with MCA Polling
AMD systems optionally support a deferred error interrupt. The interrupt should be used as another signal to trigger MCA polling. This is similar to how other MCA interrupts are handled. Deferred errors do not require any special handling related to the interrupt, e.g. resetting or rearming the interrupt, etc. However, Scalable MCA systems include a pair of registers, MCA_DESTAT and MCA_DEADDR, that should be checked for valid errors. This check should be done whenever MCA registers are polled. Currently, the deferred error interrupt does this check, but the MCA polling function does not. Call the MCA polling function when handling the deferred error interrupt. This keeps all "polling" cases in a common function. Add an SMCA status check helper. This will do the same status check and register clearing that the interrupt handler has done. And it extends the common polling flow to find AMD deferred errors. Clear the MCA_DESTAT register at the end of the handler rather than the beginning. This maintains the procedure that the 'status' register must be cleared as the final step. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
1 parent 34da4a5 commit 7cb735d

3 files changed

Lines changed: 49 additions & 99 deletions

File tree

arch/x86/include/asm/mce.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,12 @@
165165
*/
166166
#define MCE_IN_KERNEL_COPYIN BIT_ULL(7)
167167

168+
/*
169+
* Indicates that handler should check and clear Deferred error registers
170+
* rather than common ones.
171+
*/
172+
#define MCE_CHECK_DFR_REGS BIT_ULL(8)
173+
168174
/*
169175
* This structure contains all data related to the MCE log. Also
170176
* carries a signature to make it easier to find from external

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 13 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ static bool thresholding_irq_en;
5656

5757
struct mce_amd_cpu_data {
5858
mce_banks_t thr_intr_banks;
59+
mce_banks_t dfr_intr_banks;
5960
};
6061

6162
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
@@ -300,8 +301,10 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
300301
* APIC based interrupt. First, check that no interrupt has been
301302
* set.
302303
*/
303-
if ((low & BIT(5)) && !((high >> 5) & 0x3))
304+
if ((low & BIT(5)) && !((high >> 5) & 0x3)) {
305+
__set_bit(bank, this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
304306
high |= BIT(5);
307+
}
305308

306309
this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
307310

@@ -792,37 +795,6 @@ bool amd_mce_usable_address(struct mce *m)
792795
return false;
793796
}
794797

795-
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
796-
{
797-
struct mce_hw_err err;
798-
struct mce *m = &err.m;
799-
800-
mce_prep_record(&err);
801-
802-
m->status = status;
803-
m->misc = misc;
804-
m->bank = bank;
805-
m->tsc = rdtsc();
806-
807-
if (m->status & MCI_STATUS_ADDRV) {
808-
m->addr = addr;
809-
810-
smca_extract_err_addr(m);
811-
}
812-
813-
if (mce_flags.smca) {
814-
rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
815-
816-
if (m->status & MCI_STATUS_SYNDV) {
817-
rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
818-
rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
819-
rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
820-
}
821-
}
822-
823-
mce_log(&err);
824-
}
825-
826798
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
827799
{
828800
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -832,75 +804,10 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
832804
apic_eoi();
833805
}
834806

835-
/*
836-
* Returns true if the logged error is deferred. False, otherwise.
837-
*/
838-
static inline bool
839-
_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
840-
{
841-
u64 status, addr = 0;
842-
843-
rdmsrq(msr_stat, status);
844-
if (!(status & MCI_STATUS_VAL))
845-
return false;
846-
847-
if (status & MCI_STATUS_ADDRV)
848-
rdmsrq(msr_addr, addr);
849-
850-
__log_error(bank, status, addr, misc);
851-
852-
wrmsrq(msr_stat, 0);
853-
854-
return status & MCI_STATUS_DEFERRED;
855-
}
856-
857-
static bool _log_error_deferred(unsigned int bank, u32 misc)
858-
{
859-
if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
860-
mca_msr_reg(bank, MCA_ADDR), misc))
861-
return false;
862-
863-
/*
864-
* Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
865-
* Return true here to avoid accessing these registers.
866-
*/
867-
if (!mce_flags.smca)
868-
return true;
869-
870-
/* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
871-
wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
872-
return true;
873-
}
874-
875-
/*
876-
* We have three scenarios for checking for Deferred errors:
877-
*
878-
* 1) Non-SMCA systems check MCA_STATUS and log error if found.
879-
* 2) SMCA systems check MCA_STATUS. If error is found then log it and also
880-
* clear MCA_DESTAT.
881-
* 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
882-
* log it.
883-
*/
884-
static void log_error_deferred(unsigned int bank)
885-
{
886-
if (_log_error_deferred(bank, 0))
887-
return;
888-
889-
/*
890-
* Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
891-
* for a valid error.
892-
*/
893-
_log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
894-
MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
895-
}
896-
897807
/* APIC interrupt handler for deferred errors */
898808
static void amd_deferred_error_interrupt(void)
899809
{
900-
unsigned int bank;
901-
902-
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank)
903-
log_error_deferred(bank);
810+
machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
904811
}
905812

906813
static void reset_block(struct threshold_block *block)
@@ -952,6 +859,14 @@ void amd_clear_bank(struct mce *m)
952859
{
953860
amd_reset_thr_limit(m->bank);
954861

862+
/* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
863+
if (m->status & MCI_STATUS_DEFERRED)
864+
mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
865+
866+
/* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
867+
if (m->kflags & MCE_CHECK_DFR_REGS)
868+
return;
869+
955870
mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
956871
}
957872

arch/x86/kernel/cpu/mce/core.c

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
687687
m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
688688

689689
if (m->status & MCI_STATUS_ADDRV) {
690-
m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
690+
if (m->kflags & MCE_CHECK_DFR_REGS)
691+
m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
692+
else
693+
m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
691694

692695
/*
693696
* Mask the reported address by the reported granularity.
@@ -714,6 +717,29 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
714717

715718
DEFINE_PER_CPU(unsigned, mce_poll_count);
716719

720+
/*
721+
* We have three scenarios for checking for Deferred errors:
722+
*
723+
* 1) Non-SMCA systems check MCA_STATUS and log error if found.
724+
* 2) SMCA systems check MCA_STATUS. If error is found then log it and also
725+
* clear MCA_DESTAT.
726+
* 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
727+
* log it.
728+
*/
729+
static bool smca_should_log_poll_error(struct mce *m)
730+
{
731+
if (m->status & MCI_STATUS_VAL)
732+
return true;
733+
734+
m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
735+
if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
736+
m->kflags |= MCE_CHECK_DFR_REGS;
737+
return true;
738+
}
739+
740+
return false;
741+
}
742+
717743
/*
718744
* Newer Intel systems that support software error
719745
* recovery need to make additional checks. Other
@@ -740,6 +766,9 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
740766
{
741767
struct mce *m = &err->m;
742768

769+
if (mce_flags.smca)
770+
return smca_should_log_poll_error(m);
771+
743772
/* If this entry is not valid, ignore it. */
744773
if (!(m->status & MCI_STATUS_VAL))
745774
return false;

0 commit comments

Comments
 (0)