Skip to content

Commit 34da4a5

Browse files
yghannam authored and bp3tk0v committed
x86/mce: Unify AMD THR handler with MCA Polling
AMD systems optionally support an MCA thresholding interrupt. The interrupt should be used as another signal to trigger MCA polling. This is similar to how the Intel Corrected Machine Check interrupt (CMCI) is handled.

AMD MCA thresholding is managed using the MCA_MISC registers within an MCA bank. The OS will need to modify the hardware error count field in order to reset the threshold limit and rearm the interrupt. Management of the MCA_MISC register should be done as a follow-up to the basic MCA polling flow. It should not be the main focus of the interrupt handler.

Furthermore, future systems will have the ability to send an MCA thresholding interrupt to the OS even when the OS does not manage the feature, i.e. MCA_MISC registers are Read-as-Zero/Locked.

Call the common MCA polling function when handling the MCA thresholding interrupt. This will allow the OS to find any valid errors whether or not the MCA thresholding feature is OS-managed. Also, this allows the common MCA polling options and kernel parameters to apply to AMD systems.

Add a callback to the MCA polling function to check and reset any threshold blocks that have reached their threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
1 parent 6146a0f commit 34da4a5

1 file changed

Lines changed: 23 additions & 28 deletions

File tree

  • arch/x86/kernel/cpu/mce

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 23 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,12 @@
5454

5555
static bool thresholding_irq_en;
5656

57+
struct mce_amd_cpu_data {
58+
mce_banks_t thr_intr_banks;
59+
};
60+
61+
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
62+
5763
static const char * const th_names[] = {
5864
"load_store",
5965
"insn_fetch",
@@ -556,6 +562,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
556562
if (!b.interrupt_capable)
557563
goto done;
558564

565+
__set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
559566
b.interrupt_enable = 1;
560567

561568
if (!mce_flags.smca) {
@@ -896,12 +903,7 @@ static void amd_deferred_error_interrupt(void)
896903
log_error_deferred(bank);
897904
}
898905

899-
static void log_error_thresholding(unsigned int bank, u64 misc)
900-
{
901-
_log_error_deferred(bank, misc);
902-
}
903-
904-
static void log_and_reset_block(struct threshold_block *block)
906+
static void reset_block(struct threshold_block *block)
905907
{
906908
struct thresh_restart tr;
907909
u32 low = 0, high = 0;
@@ -915,48 +917,41 @@ static void log_and_reset_block(struct threshold_block *block)
915917
if (!(high & MASK_OVERFLOW_HI))
916918
return;
917919

918-
/* Log the MCE which caused the threshold event. */
919-
log_error_thresholding(block->bank, ((u64)high << 32) | low);
920-
921-
/* Reset threshold block after logging error. */
922920
memset(&tr, 0, sizeof(tr));
923921
tr.b = block;
924922
threshold_restart_block(&tr);
925923
}
926924

927-
/*
928-
* Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
929-
* goes off when error_count reaches threshold_limit.
930-
*/
931-
static void amd_threshold_interrupt(void)
925+
static void amd_reset_thr_limit(unsigned int bank)
932926
{
933-
struct threshold_bank **bp = this_cpu_read(threshold_banks), *thr_bank;
934-
unsigned int bank, cpu = smp_processor_id();
927+
struct threshold_bank **bp = this_cpu_read(threshold_banks);
935928
struct threshold_block *block, *tmp;
936929

937930
/*
938931
* Validate that the threshold bank has been initialized already. The
939932
* handler is installed at boot time, but on a hotplug event the
940933
* interrupt might fire before the data has been initialized.
941934
*/
942-
if (!bp)
935+
if (!bp || !bp[bank])
943936
return;
944937

945-
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
946-
if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
947-
continue;
948-
949-
thr_bank = bp[bank];
950-
if (!thr_bank)
951-
continue;
938+
list_for_each_entry_safe(block, tmp, &bp[bank]->miscj, miscj)
939+
reset_block(block);
940+
}
952941

953-
list_for_each_entry_safe(block, tmp, &thr_bank->miscj, miscj)
954-
log_and_reset_block(block);
955-
}
942+
/*
943+
* Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
944+
* goes off when error_count reaches threshold_limit.
945+
*/
946+
static void amd_threshold_interrupt(void)
947+
{
948+
machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
956949
}
957950

958951
void amd_clear_bank(struct mce *m)
959952
{
953+
amd_reset_thr_limit(m->bank);
954+
960955
mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
961956
}
962957

0 commit comments

Comments
 (0)