Skip to content

Commit eeb3f76

Browse files
yghannam authored and bp3tk0v committed
x86/mce: Save and use APEI corrected threshold limit
The MCA threshold limit generally is not something that needs to change during runtime. It is common for a system administrator to decide on a policy for their managed systems.

If MCA thresholding is OS-managed, then the threshold limit must be set at every boot. However, many systems allow the user to set a value in their BIOS. And this is reported through an APEI HEST entry even if thresholding is not in FW-First mode.

Use this value, if available, to set the OS-managed threshold limit. Users can still override it through sysfs if desired for testing or debug.

APEI is parsed after MCE is initialized. So reset the thresholding blocks later to pick up the threshold limit.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20251104-wip-mca-updates-v8-0-66c8eacf67b9@amd.com
1 parent 56f17be commit eeb3f76

5 files changed

Lines changed: 39 additions & 2 deletions

File tree

arch/x86/include/asm/mce.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -308,6 +308,12 @@ DECLARE_PER_CPU(struct mce, injectm);
308308
/* Disable CMCI/polling for MCA bank claimed by firmware */
309309
extern void mce_disable_bank(int bank);
310310

311+
/*
 * mce_save_apei_thr_limit() takes the threshold limit as a u32. When
 * CONFIG_X86_MCE_THRESHOLD is not set, an empty inline stub is provided
 * instead so callers do not need their own #ifdef.
 */
#ifdef CONFIG_X86_MCE_THRESHOLD
312+
void mce_save_apei_thr_limit(u32 thr_limit);
313+
#else
314+
static inline void mce_save_apei_thr_limit(u32 thr_limit) { }
315+
#endif /* CONFIG_X86_MCE_THRESHOLD */
316+
311317
/*
312318
* Exception handler
313319
*/

arch/x86/kernel/acpi/apei.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,8 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
1919
if (!cmc->enabled)
2020
return 0;
2121

22+
mce_save_apei_thr_limit(cmc->notify.error_threshold_value);
23+
2224
/*
2325
* We expect HEST to provide a list of MC banks that report errors
2426
* in firmware first mode. Otherwise, return non-zero value to

arch/x86/kernel/cpu/mce/amd.c

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -489,6 +489,18 @@ static void threshold_restart_bank(unsigned int bank, bool intr_en)
489489
}
490490
}
491491

492+
/*
 * Try to use the threshold limit reported through APEI.
 *
 * Returns the value from mce_get_apei_thr_limit() capped at THRESHOLD_MAX,
 * or THRESHOLD_MAX itself when that value is zero.
 */
493+
static u16 get_thr_limit(void)
494+
{
495+
u32 thr_limit = mce_get_apei_thr_limit();
496+
497+
/* Fallback to old default if APEI limit is not available. */
498+
if (!thr_limit)
499+
return THRESHOLD_MAX;
500+
501+
/* Cap the reported limit at THRESHOLD_MAX. */
return min(thr_limit, THRESHOLD_MAX);
502+
}
503+
492504
static void mce_threshold_block_init(struct threshold_block *b, int offset)
493505
{
494506
struct thresh_restart tr = {
@@ -497,7 +509,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
497509
.lvt_off = offset,
498510
};
499511

500-
b->threshold_limit = THRESHOLD_MAX;
512+
b->threshold_limit = get_thr_limit();
501513
threshold_restart_block(&tr);
502514
};
503515

@@ -1071,7 +1083,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
10711083
b->address = address;
10721084
b->interrupt_enable = 0;
10731085
b->interrupt_capable = lvt_interrupt_supported(bank, high);
1074-
b->threshold_limit = THRESHOLD_MAX;
1086+
b->threshold_limit = get_thr_limit();
10751087

10761088
if (b->interrupt_capable) {
10771089
default_attrs[2] = &interrupt_enable.attr;
@@ -1082,6 +1094,8 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
10821094

10831095
list_add(&b->miscj, &tb->miscj);
10841096

1097+
mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
1098+
10851099
err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
10861100
if (err)
10871101
goto out_free;

arch/x86/kernel/cpu/mce/internal.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -67,13 +67,15 @@ void mce_track_storm(struct mce *mce);
6767
void mce_inherit_storm(unsigned int bank);
6868
bool mce_get_storm_mode(void);
6969
void mce_set_storm_mode(bool storm);
70+
u32 mce_get_apei_thr_limit(void);
7071
#else
7172
static inline void cmci_storm_begin(unsigned int bank) {}
7273
static inline void cmci_storm_end(unsigned int bank) {}
7374
static inline void mce_track_storm(struct mce *mce) {}
7475
static inline void mce_inherit_storm(unsigned int bank) {}
7576
static inline bool mce_get_storm_mode(void) { return false; }
7677
static inline void mce_set_storm_mode(bool storm) {}
78+
static inline u32 mce_get_apei_thr_limit(void) { return 0; }
7779
#endif
7880

7981
/*

arch/x86/kernel/cpu/mce/threshold.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,19 @@
1313

1414
#include "internal.h"
1515

16+
/*
 * Threshold limit last stored by mce_save_apei_thr_limit(). As a static
 * object it is zero until that function is called.
 */
static u32 mce_apei_thr_limit;
17+
18+
/*
 * Store @thr_limit in mce_apei_thr_limit and log it at info level.
 * The value is stored as given; no range check is done here.
 */
void mce_save_apei_thr_limit(u32 thr_limit)
19+
{
20+
mce_apei_thr_limit = thr_limit;
21+
pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
22+
}
23+
24+
/*
 * Return the current value of mce_apei_thr_limit, i.e. whatever
 * mce_save_apei_thr_limit() last stored, or zero if nothing was stored.
 */
u32 mce_get_apei_thr_limit(void)
25+
{
26+
return mce_apei_thr_limit;
27+
}
28+
1629
static void default_threshold_interrupt(void)
1730
{
1831
pr_err("Unexpected threshold interrupt at vector %x\n",

0 commit comments

Comments
 (0)