Skip to content

Commit def675c

Browse files
committed
drm/xe/mert: Improve handling of MERT CAT errors
All MERT catastrophic errors but VF's LMTT fault are serious, so we shouldn't limit our handling only to print debug messages. Change CATERR message to error level and then declare the device as wedged to match expectation from the design document. For the LMTT faults, add a note about adding tracking of this unexpected VF activity. While at it, rename register fields definitions to match the BSpec. Also drop trailing include guard name from the regs.h file. BSpec: 74625 Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Cc: Lukasz Laguna <lukasz.laguna@intel.com> Reviewed-by: Lukasz Laguna <lukasz.laguna@intel.com> Link: https://patch.msgid.link/20260112183716.28700-1-michal.wajdeczko@intel.com
1 parent 6b2ff1d commit def675c

2 files changed

Lines changed: 39 additions & 14 deletions

File tree

drivers/gpu/drm/xe/regs/xe_mert_regs.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@
1111
#define MERT_LMEM_CFG XE_REG(0x1448b0)
1212

1313
#define MERT_TLB_CT_INTR_ERR_ID_PORT XE_REG(0x145190)
14-
#define MERT_TLB_CT_VFID_MASK REG_GENMASK(16, 9)
15-
#define MERT_TLB_CT_ERROR_MASK REG_GENMASK(5, 0)
16-
#define MERT_TLB_CT_LMTT_FAULT 0x05
14+
#define CATERR_VFID REG_GENMASK(16, 9)
15+
#define CATERR_CODES REG_GENMASK(5, 0)
16+
#define CATERR_NO_ERROR 0x00
17+
#define CATERR_UNMAPPED_GGTT 0x01
18+
#define CATERR_LMTT_FAULT 0x05
1719

1820
#define MERT_TLB_INV_DESC_A XE_REG(0x14cf7c)
1921
#define MERT_TLB_INV_DESC_A_VALID REG_BIT(0)
2022

21-
#endif /* _XE_MERT_REGS_H_ */
23+
#endif

drivers/gpu/drm/xe/xe_mert.c

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "xe_device.h"
1010
#include "xe_mert.h"
1111
#include "xe_mmio.h"
12+
#include "xe_sriov_printk.h"
1213
#include "xe_tile.h"
1314

1415
/**
@@ -55,6 +56,37 @@ int xe_mert_invalidate_lmtt(struct xe_device *xe)
5556
return 0;
5657
}
5758

59+
/*
 * Decode and act on a MERT catastrophic error (CAT_ERR) report.
 *
 * Reads MERT_TLB_CT_INTR_ERR_ID_PORT through the root tile's MMIO and
 * returns without side effects when the register reads as zero.
 * Otherwise the register is written back with 0 and the VFID and error
 * code fields are extracted from the value that was read:
 *
 *  - CATERR_NO_ERROR:      nothing is logged and no action is taken;
 *  - CATERR_UNMAPPED_GGTT: logged at error level, device declared wedged;
 *  - CATERR_LMTT_FAULT:    logged at debug level with the VF number only;
 *  - any other code:       logged at error level, device declared wedged.
 */
static void mert_handle_cat_error(struct xe_device *xe)
60+
{
61+
struct xe_tile *tile = xe_device_get_root_tile(xe);
62+
u32 reg_val, vfid, code;
63+
64+
reg_val = xe_mmio_read32(&tile->mmio, MERT_TLB_CT_INTR_ERR_ID_PORT);
65+
/* An all-zero register is treated as "nothing reported": skip the write below */
if (!reg_val)
66+
return;
67+
/* NOTE(review): presumably writing 0 clears/acknowledges the report - confirm against BSpec */
xe_mmio_write32(&tile->mmio, MERT_TLB_CT_INTR_ERR_ID_PORT, 0);
68+
69+
vfid = FIELD_GET(CATERR_VFID, reg_val);
70+
code = FIELD_GET(CATERR_CODES, reg_val);
71+
72+
switch (code) {
73+
/* Reached when the register was non-zero but its code field is zero */
case CATERR_NO_ERROR:
74+
break;
75+
case CATERR_UNMAPPED_GGTT:
76+
xe_sriov_err(xe, "MERT: CAT_ERR: Access to an unmapped GGTT!\n");
77+
xe_device_declare_wedged(xe);
78+
break;
79+
/* The only case that uses vfid; the device is not declared wedged here */
case CATERR_LMTT_FAULT:
80+
xe_sriov_dbg(xe, "MERT: CAT_ERR: VF%u LMTT fault!\n", vfid);
81+
/* XXX: track/report malicious VF activity */
82+
break;
83+
/* Unrecognized codes get the same error-level log + wedge treatment */
default:
84+
xe_sriov_err(xe, "MERT: Unexpected CAT_ERR code=%#x!\n", code);
85+
xe_device_declare_wedged(xe);
86+
break;
87+
}
88+
}
89+
5890
/**
5991
* xe_mert_irq_handler - Handler for MERT interrupts
6092
* @xe: the &xe_device
@@ -68,20 +100,11 @@ void xe_mert_irq_handler(struct xe_device *xe, u32 master_ctl)
68100
struct xe_mert *mert = &tile->mert;
69101
unsigned long flags;
70102
u32 reg_val;
71-
u8 err;
72103

73104
if (!(master_ctl & SOC_H2DMEMINT_IRQ))
74105
return;
75106

76-
reg_val = xe_mmio_read32(&tile->mmio, MERT_TLB_CT_INTR_ERR_ID_PORT);
77-
xe_mmio_write32(&tile->mmio, MERT_TLB_CT_INTR_ERR_ID_PORT, 0);
78-
79-
err = FIELD_GET(MERT_TLB_CT_ERROR_MASK, reg_val);
80-
if (err == MERT_TLB_CT_LMTT_FAULT)
81-
drm_dbg(&xe->drm, "MERT catastrophic error: LMTT fault (VF%u)\n",
82-
FIELD_GET(MERT_TLB_CT_VFID_MASK, reg_val));
83-
else if (err)
84-
drm_dbg(&xe->drm, "MERT catastrophic error: Unexpected fault (0x%x)\n", err);
107+
mert_handle_cat_error(xe);
85108

86109
spin_lock_irqsave(&mert->lock, flags);
87110
if (mert->tlb_inv_triggered) {

0 commit comments

Comments
 (0)