Skip to content

Commit ba391a1

Browse files
committed
drm/i915/guc: Include the GuC registers in the error state
If GuC hangs, the GuC logs might not contain enough information to understand exactly why the hang occurred. In this case, we need to look at the GuC HW state to try to understand where the GuC is stuck. It is therefore useful to include the GuC HW state in the error capture.

The list of registers that are part of the GuC HW state can change based on platform, but it is the same for all platforms from TGL to MTL, so we only need to support one version for i915.

v2: revised list
v3: remove confusing comment, use sizeof(u32) instead of 4 (John)

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://lore.kernel.org/r/20250909223621.3782625-2-daniele.ceraolospurio@intel.com
1 parent f8d2c26 commit ba391a1

4 files changed

Lines changed: 112 additions & 0 deletions

File tree

drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ static void guc_prepare_xfer(struct intel_gt *gt)
4646
/* allows for 5us (in 10ns units) before GT can go to RC6 */
4747
intel_uncore_write(uncore, GUC_ARAT_C6DIS, 0x1FF);
4848
}
49+
50+
/*
51+
* Starting from IP 12.50 we need to enable the mirroring of GuC
52+
* internal state to debug registers. This is always enabled on previous
53+
* IPs.
54+
*/
55+
if (GRAPHICS_VER_FULL(uncore->i915) >= IP_VER(12, 50))
56+
intel_uncore_rmw(uncore, GUC_SHIM_CONTROL2, 0, GUC_ENABLE_DEBUG_REG);
4957
}
5058

5159
static int guc_xfer_rsa_mmio(struct intel_uc_fw *guc_fw,

drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
#define GUC_GEN10_SHIM_WC_ENABLE (1<<21)
9797

9898
#define GUC_SHIM_CONTROL2 _MMIO(0xc068)
99+
#define GUC_ENABLE_DEBUG_REG (1<<11)
99100
#define GUC_IS_PRIVILEGED (1<<29)
100101
#define GSC_LOADS_HUC (1<<30)
101102

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,74 @@ static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
685685
ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size);
686686
}
687687

688+
/* This list includes registers that are useful in debugging GuC hangs. */
689+
const struct {
690+
u32 start;
691+
u32 count;
692+
} guc_hw_reg_state[] = {
693+
{ 0xc0b0, 2 },
694+
{ 0xc000, 65 },
695+
{ 0xc140, 1 },
696+
{ 0xc180, 16 },
697+
{ 0xc1dc, 10 },
698+
{ 0xc300, 79 },
699+
{ 0xc4b4, 47 },
700+
{ 0xc574, 1 },
701+
{ 0xc57c, 1 },
702+
{ 0xc584, 11 },
703+
{ 0xc5c0, 8 },
704+
{ 0xc5e4, 1 },
705+
{ 0xc5ec, 103 },
706+
{ 0xc7c0, 1 },
707+
{ 0xc0b0, 2 }
708+
};
709+
710+
/*
 * Emit a single line of the register dump.
 *
 * The line starts with the offset @start and is followed by the largest group
 * of values (8, 4, 2 or 1) that does not exceed @count, taken from the front
 * of @dump. At least one value is always printed.
 *
 * Returns the number of values that were printed.
 */
static u32 print_range_line(struct drm_i915_error_state_buf *m, u32 start, u32 *dump, u32 count)
{
	if (count < 2) {
		err_printf(m, "[0x%04x] 0x%08x\n", start, dump[0]);
		return 1;
	}

	if (count < 4) {
		err_printf(m, "[0x%04x] 0x%08x 0x%08x\n", start, dump[0], dump[1]);
		return 2;
	}

	if (count < 8) {
		err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x\n",
			   start, dump[0], dump[1], dump[2], dump[3]);
		return 4;
	}

	err_printf(m, "[0x%04x] 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
		   start, dump[0], dump[1], dump[2], dump[3],
		   dump[4], dump[5], dump[6], dump[7]);
	return 8;
}
729+
730+
/*
 * Print the captured GuC register values, if any were captured.
 *
 * @hw_state is walked in the same order as guc_hw_reg_state[]: the values of
 * each range follow directly after those of the previous range. Every range
 * is broken up into lines by print_range_line(), each line being labelled
 * with the offset of its first register.
 */
static void err_print_guc_hw_state(struct drm_i915_error_state_buf *m, u32 *hw_state)
{
	u32 *cursor = hw_state;
	int range;

	/* Nothing was captured, so there is nothing to print. */
	if (!hw_state)
		return;

	err_printf(m, "GuC Register State:\n");

	for (range = 0; range < ARRAY_SIZE(guc_hw_reg_state); range++) {
		const u32 base = guc_hw_reg_state[range].start;
		const u32 num = guc_hw_reg_state[range].count;
		u32 done = 0;

		/* Each call reports how many values it consumed. */
		while (done < num)
			done += print_range_line(m, base + done * sizeof(u32),
						 cursor + done, num - done);

		/* A line must never run past the end of its range. */
		GEM_BUG_ON(done != num);
		cursor += done;
	}
}
755+
688756
static void err_print_uc(struct drm_i915_error_state_buf *m,
689757
const struct intel_uc_coredump *error_uc)
690758
{
@@ -693,6 +761,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m,
693761
intel_uc_fw_dump(&error_uc->guc_fw, &p);
694762
intel_uc_fw_dump(&error_uc->huc_fw, &p);
695763
err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
764+
err_print_guc_hw_state(m, error_uc->guc.hw_state);
696765
intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
697766
err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
698767
err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
@@ -1025,6 +1094,7 @@ static void cleanup_uc(struct intel_uc_coredump *uc)
10251094
kfree(uc->huc_fw.file_wanted.path);
10261095
i915_vma_coredump_free(uc->guc.vma_log);
10271096
i915_vma_coredump_free(uc->guc.vma_ctb);
1097+
kfree(uc->guc.hw_state);
10281098

10291099
kfree(uc);
10301100
}
@@ -1721,6 +1791,37 @@ static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
17211791
saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
17221792
}
17231793

1794+
static u32 read_guc_state_reg(struct intel_uncore *uncore, int range, int count)
1795+
{
1796+
GEM_BUG_ON(range >= ARRAY_SIZE(guc_hw_reg_state));
1797+
GEM_BUG_ON(count >= guc_hw_reg_state[range].count);
1798+
1799+
return intel_uncore_read(uncore,
1800+
_MMIO(guc_hw_reg_state[range].start + count * sizeof(u32)));
1801+
}
1802+
1803+
static void gt_record_guc_hw_state(struct intel_uncore *uncore,
1804+
struct intel_uc_coredump *error_uc)
1805+
{
1806+
u32 *hw_state;
1807+
u32 count = 0;
1808+
int i, j;
1809+
1810+
for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++)
1811+
count += guc_hw_reg_state[i].count;
1812+
1813+
hw_state = kcalloc(count, sizeof(u32), ALLOW_FAIL);
1814+
if (!hw_state)
1815+
return;
1816+
1817+
count = 0;
1818+
for (i = 0; i < ARRAY_SIZE(guc_hw_reg_state); i++)
1819+
for (j = 0; j < guc_hw_reg_state[i].count; j++)
1820+
hw_state[count++] = read_guc_state_reg(uncore, i, j);
1821+
1822+
error_uc->guc.hw_state = hw_state;
1823+
}
1824+
17241825
static struct intel_uc_coredump *
17251826
gt_record_uc(struct intel_gt_coredump *gt,
17261827
struct i915_vma_compress *compress)
@@ -1755,6 +1856,7 @@ gt_record_uc(struct intel_gt_coredump *gt,
17551856
uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
17561857
gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
17571858
uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
1859+
gt_record_guc_hw_state(gt->_gt->uncore, error_uc);
17581860

17591861
return error_uc;
17601862
}

drivers/gpu/drm/i915/i915_gpu_error.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ struct intel_gt_coredump {
177177
struct intel_ctb_coredump ctb[2];
178178
struct i915_vma_coredump *vma_ctb;
179179
struct i915_vma_coredump *vma_log;
180+
u32 *hw_state;
180181
u32 timestamp;
181182
u16 last_fence;
182183
bool is_guc_capture;

0 commit comments

Comments
 (0)