Skip to content

Commit f6eeea8

Browse files
drm/i915/guc: Dump error capture to dmesg on CTB error
In the past, there have been sporadic CTB failures which proved hard to reproduce manually. The most effective solution was to dump the GuC log at the point of failure and let the CI system do the repro. It is preferable not to dump the GuC log via dmesg for all issues as it is not always necessary and is not helpful for end users. But rather than trying to re-invent the code to do this each time it is wanted, commit the code but for DEBUG_GUC builds only. v2: Use IS_ENABLED for testing config options. Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20230418181744.3251240-3-John.C.Harrison@Intel.com
1 parent 6197cff commit f6eeea8

2 files changed

Lines changed: 59 additions & 0 deletions

File tree

drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,30 @@
1313
#include "intel_guc_ct.h"
1414
#include "intel_guc_print.h"
1515

16+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
17+
/*
 * Reasons for which the CTB can be declared dead. Each enumerator is a bit
 * position rather than a mask: CT_DEAD() records a reason by OR-ing
 * (1 << CT_DEAD_<reason>) into ct->dead_ct_reason, so several reasons can
 * accumulate in one report. CT_DEAD_ALIVE (0) is the value written to
 * dead_ct_reason when the CT is enabled.
 */
enum {
18+
CT_DEAD_ALIVE = 0,
19+
CT_DEAD_SETUP,
20+
CT_DEAD_WRITE,
21+
CT_DEAD_DEADLOCK,
22+
CT_DEAD_H2G_HAS_ROOM,
23+
CT_DEAD_READ,
24+
CT_DEAD_PROCESS_FAILED,
25+
};
26+
27+
static void ct_dead_ct_worker_func(struct work_struct *w);
28+
29+
/*
 * CT_DEAD(ct, reason) - flag the CTB as dead and schedule a diagnostic dump.
 * @ct: pointer to the struct intel_guc_ct that failed
 * @reason: suffix of a CT_DEAD_* enumerator (e.g. WRITE, READ); it is token
 *          pasted onto CT_DEAD_ and used as a bit position
 *
 * If a dead CTB has not already been reported, the reason bit is OR-ed into
 * ct->dead_ct_reason and ct->dead_ct_worker is queued on system_unbound_wq.
 * Once dead_ct_reported is set, further invocations do nothing.
 */
#define CT_DEAD(ct, reason) \
30+
do { \
31+
if (!(ct)->dead_ct_reported) { \
32+
(ct)->dead_ct_reason |= 1 << CT_DEAD_##reason; \
33+
queue_work(system_unbound_wq, &(ct)->dead_ct_worker); \
34+
} \
35+
} while (0)
36+
#else
37+
#define CT_DEAD(ct, reason) do { } while (0)
38+
#endif
39+
1640
static inline struct intel_guc *ct_to_guc(struct intel_guc_ct *ct)
1741
{
1842
return container_of(ct, struct intel_guc, ct);
@@ -93,6 +117,9 @@ void intel_guc_ct_init_early(struct intel_guc_ct *ct)
93117
spin_lock_init(&ct->requests.lock);
94118
INIT_LIST_HEAD(&ct->requests.pending);
95119
INIT_LIST_HEAD(&ct->requests.incoming);
120+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
121+
INIT_WORK(&ct->dead_ct_worker, ct_dead_ct_worker_func);
122+
#endif
96123
INIT_WORK(&ct->requests.worker, ct_incoming_request_worker_func);
97124
tasklet_setup(&ct->receive_tasklet, ct_receive_tasklet_func);
98125
init_waitqueue_head(&ct->wq);
@@ -319,11 +346,16 @@ int intel_guc_ct_enable(struct intel_guc_ct *ct)
319346

320347
ct->enabled = true;
321348
ct->stall_time = KTIME_MAX;
349+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
350+
ct->dead_ct_reported = false;
351+
ct->dead_ct_reason = CT_DEAD_ALIVE;
352+
#endif
322353

323354
return 0;
324355

325356
err_out:
326357
CT_PROBE_ERROR(ct, "Failed to enable CTB (%pe)\n", ERR_PTR(err));
358+
CT_DEAD(ct, SETUP);
327359
return err;
328360
}
329361

@@ -434,6 +466,7 @@ static int ct_write(struct intel_guc_ct *ct,
434466
corrupted:
435467
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
436468
desc->head, desc->tail, desc->status);
469+
CT_DEAD(ct, WRITE);
437470
ctb->broken = true;
438471
return -EPIPE;
439472
}
@@ -504,6 +537,7 @@ static inline bool ct_deadlocked(struct intel_guc_ct *ct)
504537
CT_ERROR(ct, "Head: %u\n (Dwords)", ct->ctbs.recv.desc->head);
505538
CT_ERROR(ct, "Tail: %u\n (Dwords)", ct->ctbs.recv.desc->tail);
506539

540+
CT_DEAD(ct, DEADLOCK);
507541
ct->ctbs.send.broken = true;
508542
}
509543

@@ -552,6 +586,7 @@ static inline bool h2g_has_room(struct intel_guc_ct *ct, u32 len_dw)
552586
head, ctb->size);
553587
desc->status |= GUC_CTB_STATUS_OVERFLOW;
554588
ctb->broken = true;
589+
CT_DEAD(ct, H2G_HAS_ROOM);
555590
return false;
556591
}
557592

@@ -914,6 +949,7 @@ static int ct_read(struct intel_guc_ct *ct, struct ct_incoming_msg **msg)
914949
CT_ERROR(ct, "Corrupted descriptor head=%u tail=%u status=%#x\n",
915950
desc->head, desc->tail, desc->status);
916951
ctb->broken = true;
952+
CT_DEAD(ct, READ);
917953
return -EPIPE;
918954
}
919955

@@ -1063,6 +1099,7 @@ static bool ct_process_incoming_requests(struct intel_guc_ct *ct)
10631099
if (unlikely(err)) {
10641100
CT_ERROR(ct, "Failed to process CT message (%pe) %*ph\n",
10651101
ERR_PTR(err), 4 * request->size, request->msg);
1102+
CT_DEAD(ct, PROCESS_FAILED);
10661103
ct_free_msg(request);
10671104
}
10681105

@@ -1239,3 +1276,19 @@ void intel_guc_ct_print_info(struct intel_guc_ct *ct,
12391276
drm_printf(p, "Tail: %u\n",
12401277
ct->ctbs.recv.desc->tail);
12411278
}
1279+
1280+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
1281+
/*
 * ct_dead_ct_worker_func - work handler that reports a dead CTB.
 * @w: the dead_ct_worker work item embedded in a struct intel_guc_ct
 *
 * Queued by CT_DEAD(). Recovers the owning CT from the work item and returns
 * early if a report has already been made. Otherwise it sets dead_ct_reported
 * before printing, logs the accumulated dead_ct_reason bitmask via guc_info(),
 * and calls intel_klog_error_capture() with an all-ones engine mask.
 * dead_ct_reported is reset to false in intel_guc_ct_enable().
 */
static void ct_dead_ct_worker_func(struct work_struct *w)
1282+
{
1283+
struct intel_guc_ct *ct = container_of(w, struct intel_guc_ct, dead_ct_worker);
1284+
struct intel_guc *guc = ct_to_guc(ct);
1285+
1286+
if (ct->dead_ct_reported)
1287+
return;
1288+
1289+
ct->dead_ct_reported = true;
1290+
1291+
guc_info(guc, "CTB is dead - reason=0x%X\n", ct->dead_ct_reason);
1292+
intel_klog_error_capture(guc_to_gt(guc), (intel_engine_mask_t)~0U);
1293+
}
1294+
#endif

drivers/gpu/drm/i915/gt/uc/intel_guc_ct.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,12 @@ struct intel_guc_ct {
8585

8686
/** @stall_time: time of first time a CTB submission is stalled */
8787
ktime_t stall_time;
88+
89+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GUC)
90+
int dead_ct_reason;
91+
bool dead_ct_reported;
92+
struct work_struct dead_ct_worker;
93+
#endif
8894
};
8995

9096
void intel_guc_ct_init_early(struct intel_guc_ct *ct);

0 commit comments

Comments
 (0)