Skip to content

Commit b2edc41

Browse files
drm/i915/guc: Force a reset on internal GuC error
If GuC hits an internal error (and survives long enough to report it to the KMD), it is basically toast and will stop until a GT reset and subsequent GuC reload is performed. Previously, the KMD just printed an error message and then waited for the heartbeat to eventually kick in and trigger a reset (assuming the heartbeat had not been disabled). Instead, force the reset immediately to guarantee that it happens and to eliminate the very long heartbeat delay. The captured error state is also more likely to be useful if captured at the time of the error rather than many seconds later. Note that it is not possible to trigger a reset from within the G2H handler itself. The reset prepare process involves flushing outstanding G2H contents, so a deadlock could result. Instead, the G2H handler queues a worker thread to do the reset asynchronously. v2: Flush the worker on suspend and shutdown. Add rate limiting to prevent spam from a totally dead system (review feedback from Daniele). Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20230816003957.3572654-1-John.C.Harrison@Intel.com
1 parent 14128d6 commit b2edc41

3 files changed

Lines changed: 54 additions & 5 deletions

File tree

drivers/gpu/drm/i915/gt/uc/intel_guc.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,21 @@ static void gen11_disable_guc_interrupts(struct intel_guc *guc)
159159
gen11_reset_guc_interrupts(guc);
160160
}
161161

162+
/*
 * guc_dead_worker_func - work item body run after a GuC crash/exception
 * notification has been received.
 *
 * @w: the work_struct embedded in struct intel_guc as dead_guc_worker.
 *
 * Computes the time in milliseconds since the value stored in
 * guc->last_dead_guc_jiffies. If that is under 500ms, the GT is passed to
 * intel_gt_set_wedged() instead of being reset again; otherwise
 * intel_gt_handle_error() is called for ALL_ENGINES with I915_ERROR_CAPTURE
 * and the timestamp is refreshed.
 */
static void guc_dead_worker_func(struct work_struct *w)
163+
{
164+
struct intel_guc *guc = container_of(w, struct intel_guc, dead_guc_worker);
165+
struct intel_gt *gt = guc_to_gt(guc);
166+
unsigned long last = guc->last_dead_guc_jiffies;
167+
unsigned long delta = jiffies_to_msecs(jiffies - last);
168+
169+
/* Rate limit: a repeat notification within 500ms does not trigger another reset. */
if (delta < 500) {
170+
intel_gt_set_wedged(gt);
171+
} else {
172+
intel_gt_handle_error(gt, ALL_ENGINES, I915_ERROR_CAPTURE, "dead GuC");
173+
/* Timestamp is only updated on this path, not when the GT is wedged. */
guc->last_dead_guc_jiffies = jiffies;
174+
}
175+
}
176+
162177
void intel_guc_init_early(struct intel_guc *guc)
163178
{
164179
struct intel_gt *gt = guc_to_gt(guc);
@@ -171,6 +186,8 @@ void intel_guc_init_early(struct intel_guc *guc)
171186
intel_guc_slpc_init_early(&guc->slpc);
172187
intel_guc_rc_init_early(guc);
173188

189+
INIT_WORK(&guc->dead_guc_worker, guc_dead_worker_func);
190+
174191
mutex_init(&guc->send_mutex);
175192
spin_lock_init(&guc->irq_lock);
176193
if (GRAPHICS_VER(i915) >= 11) {
@@ -449,6 +466,8 @@ void intel_guc_fini(struct intel_guc *guc)
449466
if (!intel_uc_fw_is_loadable(&guc->fw))
450467
return;
451468

469+
flush_work(&guc->dead_guc_worker);
470+
452471
if (intel_guc_slpc_is_used(guc))
453472
intel_guc_slpc_fini(&guc->slpc);
454473

@@ -573,6 +592,20 @@ int intel_guc_send_mmio(struct intel_guc *guc, const u32 *request, u32 len,
573592
return ret;
574593
}
575594

595+
/*
 * intel_guc_crash_process_msg - handle a GuC crash dump / exception message.
 *
 * @guc: the GuC instance that reported the problem.
 * @action: the notification action code; it only selects which error
 *          message is logged.
 *
 * Logs an error describing the notification and then queues
 * guc->dead_guc_worker on system_unbound_wq. No reset is issued from this
 * function itself; the follow-up work is deferred to that worker.
 *
 * Return: always 0.
 */
int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action)
596+
{
597+
if (action == INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
598+
guc_err(guc, "Crash dump notification\n");
599+
else if (action == INTEL_GUC_ACTION_NOTIFY_EXCEPTION)
600+
guc_err(guc, "Exception notification\n");
601+
else
602+
guc_err(guc, "Unknown crash notification: 0x%04X\n", action);
603+
604+
/* Queued for every action value, including unrecognised ones. */
queue_work(system_unbound_wq, &guc->dead_guc_worker);
605+
606+
return 0;
607+
}
608+
576609
int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
577610
const u32 *payload, u32 len)
578611
{
@@ -589,6 +622,9 @@ int intel_guc_to_host_process_recv_msg(struct intel_guc *guc,
589622
if (msg & INTEL_GUC_RECV_MSG_EXCEPTION)
590623
guc_err(guc, "Received early exception notification!\n");
591624

625+
if (msg & (INTEL_GUC_RECV_MSG_CRASH_DUMP_POSTED | INTEL_GUC_RECV_MSG_EXCEPTION))
626+
queue_work(system_unbound_wq, &guc->dead_guc_worker);
627+
592628
return 0;
593629
}
594630

@@ -628,6 +664,8 @@ int intel_guc_suspend(struct intel_guc *guc)
628664
return 0;
629665

630666
if (intel_guc_submission_is_used(guc)) {
667+
flush_work(&guc->dead_guc_worker);
668+
631669
/*
632670
* This H2G MMIO command tears down the GuC in two steps. First it will
633671
* generate a G2H CTB for every active context indicating a reset. In

drivers/gpu/drm/i915/gt/uc/intel_guc.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,20 @@ struct intel_guc {
266266
unsigned long last_stat_jiffies;
267267
} timestamp;
268268

269+
/**
270+
* @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
271+
* Specifically used when the G2H handler wants to issue a reset. Resets
272+
* require flushing the G2H queue. So, the G2H processing itself must not
273+
* trigger a reset directly. Instead, go via this worker.
274+
*/
275+
struct work_struct dead_guc_worker;
276+
/**
277+
* @last_dead_guc_jiffies: timestamp of previous 'dead guc' occurrence
278+
* used to prevent a fundamentally broken system from continuously
279+
* reloading the GuC.
280+
*/
281+
unsigned long last_dead_guc_jiffies;
282+
269283
#ifdef CONFIG_DRM_I915_SELFTEST
270284
/**
271285
* @number_guc_id_stolen: The number of guc_ids that have been stolen
@@ -476,6 +490,7 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
476490
const u32 *msg, u32 len);
477491
int intel_guc_error_capture_process_msg(struct intel_guc *guc,
478492
const u32 *msg, u32 len);
493+
int intel_guc_crash_process_msg(struct intel_guc *guc, u32 action);
479494

480495
struct intel_engine_cs *
481496
intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);

drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,12 +1112,8 @@ static int ct_process_request(struct intel_guc_ct *ct, struct ct_incoming_msg *r
11121112
ret = 0;
11131113
break;
11141114
case INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
1115-
CT_ERROR(ct, "Received GuC crash dump notification!\n");
1116-
ret = 0;
1117-
break;
11181115
case INTEL_GUC_ACTION_NOTIFY_EXCEPTION:
1119-
CT_ERROR(ct, "Received GuC exception notification!\n");
1120-
ret = 0;
1116+
ret = intel_guc_crash_process_msg(guc, action);
11211117
break;
11221118
default:
11231119
ret = -EOPNOTSUPP;

0 commit comments

Comments
 (0)