Skip to content

Commit 3b09b11

Browse files
drm/xe/guc: Return an error code if the GuC load fails
Due to multiple explosion issues in the early days of the Xe driver, the GuC load was hacked to never return a failure. That prevented kernel panics and such initially, but now all it achieves is creating more confusing errors when the driver tries to submit commands to a GuC it already knows is not there. So fix that up.

As a stop-gap, and to help with debugging load failures due to invalid GuC init params, a wedge call had been added to the inner GuC load function. The reason was that it leaves the GuC log accessible via debugfs. However, for an end user, simply aborting the module load is much cleaner than wedging and trying to continue. The wedge blocks user submissions, but it seems that various bits of the driver itself still try to submit to a dead GuC and lots of subsequent errors occur. And with regard to developers debugging why their particular code change is being rejected by the GuC, it is trivial either to add the wedge back in and hack the return code to zero again, or to just do a GuC log dump to dmesg.

v2: Add support for error injection testing and drop the now redundant wedge call.

CC: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Matt Atwood <matthew.s.atwood@intel.com>
Link: https://lore.kernel.org/r/20250909224132.536320-1-John.C.Harrison@Intel.com
1 parent 1a86916 commit 3b09b11

1 file changed

Lines changed: 9 additions & 4 deletions

File tree

drivers/gpu/drm/xe/xe_guc.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,7 @@ static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
10551055
#endif
10561056
#define GUC_LOAD_TIME_WARN_MS 200
10571057

1058-
static void guc_wait_ucode(struct xe_guc *guc)
1058+
static int guc_wait_ucode(struct xe_guc *guc)
10591059
{
10601060
struct xe_gt *gt = guc_to_gt(guc);
10611061
struct xe_mmio *mmio = &gt->mmio;
@@ -1162,7 +1162,7 @@ static void guc_wait_ucode(struct xe_guc *guc)
11621162
break;
11631163
}
11641164

1165-
xe_device_declare_wedged(gt_to_xe(gt));
1165+
return -EPROTO;
11661166
} else if (delta_ms > GUC_LOAD_TIME_WARN_MS) {
11671167
xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, timeouts = %d]\n",
11681168
delta_ms, status, count);
@@ -1174,7 +1174,10 @@ static void guc_wait_ucode(struct xe_guc *guc)
11741174
delta_ms, xe_guc_pc_get_act_freq(guc_pc), guc_pc_get_cur_freq(guc_pc),
11751175
before_freq, status, count);
11761176
}
1177+
1178+
return 0;
11771179
}
1180+
ALLOW_ERROR_INJECTION(guc_wait_ucode, ERRNO);
11781181

11791182
static int __xe_guc_upload(struct xe_guc *guc)
11801183
{
@@ -1206,14 +1209,16 @@ static int __xe_guc_upload(struct xe_guc *guc)
12061209
goto out;
12071210

12081211
/* Wait for authentication */
1209-
guc_wait_ucode(guc);
1212+
ret = guc_wait_ucode(guc);
1213+
if (ret)
1214+
goto out;
12101215

12111216
xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_RUNNING);
12121217
return 0;
12131218

12141219
out:
12151220
xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
1216-
return 0 /* FIXME: ret, don't want to stop load currently */;
1221+
return ret;
12171222
}
12181223

12191224
static int vf_guc_min_load_for_hwconfig(struct xe_guc *guc)

0 commit comments

Comments
 (0)