Skip to content

Commit f262015

Browse files
lukaszlaguna authored and Thomas Hellström committed
drm/xe: Update wedged.mode only after successful reset policy change
Previously, the driver's internal wedged.mode state was updated without verifying whether the corresponding engine reset policy update in GuC succeeded. This could leave the driver reporting a wedged.mode state that doesn't match the actual reset behavior programmed in GuC.

With this change, the reset policy is updated first, and the driver's wedged.mode state is modified only if the policy update succeeds on all available GTs.

This patch also introduces two functional improvements:

- The policy is sent to GuC only when a change is required. An update is needed only when entering or leaving XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET, because that is the only transition that changes the reset policy. For example, switching between XE_WEDGED_MODE_UPON_CRITICAL_ERROR and XE_WEDGED_MODE_NEVER doesn't affect the reset policy, so there is no need to send the same value to GuC.

- An inconsistent_reset flag is added to track cases where the reset policy update succeeds only on a subset of GTs. If such an inconsistency is detected, any future wedged mode configuration will force a retry of the reset policy update to restore a consistent state across all GTs.

Fixes: 6b8ef44 ("drm/xe: Introduce the wedged_mode debugfs")
Signed-off-by: Lukasz Laguna <lukasz.laguna@intel.com>
Link: https://patch.msgid.link/20260107174741.29163-3-lukasz.laguna@intel.com
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
(cherry picked from commit 0f13dea)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
1 parent 772157f commit f262015

4 files changed

Lines changed: 87 additions & 22 deletions

File tree

drivers/gpu/drm/xe/xe_debugfs.c

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -256,14 +256,64 @@ static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
256256
return simple_read_from_buffer(ubuf, size, pos, buf, len);
257257
}
258258

259+
static int __wedged_mode_set_reset_policy(struct xe_gt *gt, enum xe_wedged_mode mode)
260+
{
261+
bool enable_engine_reset;
262+
int ret;
263+
264+
enable_engine_reset = (mode != XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET);
265+
ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads,
266+
enable_engine_reset);
267+
if (ret)
268+
xe_gt_err(gt, "Failed to update GuC ADS scheduler policy (%pe)\n", ERR_PTR(ret));
269+
270+
return ret;
271+
}
272+
273+
static int wedged_mode_set_reset_policy(struct xe_device *xe, enum xe_wedged_mode mode)
274+
{
275+
struct xe_gt *gt;
276+
int ret;
277+
u8 id;
278+
279+
guard(xe_pm_runtime)(xe);
280+
for_each_gt(gt, xe, id) {
281+
ret = __wedged_mode_set_reset_policy(gt, mode);
282+
if (ret) {
283+
if (id > 0) {
284+
xe->wedged.inconsistent_reset = true;
285+
drm_err(&xe->drm, "Inconsistent reset policy state between GTs\n");
286+
}
287+
return ret;
288+
}
289+
}
290+
291+
xe->wedged.inconsistent_reset = false;
292+
293+
return 0;
294+
}
295+
296+
static bool wedged_mode_needs_policy_update(struct xe_device *xe, enum xe_wedged_mode mode)
297+
{
298+
if (xe->wedged.inconsistent_reset)
299+
return true;
300+
301+
if (xe->wedged.mode == mode)
302+
return false;
303+
304+
if (xe->wedged.mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET ||
305+
mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET)
306+
return true;
307+
308+
return false;
309+
}
310+
259311
static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
260312
size_t size, loff_t *pos)
261313
{
262314
struct xe_device *xe = file_inode(f)->i_private;
263-
struct xe_gt *gt;
264315
u32 wedged_mode;
265316
ssize_t ret;
266-
u8 id;
267317

268318
ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
269319
if (ret)
@@ -272,22 +322,14 @@ static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
272322
if (wedged_mode > 2)
273323
return -EINVAL;
274324

275-
if (xe->wedged.mode == wedged_mode)
276-
return size;
325+
if (wedged_mode_needs_policy_update(xe, wedged_mode)) {
326+
ret = wedged_mode_set_reset_policy(xe, wedged_mode);
327+
if (ret)
328+
return ret;
329+
}
277330

278331
xe->wedged.mode = wedged_mode;
279332

280-
xe_pm_runtime_get(xe);
281-
for_each_gt(gt, xe, id) {
282-
ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
283-
if (ret) {
284-
xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
285-
xe_pm_runtime_put(xe);
286-
return -EIO;
287-
}
288-
}
289-
xe_pm_runtime_put(xe);
290-
291333
return size;
292334
}
293335

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,22 @@ struct xe_pat_ops;
4444
struct xe_pxp;
4545
struct xe_vram_region;
4646

47+
/**
 * enum xe_wedged_mode - possible wedged modes
 * @XE_WEDGED_MODE_NEVER: The device is never declared wedged.
 * @XE_WEDGED_MODE_UPON_CRITICAL_ERROR: The device is declared wedged only
 *	after a critical error, such as a GT reset failure or a firmware
 *	failure. This is the default mode.
 * @XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET: The device is declared wedged on
 *	any hang. Engine resets are disabled in this mode so that no automatic
 *	recovery is attempted; it is mainly meant for debugging hangs.
 */
enum xe_wedged_mode {
	XE_WEDGED_MODE_NEVER			= 0,
	XE_WEDGED_MODE_UPON_CRITICAL_ERROR	= 1,
	XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET	= 2,
};
62+
4763
#define XE_BO_INVALID_OFFSET LONG_MAX
4864

4965
#define GRAPHICS_VER(xe) ((xe)->info.graphics_verx100 / 100)
@@ -587,6 +603,8 @@ struct xe_device {
587603
int mode;
588604
/** @wedged.method: Recovery method to be sent in the drm device wedged uevent */
589605
unsigned long method;
606+
/** @wedged.inconsistent_reset: Inconsistent reset policy state between GTs */
607+
bool inconsistent_reset;
590608
} wedged;
591609

592610
/** @bo_device: Struct to control async free of BOs */

drivers/gpu/drm/xe/xe_guc_ads.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -983,16 +983,17 @@ static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_off
983983
/**
984984
* xe_guc_ads_scheduler_policy_toggle_reset - Toggle reset policy
985985
* @ads: Additional data structures object
986+
* @enable_engine_reset: true to enable engine resets, false otherwise
986987
*
987-
* This function update the GuC's engine reset policy based on wedged.mode.
988+
 * This function updates the GuC's engine reset policy.
988989
*
989990
* Return: 0 on success, and negative error code otherwise.
990991
*/
991-
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
992+
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads,
993+
bool enable_engine_reset)
992994
{
993995
struct guc_policies *policies;
994996
struct xe_guc *guc = ads_to_guc(ads);
995-
struct xe_device *xe = ads_to_xe(ads);
996997
CLASS(xe_guc_buf, buf)(&guc->buf, sizeof(*policies));
997998

998999
if (!xe_guc_buf_is_valid(buf))
@@ -1004,10 +1005,11 @@ int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads)
10041005
policies->dpc_promote_time = ads_blob_read(ads, policies.dpc_promote_time);
10051006
policies->max_num_work_items = ads_blob_read(ads, policies.max_num_work_items);
10061007
policies->is_valid = 1;
1007-
if (xe->wedged.mode == 2)
1008-
policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
1009-
else
1008+
1009+
if (enable_engine_reset)
10101010
policies->global_flags &= ~GLOBAL_POLICY_DISABLE_ENGINE_RESET;
1011+
else
1012+
policies->global_flags |= GLOBAL_POLICY_DISABLE_ENGINE_RESET;
10111013

10121014
return guc_ads_action_update_policies(ads, xe_guc_buf_flush(buf));
10131015
}

drivers/gpu/drm/xe/xe_guc_ads.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66
#ifndef _XE_GUC_ADS_H_
77
#define _XE_GUC_ADS_H_
88

9+
#include <linux/types.h>
10+
911
struct xe_guc_ads;
1012

1113
int xe_guc_ads_init(struct xe_guc_ads *ads);
1214
int xe_guc_ads_init_post_hwconfig(struct xe_guc_ads *ads);
1315
void xe_guc_ads_populate(struct xe_guc_ads *ads);
1416
void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads);
1517
void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads);
16-
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads);
18+
int xe_guc_ads_scheduler_policy_toggle_reset(struct xe_guc_ads *ads,
19+
bool enable_engine_reset);
1720

1821
#endif

0 commit comments

Comments
 (0)