Skip to content

Commit e63b922

Browse files
committed
Merge tag 'drm-xe-fixes-2026-01-22' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
UAPI Changes:
- Disallow bind-queue sharing across multiple VMs (Matt Auld)

Core Changes:
- Fix xe userptr in the absence of CONFIG_DEVICE_PRIVATE (Thomas)

Driver Changes:
- Fix a missed page count update (Matt Brost)
- Fix a confused argument to alloc_workqueue() (Marco Crivellari)
- Kernel-doc fixes (Jani)
- Disable a workaround on VFs (Matt Brost)
- Fix a job lock assert (Matt Auld)
- Update wedged.mode only after successful reset policy change (Lukasz)
- Select CONFIG_DEVICE_PRIVATE when DRM_XE_GPUSVM is selected (Thomas)

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Thomas Hellstrom <thomas.hellstrom@linux.intel.com>
Link: https://patch.msgid.link/aXIdiXaY-RxoaviV@fedora
2 parents 353f91b + e27ada4 commit e63b922

20 files changed

Lines changed: 175 additions & 40 deletions

drivers/gpu/drm/Kconfig

Lines changed: 1 addition & 1 deletion
@@ -210,7 +210,7 @@ config DRM_GPUVM
 
 config DRM_GPUSVM
 	tristate
-	depends on DRM && DEVICE_PRIVATE
+	depends on DRM
 	select HMM_MIRROR
 	select MMU_NOTIFIER
 	help

drivers/gpu/drm/Makefile

Lines changed: 3 additions & 1 deletion
@@ -108,8 +108,10 @@ obj-$(CONFIG_DRM_EXEC) += drm_exec.o
 obj-$(CONFIG_DRM_GPUVM) += drm_gpuvm.o
 
 drm_gpusvm_helper-y := \
-	drm_gpusvm.o\
+	drm_gpusvm.o
+drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \
 	drm_pagemap.o
+
 obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o
 
 obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o

drivers/gpu/drm/xe/Kconfig

Lines changed: 3 additions & 2 deletions
@@ -39,7 +39,7 @@ config DRM_XE
 	select DRM_TTM
 	select DRM_TTM_HELPER
 	select DRM_EXEC
-	select DRM_GPUSVM if !UML && DEVICE_PRIVATE
+	select DRM_GPUSVM if !UML
 	select DRM_GPUVM
 	select DRM_SCHED
 	select MMU_NOTIFIER
@@ -80,8 +80,9 @@ config DRM_XE_GPUSVM
 	bool "Enable CPU to GPU address mirroring"
 	depends on DRM_XE
 	depends on !UML
-	depends on DEVICE_PRIVATE
+	depends on ZONE_DEVICE
 	default y
+	select DEVICE_PRIVATE
 	select DRM_GPUSVM
 	help
 	  Enable this option if you want support for CPU to GPU address

drivers/gpu/drm/xe/xe_bo.c

Lines changed: 7 additions & 2 deletions
@@ -1055,6 +1055,7 @@ static long xe_bo_shrink_purge(struct ttm_operation_ctx *ctx,
 			       unsigned long *scanned)
 {
 	struct xe_device *xe = ttm_to_xe_device(bo->bdev);
+	struct ttm_tt *tt = bo->ttm;
 	long lret;
 
 	/* Fake move to system, without copying data. */
@@ -1079,8 +1080,10 @@ static long xe_bo_shrink_purge(struct ttm_operation_ctx *ctx,
 			 .writeback = false,
 			 .allow_move = false});
 
-	if (lret > 0)
+	if (lret > 0) {
 		xe_ttm_tt_account_subtract(xe, bo->ttm);
+		update_global_total_pages(bo->bdev, -(long)tt->num_pages);
+	}
 
 	return lret;
 }
@@ -1166,8 +1169,10 @@ long xe_bo_shrink(struct ttm_operation_ctx *ctx, struct ttm_buffer_object *bo,
 	if (needs_rpm)
 		xe_pm_runtime_put(xe);
 
-	if (lret > 0)
+	if (lret > 0) {
 		xe_ttm_tt_account_subtract(xe, tt);
+		update_global_total_pages(bo->bdev, -(long)tt->num_pages);
+	}
 
 out_unref:
 	xe_bo_put(xe_bo);

drivers/gpu/drm/xe/xe_debugfs.c

Lines changed: 57 additions & 15 deletions
@@ -256,14 +256,64 @@ static ssize_t wedged_mode_show(struct file *f, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, size, pos, buf, len);
 }
 
+static int __wedged_mode_set_reset_policy(struct xe_gt *gt, enum xe_wedged_mode mode)
+{
+	bool enable_engine_reset;
+	int ret;
+
+	enable_engine_reset = (mode != XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET);
+	ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads,
+						       enable_engine_reset);
+	if (ret)
+		xe_gt_err(gt, "Failed to update GuC ADS scheduler policy (%pe)\n", ERR_PTR(ret));
+
+	return ret;
+}
+
+static int wedged_mode_set_reset_policy(struct xe_device *xe, enum xe_wedged_mode mode)
+{
+	struct xe_gt *gt;
+	int ret;
+	u8 id;
+
+	guard(xe_pm_runtime)(xe);
+	for_each_gt(gt, xe, id) {
+		ret = __wedged_mode_set_reset_policy(gt, mode);
+		if (ret) {
+			if (id > 0) {
+				xe->wedged.inconsistent_reset = true;
+				drm_err(&xe->drm, "Inconsistent reset policy state between GTs\n");
+			}
+			return ret;
+		}
+	}
+
+	xe->wedged.inconsistent_reset = false;
+
+	return 0;
+}
+
+static bool wedged_mode_needs_policy_update(struct xe_device *xe, enum xe_wedged_mode mode)
+{
+	if (xe->wedged.inconsistent_reset)
+		return true;
+
+	if (xe->wedged.mode == mode)
+		return false;
+
+	if (xe->wedged.mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET ||
+	    mode == XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET)
+		return true;
+
+	return false;
+}
+
 static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
 			       size_t size, loff_t *pos)
 {
 	struct xe_device *xe = file_inode(f)->i_private;
-	struct xe_gt *gt;
 	u32 wedged_mode;
 	ssize_t ret;
-	u8 id;
 
 	ret = kstrtouint_from_user(ubuf, size, 0, &wedged_mode);
 	if (ret)
@@ -272,22 +322,14 @@ static ssize_t wedged_mode_set(struct file *f, const char __user *ubuf,
 	if (wedged_mode > 2)
 		return -EINVAL;
 
-	if (xe->wedged.mode == wedged_mode)
-		return size;
+	if (wedged_mode_needs_policy_update(xe, wedged_mode)) {
+		ret = wedged_mode_set_reset_policy(xe, wedged_mode);
+		if (ret)
+			return ret;
+	}
 
 	xe->wedged.mode = wedged_mode;
 
-	xe_pm_runtime_get(xe);
-	for_each_gt(gt, xe, id) {
-		ret = xe_guc_ads_scheduler_policy_toggle_reset(&gt->uc.guc.ads);
-		if (ret) {
-			xe_gt_err(gt, "Failed to update GuC ADS scheduler policy. GuC may still cause engine reset even with wedged_mode=2\n");
-			xe_pm_runtime_put(xe);
-			return -EIO;
-		}
-	}
-	xe_pm_runtime_put(xe);
-
 	return size;
 }
 

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 18 additions & 0 deletions
@@ -44,6 +44,22 @@ struct xe_pat_ops;
 struct xe_pxp;
 struct xe_vram_region;
 
+/**
+ * enum xe_wedged_mode - possible wedged modes
+ * @XE_WEDGED_MODE_NEVER: Device will never be declared wedged.
+ * @XE_WEDGED_MODE_UPON_CRITICAL_ERROR: Device will be declared wedged only
+ * when critical error occurs like GT reset failure or firmware failure.
+ * This is the default mode.
+ * @XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET: Device will be declared wedged on
+ * any hang. In this mode, engine resets are disabled to avoid automatic
+ * recovery attempts. This mode is primarily intended for debugging hangs.
+ */
+enum xe_wedged_mode {
+	XE_WEDGED_MODE_NEVER = 0,
+	XE_WEDGED_MODE_UPON_CRITICAL_ERROR = 1,
+	XE_WEDGED_MODE_UPON_ANY_HANG_NO_RESET = 2,
+};
+
 #define XE_BO_INVALID_OFFSET	LONG_MAX
 
 #define GRAPHICS_VER(xe) ((xe)->info.graphics_verx100 / 100)
@@ -587,6 +603,8 @@ struct xe_device {
 		int mode;
 		/** @wedged.method: Recovery method to be sent in the drm device wedged uevent */
 		unsigned long method;
+		/** @wedged.inconsistent_reset: Inconsistent reset policy state between GTs */
+		bool inconsistent_reset;
 	} wedged;
 
 	/** @bo_device: Struct to control async free of BOs */

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 31 additions & 1 deletion
@@ -328,6 +328,7 @@ struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe
  * @xe: Xe device.
  * @tile: tile which bind exec queue belongs to.
  * @flags: exec queue creation flags
+ * @user_vm: The user VM which this exec queue belongs to
  * @extensions: exec queue creation extensions
  *
  * Normalize bind exec queue creation. Bind exec queue is tied to migration VM
@@ -341,6 +342,7 @@ struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe
  */
 struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
 						struct xe_tile *tile,
+						struct xe_vm *user_vm,
 						u32 flags, u64 extensions)
 {
 	struct xe_gt *gt = tile->primary_gt;
@@ -377,6 +379,9 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
 			xe_exec_queue_put(q);
 			return ERR_PTR(err);
 		}
+
+		if (user_vm)
+			q->user_vm = xe_vm_get(user_vm);
 	}
 
 	return q;
@@ -407,6 +412,11 @@ void xe_exec_queue_destroy(struct kref *ref)
 		xe_exec_queue_put(eq);
 	}
 
+	if (q->user_vm) {
+		xe_vm_put(q->user_vm);
+		q->user_vm = NULL;
+	}
+
 	q->ops->destroy(q);
 }
 
@@ -742,16 +752,34 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 		    XE_IOCTL_DBG(xe, eci[0].engine_instance != 0))
 			return -EINVAL;
 
+		vm = xe_vm_lookup(xef, args->vm_id);
+		if (XE_IOCTL_DBG(xe, !vm))
+			return -ENOENT;
+
+		err = down_read_interruptible(&vm->lock);
+		if (err) {
+			xe_vm_put(vm);
+			return err;
+		}
+
+		if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
+			up_read(&vm->lock);
+			xe_vm_put(vm);
+			return -ENOENT;
+		}
+
 		for_each_tile(tile, xe, id) {
 			struct xe_exec_queue *new;
 
 			flags |= EXEC_QUEUE_FLAG_VM;
 			if (id)
 				flags |= EXEC_QUEUE_FLAG_BIND_ENGINE_CHILD;
 
-			new = xe_exec_queue_create_bind(xe, tile, flags,
+			new = xe_exec_queue_create_bind(xe, tile, vm, flags,
 							args->extensions);
 			if (IS_ERR(new)) {
+				up_read(&vm->lock);
+				xe_vm_put(vm);
 				err = PTR_ERR(new);
 				if (q)
 					goto put_exec_queue;
@@ -763,6 +791,8 @@ int xe_exec_queue_create_ioctl(struct drm_device *dev, void *data,
 				list_add_tail(&new->multi_gt_list,
 					      &q->multi_gt_link);
 		}
+		up_read(&vm->lock);
+		xe_vm_put(vm);
 	} else {
 		logical_mask = calc_validate_logical_mask(xe, eci,
 							  args->width,

drivers/gpu/drm/xe/xe_exec_queue.h

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ struct xe_exec_queue *xe_exec_queue_create_class(struct xe_device *xe, struct xe
 					     u32 flags, u64 extensions);
 struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
 						struct xe_tile *tile,
+						struct xe_vm *user_vm,
 						u32 flags, u64 extensions);
 
 void xe_exec_queue_fini(struct xe_exec_queue *q);

drivers/gpu/drm/xe/xe_exec_queue_types.h

Lines changed: 6 additions & 0 deletions
@@ -54,6 +54,12 @@ struct xe_exec_queue {
 	struct kref refcount;
 	/** @vm: VM (address space) for this exec queue */
 	struct xe_vm *vm;
+	/**
+	 * @user_vm: User VM (address space) for this exec queue (bind queues
+	 * only)
+	 */
+	struct xe_vm *user_vm;
+
 	/** @class: class of this exec queue */
 	enum xe_engine_class class;
 	/**

drivers/gpu/drm/xe/xe_ggtt.c

Lines changed: 1 addition & 1 deletion
@@ -322,7 +322,7 @@ int xe_ggtt_init_early(struct xe_ggtt *ggtt)
 	else
 		ggtt->pt_ops = &xelp_pt_ops;
 
-	ggtt->wq = alloc_workqueue("xe-ggtt-wq", 0, WQ_MEM_RECLAIM);
+	ggtt->wq = alloc_workqueue("xe-ggtt-wq", WQ_MEM_RECLAIM, 0);
 	if (!ggtt->wq)
 		return -ENOMEM;
 

0 commit comments

Comments
 (0)