Skip to content

Commit 6266673

Browse files
dceraolo authored and johnharr-intel committed
drm/xe: Fix error handling if PXP fails to start
Since the PXP start comes after __xe_exec_queue_init() has completed, we need to cleanup what was done in that function in case of a PXP start error. __xe_exec_queue_init calls the submission backend init() function, so we need to introduce an opposite for that. Unfortunately, while we already have a fini() function pointer, it performs other operations in addition to cleaning up what was done by the init(). Therefore, for clarity, the existing fini() has been renamed to destroy(), while a new fini() has been added to only clean up what was done by the init(), with the latter being called by the former (via xe_exec_queue_fini). Fixes: 72d4796 ("drm/xe/pxp/uapi: Add userspace and LRC support for PXP-using queues") Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: John Harrison <John.C.Harrison@Intel.com> Cc: Matthew Brost <matthew.brost@intel.com> Reviewed-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Link: https://lore.kernel.org/r/20250909221240.3711023-3-daniele.ceraolospurio@intel.com
1 parent 9e6eb49 commit 6266673

6 files changed

Lines changed: 72 additions & 41 deletions

File tree

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,16 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q)
199199
return err;
200200
}
201201

202+
static void __xe_exec_queue_fini(struct xe_exec_queue *q)
203+
{
204+
int i;
205+
206+
q->ops->fini(q);
207+
208+
for (i = 0; i < q->width; ++i)
209+
xe_lrc_put(q->lrc[i]);
210+
}
211+
202212
struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
203213
u32 logical_mask, u16 width,
204214
struct xe_hw_engine *hwe, u32 flags,
@@ -229,11 +239,13 @@ struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *v
229239
if (xe_exec_queue_uses_pxp(q)) {
230240
err = xe_pxp_exec_queue_add(xe->pxp, q);
231241
if (err)
232-
goto err_post_alloc;
242+
goto err_post_init;
233243
}
234244

235245
return q;
236246

247+
err_post_init:
248+
__xe_exec_queue_fini(q);
237249
err_post_alloc:
238250
__xe_exec_queue_free(q);
239251
return ERR_PTR(err);
@@ -331,13 +343,11 @@ void xe_exec_queue_destroy(struct kref *ref)
331343
xe_exec_queue_put(eq);
332344
}
333345

334-
q->ops->fini(q);
346+
q->ops->destroy(q);
335347
}
336348

337349
void xe_exec_queue_fini(struct xe_exec_queue *q)
338350
{
339-
int i;
340-
341351
/*
342352
* Before releasing our ref to lrc and xef, accumulate our run ticks
343353
* and wakeup any waiters.
@@ -346,9 +356,7 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
346356
if (q->xef && atomic_dec_and_test(&q->xef->exec_queue.pending_removal))
347357
wake_up_var(&q->xef->exec_queue.pending_removal);
348358

349-
for (i = 0; i < q->width; ++i)
350-
xe_lrc_put(q->lrc[i]);
351-
359+
__xe_exec_queue_fini(q);
352360
__xe_exec_queue_free(q);
353361
}
354362

drivers/gpu/drm/xe/xe_exec_queue_types.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,14 @@ struct xe_exec_queue_ops {
181181
int (*init)(struct xe_exec_queue *q);
182182
/** @kill: Kill inflight submissions for backend */
183183
void (*kill)(struct xe_exec_queue *q);
184-
/** @fini: Fini exec queue for submission backend */
184+
/** @fini: Undoes the init() for submission backend */
185185
void (*fini)(struct xe_exec_queue *q);
186+
/**
187+
* @destroy: Destroy exec queue for submission backend. The backend
188+
* function must call xe_exec_queue_fini() (which will in turn call the
189+
* fini() backend function) to ensure the queue is properly cleaned up.
190+
*/
191+
void (*destroy)(struct xe_exec_queue *q);
186192
/** @set_priority: Set priority for exec queue */
187193
int (*set_priority)(struct xe_exec_queue *q,
188194
enum xe_exec_queue_priority priority);

drivers/gpu/drm/xe/xe_execlist.c

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -385,10 +385,20 @@ static int execlist_exec_queue_init(struct xe_exec_queue *q)
385385
return err;
386386
}
387387

388-
static void execlist_exec_queue_fini_async(struct work_struct *w)
388+
static void execlist_exec_queue_fini(struct xe_exec_queue *q)
389+
{
390+
struct xe_execlist_exec_queue *exl = q->execlist;
391+
392+
drm_sched_entity_fini(&exl->entity);
393+
drm_sched_fini(&exl->sched);
394+
395+
kfree(exl);
396+
}
397+
398+
static void execlist_exec_queue_destroy_async(struct work_struct *w)
389399
{
390400
struct xe_execlist_exec_queue *ee =
391-
container_of(w, struct xe_execlist_exec_queue, fini_async);
401+
container_of(w, struct xe_execlist_exec_queue, destroy_async);
392402
struct xe_exec_queue *q = ee->q;
393403
struct xe_execlist_exec_queue *exl = q->execlist;
394404
struct xe_device *xe = gt_to_xe(q->gt);
@@ -401,10 +411,6 @@ static void execlist_exec_queue_fini_async(struct work_struct *w)
401411
list_del(&exl->active_link);
402412
spin_unlock_irqrestore(&exl->port->lock, flags);
403413

404-
drm_sched_entity_fini(&exl->entity);
405-
drm_sched_fini(&exl->sched);
406-
kfree(exl);
407-
408414
xe_exec_queue_fini(q);
409415
}
410416

@@ -413,10 +419,10 @@ static void execlist_exec_queue_kill(struct xe_exec_queue *q)
413419
/* NIY */
414420
}
415421

416-
static void execlist_exec_queue_fini(struct xe_exec_queue *q)
422+
static void execlist_exec_queue_destroy(struct xe_exec_queue *q)
417423
{
418-
INIT_WORK(&q->execlist->fini_async, execlist_exec_queue_fini_async);
419-
queue_work(system_unbound_wq, &q->execlist->fini_async);
424+
INIT_WORK(&q->execlist->destroy_async, execlist_exec_queue_destroy_async);
425+
queue_work(system_unbound_wq, &q->execlist->destroy_async);
420426
}
421427

422428
static int execlist_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -467,6 +473,7 @@ static const struct xe_exec_queue_ops execlist_exec_queue_ops = {
467473
.init = execlist_exec_queue_init,
468474
.kill = execlist_exec_queue_kill,
469475
.fini = execlist_exec_queue_fini,
476+
.destroy = execlist_exec_queue_destroy,
470477
.set_priority = execlist_exec_queue_set_priority,
471478
.set_timeslice = execlist_exec_queue_set_timeslice,
472479
.set_preempt_timeout = execlist_exec_queue_set_preempt_timeout,

drivers/gpu/drm/xe/xe_execlist_types.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ struct xe_execlist_exec_queue {
4242

4343
bool has_run;
4444

45-
struct work_struct fini_async;
45+
struct work_struct destroy_async;
4646

4747
enum xe_exec_queue_priority active_priority;
4848
struct list_head active_link;

drivers/gpu/drm/xe/xe_guc_exec_queue_types.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ struct xe_guc_exec_queue {
3535
struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
3636
/** @lr_tdr: long running TDR worker */
3737
struct work_struct lr_tdr;
38-
/** @fini_async: do final fini async from this worker */
39-
struct work_struct fini_async;
38+
/** @destroy_async: do final destroy async from this worker */
39+
struct work_struct destroy_async;
4040
/** @resume_time: time of last resume */
4141
u64 resume_time;
4242
/** @state: GuC specific state for this xe_exec_queue */

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,48 +1419,57 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
14191419
return DRM_GPU_SCHED_STAT_NO_HANG;
14201420
}
14211421

1422-
static void __guc_exec_queue_fini_async(struct work_struct *w)
1422+
static void guc_exec_queue_fini(struct xe_exec_queue *q)
1423+
{
1424+
struct xe_guc_exec_queue *ge = q->guc;
1425+
struct xe_guc *guc = exec_queue_to_guc(q);
1426+
1427+
release_guc_id(guc, q);
1428+
xe_sched_entity_fini(&ge->entity);
1429+
xe_sched_fini(&ge->sched);
1430+
1431+
/*
1432+
* RCU free due sched being exported via DRM scheduler fences
1433+
* (timeline name).
1434+
*/
1435+
kfree_rcu(ge, rcu);
1436+
}
1437+
1438+
static void __guc_exec_queue_destroy_async(struct work_struct *w)
14231439
{
14241440
struct xe_guc_exec_queue *ge =
1425-
container_of(w, struct xe_guc_exec_queue, fini_async);
1441+
container_of(w, struct xe_guc_exec_queue, destroy_async);
14261442
struct xe_exec_queue *q = ge->q;
14271443
struct xe_guc *guc = exec_queue_to_guc(q);
14281444

14291445
xe_pm_runtime_get(guc_to_xe(guc));
14301446
trace_xe_exec_queue_destroy(q);
14311447

1432-
release_guc_id(guc, q);
14331448
if (xe_exec_queue_is_lr(q))
14341449
cancel_work_sync(&ge->lr_tdr);
14351450
/* Confirm no work left behind accessing device structures */
14361451
cancel_delayed_work_sync(&ge->sched.base.work_tdr);
1437-
xe_sched_entity_fini(&ge->entity);
1438-
xe_sched_fini(&ge->sched);
14391452

1440-
/*
1441-
* RCU free due sched being exported via DRM scheduler fences
1442-
* (timeline name).
1443-
*/
1444-
kfree_rcu(ge, rcu);
14451453
xe_exec_queue_fini(q);
1454+
14461455
xe_pm_runtime_put(guc_to_xe(guc));
14471456
}
14481457

1449-
static void guc_exec_queue_fini_async(struct xe_exec_queue *q)
1458+
static void guc_exec_queue_destroy_async(struct xe_exec_queue *q)
14501459
{
14511460
struct xe_guc *guc = exec_queue_to_guc(q);
14521461
struct xe_device *xe = guc_to_xe(guc);
14531462

1454-
INIT_WORK(&q->guc->fini_async, __guc_exec_queue_fini_async);
1463+
INIT_WORK(&q->guc->destroy_async, __guc_exec_queue_destroy_async);
14551464

14561465
/* We must block on kernel engines so slabs are empty on driver unload */
14571466
if (q->flags & EXEC_QUEUE_FLAG_PERMANENT || exec_queue_wedged(q))
1458-
__guc_exec_queue_fini_async(&q->guc->fini_async);
1467+
__guc_exec_queue_destroy_async(&q->guc->destroy_async);
14591468
else
1460-
queue_work(xe->destroy_wq, &q->guc->fini_async);
1469+
queue_work(xe->destroy_wq, &q->guc->destroy_async);
14611470
}
14621471

1463-
static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
1472+
static void __guc_exec_queue_destroy(struct xe_guc *guc, struct xe_exec_queue *q)
14641473
{
14651474
/*
14661475
* Might be done from within the GPU scheduler, need to do async as we
@@ -1469,7 +1478,7 @@ static void __guc_exec_queue_fini(struct xe_guc *guc, struct xe_exec_queue *q)
14691478
* this we and don't really care when everything is fini'd, just that it
14701479
* is.
14711480
*/
1472-
guc_exec_queue_fini_async(q);
1481+
guc_exec_queue_destroy_async(q);
14731482
}
14741483

14751484
static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
@@ -1483,7 +1492,7 @@ static void __guc_exec_queue_process_msg_cleanup(struct xe_sched_msg *msg)
14831492
if (exec_queue_registered(q))
14841493
disable_scheduling_deregister(guc, q);
14851494
else
1486-
__guc_exec_queue_fini(guc, q);
1495+
__guc_exec_queue_destroy(guc, q);
14871496
}
14881497

14891498
static bool guc_exec_queue_allowed_to_change_state(struct xe_exec_queue *q)
@@ -1716,14 +1725,14 @@ static bool guc_exec_queue_try_add_msg(struct xe_exec_queue *q,
17161725
#define STATIC_MSG_CLEANUP 0
17171726
#define STATIC_MSG_SUSPEND 1
17181727
#define STATIC_MSG_RESUME 2
1719-
static void guc_exec_queue_fini(struct xe_exec_queue *q)
1728+
static void guc_exec_queue_destroy(struct xe_exec_queue *q)
17201729
{
17211730
struct xe_sched_msg *msg = q->guc->static_msgs + STATIC_MSG_CLEANUP;
17221731

17231732
if (!(q->flags & EXEC_QUEUE_FLAG_PERMANENT) && !exec_queue_wedged(q))
17241733
guc_exec_queue_add_msg(q, msg, CLEANUP);
17251734
else
1726-
__guc_exec_queue_fini(exec_queue_to_guc(q), q);
1735+
__guc_exec_queue_destroy(exec_queue_to_guc(q), q);
17271736
}
17281737

17291738
static int guc_exec_queue_set_priority(struct xe_exec_queue *q,
@@ -1853,6 +1862,7 @@ static const struct xe_exec_queue_ops guc_exec_queue_ops = {
18531862
.init = guc_exec_queue_init,
18541863
.kill = guc_exec_queue_kill,
18551864
.fini = guc_exec_queue_fini,
1865+
.destroy = guc_exec_queue_destroy,
18561866
.set_priority = guc_exec_queue_set_priority,
18571867
.set_timeslice = guc_exec_queue_set_timeslice,
18581868
.set_preempt_timeout = guc_exec_queue_set_preempt_timeout,
@@ -1874,7 +1884,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
18741884
if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
18751885
xe_exec_queue_put(q);
18761886
else if (exec_queue_destroyed(q))
1877-
__guc_exec_queue_fini(guc, q);
1887+
__guc_exec_queue_destroy(guc, q);
18781888
}
18791889
if (q->guc->suspend_pending) {
18801890
set_exec_queue_suspended(q);
@@ -2203,7 +2213,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
22032213
if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
22042214
xe_exec_queue_put(q);
22052215
else
2206-
__guc_exec_queue_fini(guc, q);
2216+
__guc_exec_queue_destroy(guc, q);
22072217
}
22082218

22092219
int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)

0 commit comments

Comments
 (0)