Skip to content

Commit cb99e12

Browse files
committed
drm/xe: Decouple bind queue last fence from TLB invalidations
Separate the bind queue's last fence to apply exclusively to the bind job, avoiding unnecessary serialization on prior TLB invalidations. Preserve correct user fence signaling by merging bind and TLB invalidation fences later in the pipeline.

v3:
 - Fix lockdep assert for migrate queues (CI)
 - Use individual dma fence contexts for array out fences (Testing)
 - Don't set last fence with arrays (Testing)
 - Move TLB invalid last fence under migrate lock (Testing)
 - Don't set queue last for migrate queues (Testing)

Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6047
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Link: https://patch.msgid.link/20251031234050.3043507-4-matthew.brost@intel.com
1 parent b2d7ec4 commit cb99e12

6 files changed

Lines changed: 143 additions & 110 deletions

File tree

drivers/gpu/drm/xe/xe_pt.c

Lines changed: 27 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
* Copyright © 2022 Intel Corporation
44
*/
55

6-
#include <linux/dma-fence-array.h>
7-
86
#include "xe_pt.h"
97

108
#include "regs/xe_gtt_defs.h"
@@ -2359,10 +2357,9 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
23592357
struct xe_vm *vm = vops->vm;
23602358
struct xe_vm_pgtable_update_ops *pt_update_ops =
23612359
&vops->pt_update_ops[tile->id];
2362-
struct dma_fence *fence, *ifence, *mfence;
2360+
struct xe_exec_queue *q = pt_update_ops->q;
2361+
struct dma_fence *fence, *ifence = NULL, *mfence = NULL;
23632362
struct xe_tlb_inval_job *ijob = NULL, *mjob = NULL;
2364-
struct dma_fence **fences = NULL;
2365-
struct dma_fence_array *cf = NULL;
23662363
struct xe_range_fence *rfence;
23672364
struct xe_vma_op *op;
23682365
int err = 0, i;
@@ -2390,15 +2387,14 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
23902387
#endif
23912388

23922389
if (pt_update_ops->needs_invalidation) {
2393-
struct xe_exec_queue *q = pt_update_ops->q;
23942390
struct xe_dep_scheduler *dep_scheduler =
23952391
to_dep_scheduler(q, tile->primary_gt);
23962392

23972393
ijob = xe_tlb_inval_job_create(q, &tile->primary_gt->tlb_inval,
2398-
dep_scheduler,
2394+
dep_scheduler, vm,
23992395
pt_update_ops->start,
24002396
pt_update_ops->last,
2401-
vm->usm.asid);
2397+
XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
24022398
if (IS_ERR(ijob)) {
24032399
err = PTR_ERR(ijob);
24042400
goto kill_vm_tile1;
@@ -2410,26 +2406,15 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
24102406

24112407
mjob = xe_tlb_inval_job_create(q,
24122408
&tile->media_gt->tlb_inval,
2413-
dep_scheduler,
2409+
dep_scheduler, vm,
24142410
pt_update_ops->start,
24152411
pt_update_ops->last,
2416-
vm->usm.asid);
2412+
XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT);
24172413
if (IS_ERR(mjob)) {
24182414
err = PTR_ERR(mjob);
24192415
goto free_ijob;
24202416
}
24212417
update.mjob = mjob;
2422-
2423-
fences = kmalloc_array(2, sizeof(*fences), GFP_KERNEL);
2424-
if (!fences) {
2425-
err = -ENOMEM;
2426-
goto free_ijob;
2427-
}
2428-
cf = dma_fence_array_alloc(2);
2429-
if (!cf) {
2430-
err = -ENOMEM;
2431-
goto free_ijob;
2432-
}
24332418
}
24342419
}
24352420

@@ -2460,38 +2445,27 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
24602445
pt_update_ops->last, fence))
24612446
dma_fence_wait(fence, false);
24622447

2463-
/* tlb invalidation must be done before signaling unbind/rebind */
2464-
if (ijob) {
2465-
struct dma_fence *__fence;
2466-
2448+
if (ijob)
24672449
ifence = xe_tlb_inval_job_push(ijob, tile->migrate, fence);
2468-
__fence = ifence;
2450+
if (mjob)
2451+
mfence = xe_tlb_inval_job_push(mjob, tile->migrate, fence);
24692452

2470-
if (mjob) {
2471-
fences[0] = ifence;
2472-
mfence = xe_tlb_inval_job_push(mjob, tile->migrate,
2473-
fence);
2474-
fences[1] = mfence;
2475-
2476-
dma_fence_array_init(cf, 2, fences,
2477-
vm->composite_fence_ctx,
2478-
vm->composite_fence_seqno++,
2479-
false);
2480-
__fence = &cf->base;
2481-
}
2482-
2483-
dma_fence_put(fence);
2484-
fence = __fence;
2485-
}
2486-
2487-
if (!mjob) {
2453+
if (!mjob && !ijob) {
24882454
dma_resv_add_fence(xe_vm_resv(vm), fence,
24892455
pt_update_ops->wait_vm_bookkeep ?
24902456
DMA_RESV_USAGE_KERNEL :
24912457
DMA_RESV_USAGE_BOOKKEEP);
24922458

24932459
list_for_each_entry(op, &vops->list, link)
24942460
op_commit(vops->vm, tile, pt_update_ops, op, fence, NULL);
2461+
} else if (ijob && !mjob) {
2462+
dma_resv_add_fence(xe_vm_resv(vm), ifence,
2463+
pt_update_ops->wait_vm_bookkeep ?
2464+
DMA_RESV_USAGE_KERNEL :
2465+
DMA_RESV_USAGE_BOOKKEEP);
2466+
2467+
list_for_each_entry(op, &vops->list, link)
2468+
op_commit(vops->vm, tile, pt_update_ops, op, ifence, NULL);
24952469
} else {
24962470
dma_resv_add_fence(xe_vm_resv(vm), ifence,
24972471
pt_update_ops->wait_vm_bookkeep ?
@@ -2511,16 +2485,23 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
25112485
if (pt_update_ops->needs_svm_lock)
25122486
xe_svm_notifier_unlock(vm);
25132487

2488+
/*
2489+
* The last fence is only used for zero bind queue idling; migrate
2490+
* queues are not exposed to user space.
2491+
*/
2492+
if (!(q->flags & EXEC_QUEUE_FLAG_MIGRATE))
2493+
xe_exec_queue_last_fence_set(q, vm, fence);
2494+
25142495
xe_tlb_inval_job_put(mjob);
25152496
xe_tlb_inval_job_put(ijob);
2497+
dma_fence_put(ifence);
2498+
dma_fence_put(mfence);
25162499

25172500
return fence;
25182501

25192502
free_rfence:
25202503
kfree(rfence);
25212504
free_ijob:
2522-
kfree(cf);
2523-
kfree(fences);
25242505
xe_tlb_inval_job_put(mjob);
25252506
xe_tlb_inval_job_put(ijob);
25262507
kill_vm_tile1:

drivers/gpu/drm/xe/xe_sync.c

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include <drm/drm_syncobj.h>
1515
#include <uapi/drm/xe_drm.h>
1616

17-
#include "xe_device_types.h"
17+
#include "xe_device.h"
1818
#include "xe_exec_queue.h"
1919
#include "xe_macros.h"
2020
#include "xe_sched_job_types.h"
@@ -297,26 +297,67 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
297297
struct dma_fence **fences = NULL;
298298
struct dma_fence_array *cf = NULL;
299299
struct dma_fence *fence;
300-
int i, num_in_fence = 0, current_fence = 0;
300+
int i, num_fence = 0, current_fence = 0;
301301

302302
lockdep_assert_held(&vm->lock);
303303

304304
/* Count in-fences */
305305
for (i = 0; i < num_sync; ++i) {
306306
if (sync[i].fence) {
307-
++num_in_fence;
307+
++num_fence;
308308
fence = sync[i].fence;
309309
}
310310
}
311311

312312
/* Easy case... */
313-
if (!num_in_fence) {
313+
if (!num_fence) {
314+
if (q->flags & EXEC_QUEUE_FLAG_VM) {
315+
struct xe_exec_queue *__q;
316+
struct xe_tile *tile;
317+
u8 id;
318+
319+
for_each_tile(tile, vm->xe, id)
320+
num_fence += (1 + XE_MAX_GT_PER_TILE);
321+
322+
fences = kmalloc_array(num_fence, sizeof(*fences),
323+
GFP_KERNEL);
324+
if (!fences)
325+
return ERR_PTR(-ENOMEM);
326+
327+
fences[current_fence++] =
328+
xe_exec_queue_last_fence_get(q, vm);
329+
for_each_tlb_inval(i)
330+
fences[current_fence++] =
331+
xe_exec_queue_tlb_inval_last_fence_get(q, vm, i);
332+
list_for_each_entry(__q, &q->multi_gt_list,
333+
multi_gt_link) {
334+
fences[current_fence++] =
335+
xe_exec_queue_last_fence_get(__q, vm);
336+
for_each_tlb_inval(i)
337+
fences[current_fence++] =
338+
xe_exec_queue_tlb_inval_last_fence_get(__q, vm, i);
339+
}
340+
341+
xe_assert(vm->xe, current_fence == num_fence);
342+
cf = dma_fence_array_create(num_fence, fences,
343+
dma_fence_context_alloc(1),
344+
1, false);
345+
if (!cf)
346+
goto err_out;
347+
348+
return &cf->base;
349+
}
350+
314351
fence = xe_exec_queue_last_fence_get(q, vm);
315352
return fence;
316353
}
317354

318-
/* Create composite fence */
319-
fences = kmalloc_array(num_in_fence + 1, sizeof(*fences), GFP_KERNEL);
355+
/*
356+
* Create composite fence - FIXME - the below code doesn't work. This is
357+
* unused in Mesa so we are ok for the moment. Perhaps we just disable
358+
* this entire code path if number of in fences != 0.
359+
*/
360+
fences = kmalloc_array(num_fence + 1, sizeof(*fences), GFP_KERNEL);
320361
if (!fences)
321362
return ERR_PTR(-ENOMEM);
322363
for (i = 0; i < num_sync; ++i) {
@@ -326,14 +367,10 @@ xe_sync_in_fence_get(struct xe_sync_entry *sync, int num_sync,
326367
}
327368
}
328369
fences[current_fence++] = xe_exec_queue_last_fence_get(q, vm);
329-
cf = dma_fence_array_create(num_in_fence, fences,
330-
vm->composite_fence_ctx,
331-
vm->composite_fence_seqno++,
332-
false);
333-
if (!cf) {
334-
--vm->composite_fence_seqno;
370+
cf = dma_fence_array_create(num_fence, fences,
371+
dma_fence_context_alloc(1), 1, false);
372+
if (!cf)
335373
goto err_out;
336-
}
337374

338375
return &cf->base;
339376

drivers/gpu/drm/xe/xe_tlb_inval_job.c

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "xe_tlb_inval_job.h"
1313
#include "xe_migrate.h"
1414
#include "xe_pm.h"
15+
#include "xe_vm.h"
1516

1617
/** struct xe_tlb_inval_job - TLB invalidation job */
1718
struct xe_tlb_inval_job {
@@ -21,6 +22,8 @@ struct xe_tlb_inval_job {
2122
struct xe_tlb_inval *tlb_inval;
2223
/** @q: exec queue issuing the invalidate */
2324
struct xe_exec_queue *q;
25+
/** @vm: VM which TLB invalidation is being issued for */
26+
struct xe_vm *vm;
2427
/** @refcount: ref count of this job */
2528
struct kref refcount;
2629
/**
@@ -32,8 +35,8 @@ struct xe_tlb_inval_job {
3235
u64 start;
3336
/** @end: End address to invalidate */
3437
u64 end;
35-
/** @asid: Address space ID to invalidate */
36-
u32 asid;
38+
/** @type: GT type */
39+
int type;
3740
/** @fence_armed: Fence has been armed */
3841
bool fence_armed;
3942
};
@@ -46,7 +49,7 @@ static struct dma_fence *xe_tlb_inval_job_run(struct xe_dep_job *dep_job)
4649
container_of(job->fence, typeof(*ifence), base);
4750

4851
xe_tlb_inval_range(job->tlb_inval, ifence, job->start,
49-
job->end, job->asid);
52+
job->end, job->vm->usm.asid);
5053

5154
return job->fence;
5255
}
@@ -70,9 +73,10 @@ static const struct xe_dep_job_ops dep_job_ops = {
7073
* @q: exec queue issuing the invalidate
7174
* @tlb_inval: TLB invalidation client
7275
* @dep_scheduler: Dependency scheduler for job
76+
* @vm: VM which TLB invalidation is being issued for
7377
* @start: Start address to invalidate
7478
* @end: End address to invalidate
75-
* @asid: Address space ID to invalidate
79+
* @type: GT type
7680
*
7781
* Create a TLB invalidation job and initialize internal fields. The caller is
7882
* responsible for releasing the creation reference.
@@ -81,28 +85,33 @@ static const struct xe_dep_job_ops dep_job_ops = {
8185
*/
8286
struct xe_tlb_inval_job *
8387
xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
84-
struct xe_dep_scheduler *dep_scheduler, u64 start,
85-
u64 end, u32 asid)
88+
struct xe_dep_scheduler *dep_scheduler,
89+
struct xe_vm *vm, u64 start, u64 end, int type)
8690
{
8791
struct xe_tlb_inval_job *job;
8892
struct drm_sched_entity *entity =
8993
xe_dep_scheduler_entity(dep_scheduler);
9094
struct xe_tlb_inval_fence *ifence;
9195
int err;
9296

97+
xe_assert(vm->xe, type == XE_EXEC_QUEUE_TLB_INVAL_MEDIA_GT ||
98+
type == XE_EXEC_QUEUE_TLB_INVAL_PRIMARY_GT);
99+
93100
job = kmalloc(sizeof(*job), GFP_KERNEL);
94101
if (!job)
95102
return ERR_PTR(-ENOMEM);
96103

97104
job->q = q;
105+
job->vm = vm;
98106
job->tlb_inval = tlb_inval;
99107
job->start = start;
100108
job->end = end;
101-
job->asid = asid;
102109
job->fence_armed = false;
103110
job->dep.ops = &dep_job_ops;
111+
job->type = type;
104112
kref_init(&job->refcount);
105113
xe_exec_queue_get(q); /* Pairs with put in xe_tlb_inval_job_destroy */
114+
xe_vm_get(vm); /* Pairs with put in xe_tlb_inval_job_destroy */
106115

107116
ifence = kmalloc(sizeof(*ifence), GFP_KERNEL);
108117
if (!ifence) {
@@ -124,6 +133,7 @@ xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
124133
err_fence:
125134
kfree(ifence);
126135
err_job:
136+
xe_vm_put(vm);
127137
xe_exec_queue_put(q);
128138
kfree(job);
129139

@@ -138,6 +148,7 @@ static void xe_tlb_inval_job_destroy(struct kref *ref)
138148
container_of(job->fence, typeof(*ifence), base);
139149
struct xe_exec_queue *q = job->q;
140150
struct xe_device *xe = gt_to_xe(q->gt);
151+
struct xe_vm *vm = job->vm;
141152

142153
if (!job->fence_armed)
143154
kfree(ifence);
@@ -147,6 +158,7 @@ static void xe_tlb_inval_job_destroy(struct kref *ref)
147158

148159
drm_sched_job_cleanup(&job->dep.drm);
149160
kfree(job);
161+
xe_vm_put(vm); /* Pairs with get from xe_tlb_inval_job_create */
150162
xe_exec_queue_put(q); /* Pairs with get from xe_tlb_inval_job_create */
151163
xe_pm_runtime_put(xe); /* Pairs with get from xe_tlb_inval_job_create */
152164
}
@@ -231,6 +243,11 @@ struct dma_fence *xe_tlb_inval_job_push(struct xe_tlb_inval_job *job,
231243
dma_fence_get(&job->dep.drm.s_fence->finished);
232244
drm_sched_entity_push_job(&job->dep.drm);
233245

246+
/* Let the upper layers fish this out */
247+
xe_exec_queue_tlb_inval_last_fence_set(job->q, job->vm,
248+
&job->dep.drm.s_fence->finished,
249+
job->type);
250+
234251
xe_migrate_job_unlock(m, job->q);
235252

236253
/*

drivers/gpu/drm/xe/xe_tlb_inval_job.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
struct dma_fence;
1212
struct xe_dep_scheduler;
1313
struct xe_exec_queue;
14+
struct xe_migrate;
1415
struct xe_tlb_inval;
1516
struct xe_tlb_inval_job;
16-
struct xe_migrate;
17+
struct xe_vm;
1718

1819
struct xe_tlb_inval_job *
1920
xe_tlb_inval_job_create(struct xe_exec_queue *q, struct xe_tlb_inval *tlb_inval,
2021
struct xe_dep_scheduler *dep_scheduler,
21-
u64 start, u64 end, u32 asid);
22+
struct xe_vm *vm, u64 start, u64 end, int type);
2223

2324
int xe_tlb_inval_job_alloc_dep(struct xe_tlb_inval_job *job);
2425

0 commit comments

Comments (0)