Skip to content

Commit f88e295

Browse files
ChristianKoenigAMD authored and alexdeucher committed
drm/amdgpu: add VM generation token
Instead of using the VRAM lost counter, add a 64-bit token which indicates whether a context or job is still valid to use. Should the VRAM be lost or the page tables need re-creation, the token will change, indicating that userspace needs to act, re-create the contexts, and re-submit the work.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 55bf196 commit f88e295

7 files changed

Lines changed: 37 additions & 7 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -309,7 +309,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
309309
}
310310
p->gang_leader = p->jobs[p->gang_leader_idx];
311311

312-
if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
312+
if (p->ctx->generation != p->gang_leader->generation) {
313313
ret = -ECANCELED;
314314
goto free_all_kdata;
315315
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -333,7 +333,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
333333

334334
ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
335335
ctx->reset_counter_query = ctx->reset_counter;
336-
ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter);
336+
ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm);
337337
ctx->init_priority = priority;
338338
ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
339339

@@ -586,7 +586,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
586586
if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter))
587587
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET;
588588

589-
if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
589+
if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
590590
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
591591

592592
if (atomic_read(&ctx->guilty))

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ struct amdgpu_ctx {
4747
struct amdgpu_ctx_mgr *mgr;
4848
unsigned reset_counter;
4949
unsigned reset_counter_query;
50-
uint32_t vram_lost_counter;
50+
uint64_t generation;
5151
spinlock_t ring_lock;
5252
struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM];
5353
bool preamble_presented;

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
109109
(*job)->vm = vm;
110110

111111
amdgpu_sync_create(&(*job)->explicit_sync);
112-
(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
112+
(*job)->generation = amdgpu_vm_generation(adev, vm);
113113
(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
114114

115115
if (!entity)
@@ -295,7 +295,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
295295
trace_amdgpu_sched_run_job(job);
296296

297297
/* Skip job if VRAM is lost and never resubmit gangs */
298-
if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
298+
if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
299299
(job->job_run_counter && job->gang_submit))
300300
dma_fence_set_error(finished, -ECANCELED);
301301

drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ struct amdgpu_job {
6161
uint32_t gds_base, gds_size;
6262
uint32_t gws_base, gws_size;
6363
uint32_t oa_base, oa_size;
64-
uint32_t vram_lost_counter;
64+
uint64_t generation;
6565

6666
/* user fence handling */
6767
uint64_t uf_addr;

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -405,6 +405,30 @@ static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
405405
drm_sched_entity_destroy(&vm->delayed);
406406
}
407407

408+
/**
409+
* amdgpu_vm_generation - return the page table re-generation counter
410+
* @adev: the amdgpu_device
411+
* @vm: optional VM to check, might be NULL
412+
*
413+
* Returns a page table re-generation token to allow checking if submissions
414+
* are still valid to use this VM. The VM parameter might be NULL in which case
415+
* just the VRAM lost counter will be used.
416+
*/
417+
uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm)
418+
{
419+
uint64_t result = (u64)atomic_read(&adev->vram_lost_counter) << 32;
420+
421+
if (!vm)
422+
return result;
423+
424+
result += vm->generation;
425+
/* Add one if the page tables will be re-generated on next CS */
426+
if (drm_sched_entity_error(&vm->delayed))
427+
++result;
428+
429+
return result;
430+
}
431+
408432
/**
409433
* amdgpu_vm_validate_pt_bos - validate the page table BOs
410434
*
@@ -428,6 +452,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
428452
int r;
429453

430454
if (drm_sched_entity_error(&vm->delayed)) {
455+
++vm->generation;
431456
amdgpu_vm_bo_reset_state_machine(vm);
432457
amdgpu_vm_fini_entities(vm);
433458
r = amdgpu_vm_init_entities(adev, vm);
@@ -2134,6 +2159,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
21342159
vm->last_update = dma_fence_get_stub();
21352160
vm->last_unlocked = dma_fence_get_stub();
21362161
vm->last_tlb_flush = dma_fence_get_stub();
2162+
vm->generation = 0;
21372163

21382164
mutex_init(&vm->eviction_lock);
21392165
vm->evicting = false;

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,9 @@ struct amdgpu_vm {
295295
atomic64_t tlb_seq;
296296
struct dma_fence *last_tlb_flush;
297297

298+
/* How many times we had to re-generate the page tables */
299+
uint64_t generation;
300+
298301
/* Last unlocked submission to the scheduler entities */
299302
struct dma_fence *last_unlocked;
300303

@@ -397,6 +400,7 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
397400
struct list_head *validated,
398401
struct amdgpu_bo_list_entry *entry);
399402
bool amdgpu_vm_ready(struct amdgpu_vm *vm);
403+
uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
400404
int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
401405
int (*callback)(void *p, struct amdgpu_bo *bo),
402406
void *param);

0 commit comments

Comments
 (0)