Skip to content

Commit 55bf196

Browse files
ChristianKoenigAMD authored and alexdeucher committed
drm/amdgpu: reset VM when an error is detected
When a problem with the page table updates is detected, reset the state machine of the VM and re-create all page tables from scratch.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent e84e697 commit 55bf196

1 file changed

Lines changed: 65 additions & 16 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
266266
spin_unlock(&vm_bo->vm->status_lock);
267267
}
268268

269+
/**
270+
* amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
271+
* @vm: the VM which state machine to reset
272+
*
273+
* Move all vm_bo object in the VM into a state where they will be updated
274+
* again during validation.
275+
*/
276+
static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
277+
{
278+
struct amdgpu_vm_bo_base *vm_bo, *tmp;
279+
280+
spin_lock(&vm->status_lock);
281+
list_splice_init(&vm->done, &vm->invalidated);
282+
list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
283+
vm_bo->moved = true;
284+
list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
285+
struct amdgpu_bo *bo = vm_bo->bo;
286+
287+
if (!bo || bo->tbo.type != ttm_bo_type_kernel)
288+
list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
289+
else if (bo->parent)
290+
list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
291+
}
292+
spin_unlock(&vm->status_lock);
293+
}
294+
269295
/**
270296
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
271297
*
@@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
351377
spin_unlock(&adev->mman.bdev.lru_lock);
352378
}
353379

380+
/* Create scheduler entities for page table updates */
381+
static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
382+
struct amdgpu_vm *vm)
383+
{
384+
int r;
385+
386+
r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
387+
adev->vm_manager.vm_pte_scheds,
388+
adev->vm_manager.vm_pte_num_scheds, NULL);
389+
if (r)
390+
goto error;
391+
392+
return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
393+
adev->vm_manager.vm_pte_scheds,
394+
adev->vm_manager.vm_pte_num_scheds, NULL);
395+
396+
error:
397+
drm_sched_entity_destroy(&vm->immediate);
398+
return r;
399+
}
400+
401+
/* Destroy the entities for page table updates again */
402+
static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
403+
{
404+
drm_sched_entity_destroy(&vm->immediate);
405+
drm_sched_entity_destroy(&vm->delayed);
406+
}
407+
354408
/**
355409
* amdgpu_vm_validate_pt_bos - validate the page table BOs
356410
*
@@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
373427
struct amdgpu_bo *bo;
374428
int r;
375429

430+
if (drm_sched_entity_error(&vm->delayed)) {
431+
amdgpu_vm_bo_reset_state_machine(vm);
432+
amdgpu_vm_fini_entities(vm);
433+
r = amdgpu_vm_init_entities(adev, vm);
434+
if (r)
435+
return r;
436+
}
437+
376438
spin_lock(&vm->status_lock);
377439
while (!list_empty(&vm->evicted)) {
378440
bo_base = list_first_entry(&vm->evicted,
@@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
20482110
INIT_LIST_HEAD(&vm->pt_freed);
20492111
INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
20502112

2051-
/* create scheduler entities for page table updates */
2052-
r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
2053-
adev->vm_manager.vm_pte_scheds,
2054-
adev->vm_manager.vm_pte_num_scheds, NULL);
2113+
r = amdgpu_vm_init_entities(adev, vm);
20552114
if (r)
20562115
return r;
20572116

2058-
r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
2059-
adev->vm_manager.vm_pte_scheds,
2060-
adev->vm_manager.vm_pte_num_scheds, NULL);
2061-
if (r)
2062-
goto error_free_immediate;
2063-
20642117
vm->pte_support_ats = false;
20652118
vm->is_compute_context = false;
20662119

@@ -2121,10 +2174,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
21212174
error_free_delayed:
21222175
dma_fence_put(vm->last_tlb_flush);
21232176
dma_fence_put(vm->last_unlocked);
2124-
drm_sched_entity_destroy(&vm->delayed);
2125-
2126-
error_free_immediate:
2127-
drm_sched_entity_destroy(&vm->immediate);
2177+
amdgpu_vm_fini_entities(vm);
21282178

21292179
return r;
21302180
}
@@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
22772327
amdgpu_bo_unref(&root);
22782328
WARN_ON(vm->root.bo);
22792329

2280-
drm_sched_entity_destroy(&vm->immediate);
2281-
drm_sched_entity_destroy(&vm->delayed);
2330+
amdgpu_vm_fini_entities(vm);
22822331

22832332
if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
22842333
dev_err(adev->dev, "still active bo inside vm\n");

0 commit comments

Comments
 (0)