Skip to content

Commit 39203f5

Browse files
ChristianKoenigAMD authored and alexdeucher committed
drm/amdgpu: fix userq VM validation v4
That was actually complete nonsense and not validating the BOs at all. The code just cleared all VM areas were it couldn't grab the lock for a BO. Try to fix this. Only compile tested at the moment. v2: fix fence slot reservation as well as pointed out by Sunil. also validate PDs, PTs, per VM BOs and update PDEs v3: grab the status_lock while working with the done list. v4: rename functions, add some comments, fix waiting for updates to complete. v4: rename amdgpu_vm_lock_done_list(), add some more comments Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Sunil Khatri <sunil.khatri@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent d7ddcf9 commit 39203f5

3 files changed

Lines changed: 110 additions & 75 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c

Lines changed: 73 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -706,108 +706,106 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
706706
return ret;
707707
}
708708

709+
static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
710+
{
711+
struct ttm_operation_ctx ctx = { false, false };
712+
713+
amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
714+
return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
715+
}
716+
717+
/* Handle all BOs on the invalidated list, validate them and update the PTs */
709718
static int
710-
amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
719+
amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
720+
struct amdgpu_vm *vm)
711721
{
712722
struct ttm_operation_ctx ctx = { false, false };
723+
struct amdgpu_bo_va *bo_va;
724+
struct amdgpu_bo *bo;
713725
int ret;
714726

715-
amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
727+
spin_lock(&vm->status_lock);
728+
while (!list_empty(&vm->invalidated)) {
729+
bo_va = list_first_entry(&vm->invalidated,
730+
struct amdgpu_bo_va,
731+
base.vm_status);
732+
spin_unlock(&vm->status_lock);
716733

717-
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
718-
if (ret)
719-
DRM_ERROR("Fail to validate\n");
734+
bo = bo_va->base.bo;
735+
ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
736+
if (unlikely(ret))
737+
return ret;
720738

721-
return ret;
739+
amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
740+
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
741+
if (ret)
742+
return ret;
743+
744+
/* This moves the bo_va to the done list */
745+
ret = amdgpu_vm_bo_update(adev, bo_va, false);
746+
if (ret)
747+
return ret;
748+
749+
spin_lock(&vm->status_lock);
750+
}
751+
spin_unlock(&vm->status_lock);
752+
753+
return 0;
722754
}
723755

756+
/* Make sure the whole VM is ready to be used */
724757
static int
725-
amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
758+
amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
726759
{
727760
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
728-
struct amdgpu_vm *vm = &fpriv->vm;
729761
struct amdgpu_device *adev = uq_mgr->adev;
762+
struct amdgpu_vm *vm = &fpriv->vm;
730763
struct amdgpu_bo_va *bo_va;
731-
struct ww_acquire_ctx *ticket;
732764
struct drm_exec exec;
733-
struct amdgpu_bo *bo;
734-
struct dma_resv *resv;
735-
bool clear, unlock;
736-
int ret = 0;
765+
int ret;
737766

738767
drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
739768
drm_exec_until_all_locked(&exec) {
740-
ret = amdgpu_vm_lock_pd(vm, &exec, 2);
769+
ret = amdgpu_vm_lock_pd(vm, &exec, 1);
741770
drm_exec_retry_on_contention(&exec);
742-
if (unlikely(ret)) {
743-
drm_file_err(uq_mgr->file, "Failed to lock PD\n");
771+
if (unlikely(ret))
744772
goto unlock_all;
745-
}
746-
747-
/* Lock the done list */
748-
list_for_each_entry(bo_va, &vm->done, base.vm_status) {
749-
bo = bo_va->base.bo;
750-
if (!bo)
751-
continue;
752773

753-
ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
754-
drm_exec_retry_on_contention(&exec);
755-
if (unlikely(ret))
756-
goto unlock_all;
757-
}
758-
}
759-
760-
spin_lock(&vm->status_lock);
761-
while (!list_empty(&vm->moved)) {
762-
bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
763-
base.vm_status);
764-
spin_unlock(&vm->status_lock);
765-
766-
/* Per VM BOs never need to bo cleared in the page tables */
767-
ret = amdgpu_vm_bo_update(adev, bo_va, false);
768-
if (ret)
774+
ret = amdgpu_vm_lock_done_list(vm, &exec, 1);
775+
drm_exec_retry_on_contention(&exec);
776+
if (unlikely(ret))
769777
goto unlock_all;
770-
spin_lock(&vm->status_lock);
771-
}
772-
773-
ticket = &exec.ticket;
774-
while (!list_empty(&vm->invalidated)) {
775-
bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
776-
base.vm_status);
777-
resv = bo_va->base.bo->tbo.base.resv;
778-
spin_unlock(&vm->status_lock);
779778

780-
bo = bo_va->base.bo;
781-
ret = amdgpu_userq_validate_vm_bo(NULL, bo);
782-
if (ret) {
783-
drm_file_err(uq_mgr->file, "Failed to validate BO\n");
779+
/* This validates PDs, PTs and per VM BOs */
780+
ret = amdgpu_vm_validate(adev, vm, NULL,
781+
amdgpu_userq_validate_vm,
782+
NULL);
783+
if (unlikely(ret))
784784
goto unlock_all;
785-
}
786785

787-
/* Try to reserve the BO to avoid clearing its ptes */
788-
if (!adev->debug_vm && dma_resv_trylock(resv)) {
789-
clear = false;
790-
unlock = true;
791-
/* The caller is already holding the reservation lock */
792-
} else if (dma_resv_locking_ctx(resv) == ticket) {
793-
clear = false;
794-
unlock = false;
795-
/* Somebody else is using the BO right now */
796-
} else {
797-
clear = true;
798-
unlock = false;
799-
}
786+
/* This locks and validates the remaining evicted BOs */
787+
ret = amdgpu_userq_bo_validate(adev, &exec, vm);
788+
drm_exec_retry_on_contention(&exec);
789+
if (unlikely(ret))
790+
goto unlock_all;
791+
}
800792

801-
ret = amdgpu_vm_bo_update(adev, bo_va, clear);
793+
ret = amdgpu_vm_handle_moved(adev, vm, NULL);
794+
if (ret)
795+
goto unlock_all;
802796

803-
if (unlock)
804-
dma_resv_unlock(resv);
805-
if (ret)
806-
goto unlock_all;
797+
ret = amdgpu_vm_update_pdes(adev, vm, false);
798+
if (ret)
799+
goto unlock_all;
807800

808-
spin_lock(&vm->status_lock);
809-
}
810-
spin_unlock(&vm->status_lock);
801+
/*
802+
* We need to wait for all VM updates to finish before restarting the
803+
* queues. Using the done list like that is now ok since everything is
804+
* locked in place.
805+
*/
806+
list_for_each_entry(bo_va, &vm->done, base.vm_status)
807+
dma_fence_wait(bo_va->last_pt_update, false);
808+
dma_fence_wait(vm->last_update, false);
811809

812810
ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
813811
if (ret)
@@ -828,7 +826,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
828826

829827
mutex_lock(&uq_mgr->userq_mutex);
830828

831-
ret = amdgpu_userq_validate_bos(uq_mgr);
829+
ret = amdgpu_userq_vm_validate(uq_mgr);
832830
if (ret) {
833831
drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
834832
goto unlock;

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
484484
2 + num_fences);
485485
}
486486

487+
/**
488+
* amdgpu_vm_lock_done_list - lock all BOs on the done list
489+
* @exec: drm execution context
490+
* @num_fences: number of extra fences to reserve
491+
*
492+
* Lock the BOs on the done list in the DRM execution context.
493+
*/
494+
int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
495+
unsigned int num_fences)
496+
{
497+
struct list_head *prev = &vm->done;
498+
struct amdgpu_bo_va *bo_va;
499+
struct amdgpu_bo *bo;
500+
int ret;
501+
502+
/* We can only trust prev->next while holding the lock */
503+
spin_lock(&vm->status_lock);
504+
while (!list_is_head(prev->next, &vm->done)) {
505+
bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
506+
spin_unlock(&vm->status_lock);
507+
508+
bo = bo_va->base.bo;
509+
if (bo) {
510+
ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
511+
if (unlikely(ret))
512+
return ret;
513+
}
514+
spin_lock(&vm->status_lock);
515+
prev = prev->next;
516+
}
517+
spin_unlock(&vm->status_lock);
518+
519+
return 0;
520+
}
521+
487522
/**
488523
* amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
489524
*

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
491491
void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
492492
int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
493493
unsigned int num_fences);
494+
int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
495+
unsigned int num_fences);
494496
bool amdgpu_vm_ready(struct amdgpu_vm *vm);
495497
uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm);
496498
int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,

0 commit comments

Comments
 (0)