Skip to content

Commit ff780f4

Browse files
committed
drm/amdgpu: set an error on all fences from a bad context
When we back up ring contents to re-emit after a queue reset, we don't back up ring contents from the bad context. When we signal the fences, we should set an error on those fences as well. v2: misc cleanups. v3: add locking for fence error, fix comment (Christian). v4: fix wrap around, locking (Christian). Fixes: 77cc0da ("drm/amdgpu: track ring state associated with a fence") Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 1f22fcb commit ff780f4

3 files changed

Lines changed: 37 additions & 6 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

Lines changed: 35 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
758758
* @fence: fence of the ring to signal
759759
*
760760
*/
761-
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
761+
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
762762
{
763-
dma_fence_set_error(&fence->base, -ETIME);
764-
amdgpu_fence_write(fence->ring, fence->seq);
765-
amdgpu_fence_process(fence->ring);
763+
struct dma_fence *unprocessed;
764+
struct dma_fence __rcu **ptr;
765+
struct amdgpu_fence *fence;
766+
struct amdgpu_ring *ring = af->ring;
767+
unsigned long flags;
768+
u32 seq, last_seq;
769+
770+
last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
771+
seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
772+
773+
/* mark all fences from the guilty context with an error */
774+
spin_lock_irqsave(&ring->fence_drv.lock, flags);
775+
do {
776+
last_seq++;
777+
last_seq &= ring->fence_drv.num_fences_mask;
778+
779+
ptr = &ring->fence_drv.fences[last_seq];
780+
rcu_read_lock();
781+
unprocessed = rcu_dereference(*ptr);
782+
783+
if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
784+
fence = container_of(unprocessed, struct amdgpu_fence, base);
785+
786+
if (fence == af)
787+
dma_fence_set_error(&fence->base, -ETIME);
788+
else if (fence->context == af->context)
789+
dma_fence_set_error(&fence->base, -ECANCELED);
790+
}
791+
rcu_read_unlock();
792+
} while (last_seq != seq);
793+
spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
794+
/* signal the guilty fence */
795+
amdgpu_fence_write(ring, af->seq);
796+
amdgpu_fence_process(ring);
766797
}
767798

768799
void amdgpu_fence_save_wptr(struct dma_fence *fence)

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
811811
if (r)
812812
return r;
813813

814-
/* signal the fence of the bad job */
814+
/* signal the guilty fence and set an error on all fences from the context */
815815
if (guilty_fence)
816816
amdgpu_fence_driver_guilty_force_completion(guilty_fence);
817817
/* Re-emit the non-guilty commands */

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
155155
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
156156
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
157157
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
158-
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
158+
void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
159159
void amdgpu_fence_save_wptr(struct dma_fence *fence);
160160

161161
int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);

0 commit comments

Comments
 (0)