Skip to content

Commit 6952255

Browse files
committed
drm/amdgpu: re-add the bad job to the pending list for ring resets
Returning DRM_GPU_SCHED_STAT_NO_HANG causes the scheduler to add the bad job back to the pending list. We've already set the errors on the fence and killed the bad job at this point, so this is the correct behavior. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 5cc7bbd commit 6952255

2 files changed

Lines changed: 8 additions & 5 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
9292
struct drm_wedge_task_info *info = NULL;
9393
struct amdgpu_task_info *ti = NULL;
9494
struct amdgpu_device *adev = ring->adev;
95+
enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET;
9596
int idx, r;
9697

9798
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -135,13 +136,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
135136
ring->funcs->reset) {
136137
dev_err(adev->dev, "Starting %s ring reset\n",
137138
s_job->sched->name);
139+
/* Stop the scheduler to prevent anybody else from touching the ring buffer. */
140+
drm_sched_wqueue_stop(&ring->sched);
138141
r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence);
139142
if (!r) {
143+
/* Start the scheduler again */
144+
drm_sched_wqueue_start(&ring->sched);
140145
atomic_inc(&ring->adev->gpu_reset_counter);
141146
dev_err(adev->dev, "Ring %s reset succeeded\n",
142147
ring->sched.name);
143148
drm_dev_wedged_event(adev_to_drm(adev),
144149
DRM_WEDGE_RECOVERY_NONE, info);
150+
/* This is needed to add the job back to the pending list */
151+
status = DRM_GPU_SCHED_STAT_NO_HANG;
145152
goto exit;
146153
}
147154
dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
@@ -177,7 +184,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
177184
exit:
178185
amdgpu_vm_put_task_info(ti);
179186
drm_dev_exit(idx);
180-
return DRM_GPU_SCHED_STAT_RESET;
187+
return status;
181188
}
182189

183190
int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -868,8 +868,6 @@ bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
868868
void amdgpu_ring_reset_helper_begin(struct amdgpu_ring *ring,
869869
struct amdgpu_fence *guilty_fence)
870870
{
871-
/* Stop the scheduler to prevent anybody else from touching the ring buffer. */
872-
drm_sched_wqueue_stop(&ring->sched);
873871
/* back up the non-guilty commands */
874872
amdgpu_ring_backup_unprocessed_commands(ring, guilty_fence);
875873
}
@@ -895,8 +893,6 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
895893
amdgpu_ring_write(ring, ring->ring_backup[i]);
896894
amdgpu_ring_commit(ring);
897895
}
898-
/* Start the scheduler again */
899-
drm_sched_wqueue_start(&ring->sched);
900896
return 0;
901897
}
902898

0 commit comments

Comments
 (0)