@@ -91,8 +91,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
9191 struct amdgpu_job * job = to_amdgpu_job (s_job );
9292 struct amdgpu_task_info * ti ;
9393 struct amdgpu_device * adev = ring -> adev ;
94- int idx ;
95- int r ;
94+ bool set_error = false ;
95+ int idx , r ;
9696
9797 if (!drm_dev_enter (adev_to_drm (adev ), & idx )) {
9898 dev_info (adev -> dev , "%s - device unplugged skipping recovery on scheduler:%s" ,
@@ -136,10 +136,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
136136 } else if (amdgpu_gpu_recovery && ring -> funcs -> reset ) {
137137 bool is_guilty ;
138138
139- dev_err (adev -> dev , "Starting %s ring reset\n" , s_job -> sched -> name );
140- /* stop the scheduler, but don't mess with the
141- * bad job yet because if ring reset fails
142- * we'll fall back to full GPU reset.
139+ dev_err (adev -> dev , "Starting %s ring reset\n" ,
140+ s_job -> sched -> name );
141+
142+ /*
143+ * Stop the scheduler to prevent anybody else from touching the
144+ * ring buffer.
143145 */
144146 drm_sched_wqueue_stop (& ring -> sched );
145147
@@ -152,26 +154,29 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
152154 else
153155 is_guilty = true;
154156
155- if (is_guilty )
157+ if (is_guilty ) {
156158 dma_fence_set_error (& s_job -> s_fence -> finished , - ETIME );
159+ set_error = true;
160+ }
157161
158162 r = amdgpu_ring_reset (ring , job -> vmid , NULL );
159163 if (!r ) {
160- if (amdgpu_ring_sched_ready (ring ))
161- drm_sched_stop (& ring -> sched , s_job );
162164 if (is_guilty ) {
163165 atomic_inc (& ring -> adev -> gpu_reset_counter );
164166 amdgpu_fence_driver_force_completion (ring );
165167 }
166- if (amdgpu_ring_sched_ready (ring ))
167- drm_sched_start (& ring -> sched , 0 );
168- dev_err (adev -> dev , "Ring %s reset succeeded\n" , ring -> sched .name );
169- drm_dev_wedged_event (adev_to_drm (adev ), DRM_WEDGE_RECOVERY_NONE );
168+ drm_sched_wqueue_start (& ring -> sched );
169+ dev_err (adev -> dev , "Ring %s reset succeeded\n" ,
170+ ring -> sched .name );
171+ drm_dev_wedged_event (adev_to_drm (adev ),
172+ DRM_WEDGE_RECOVERY_NONE );
170173 goto exit ;
171174 }
172- dev_err (adev -> dev , "Ring %s reset failure \n" , ring -> sched .name );
175+ dev_err (adev -> dev , "Ring %s reset failed \n" , ring -> sched .name );
173176 }
174- dma_fence_set_error (& s_job -> s_fence -> finished , - ETIME );
177+
178+ if (!set_error )
179+ dma_fence_set_error (& s_job -> s_fence -> finished , - ETIME );
175180
176181 if (amdgpu_device_should_recover_gpu (ring -> adev )) {
177182 struct amdgpu_reset_context reset_context ;
0 commit comments