Skip to content

Commit 43ca5eb

Browse files
committed
drm/amdgpu: move guilty handling into ring resets
Move guilty logic into the ring reset callbacks. This allows each ring reset callback to better handle fence errors and force completions in line with the reset behavior for each IP. It also allows us to remove the ring guilty callback since that logic now lives in the reset callback.

Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 2dee58c commit 43ca5eb

3 files changed

Lines changed: 3 additions & 51 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
9191
struct amdgpu_job *job = to_amdgpu_job(s_job);
9292
struct amdgpu_task_info *ti;
9393
struct amdgpu_device *adev = ring->adev;
94-
bool set_error = false;
9594
int idx, r;
9695

9796
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -134,8 +133,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
134133
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
135134
dev_err(adev->dev, "Ring reset disabled by debug mask\n");
136135
} else if (amdgpu_gpu_recovery && ring->funcs->reset) {
137-
bool is_guilty;
138-
139136
dev_err(adev->dev, "Starting %s ring reset\n",
140137
s_job->sched->name);
141138

@@ -145,24 +142,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
145142
*/
146143
drm_sched_wqueue_stop(&ring->sched);
147144

148-
/* for engine resets, we need to reset the engine,
149-
* but individual queues may be unaffected.
150-
* check here to make sure the accounting is correct.
151-
*/
152-
if (ring->funcs->is_guilty)
153-
is_guilty = ring->funcs->is_guilty(ring);
154-
else
155-
is_guilty = true;
156-
157-
if (is_guilty) {
158-
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
159-
set_error = true;
160-
}
161-
162145
r = amdgpu_ring_reset(ring, job->vmid, NULL);
163146
if (!r) {
164-
if (is_guilty)
165-
atomic_inc(&ring->adev->gpu_reset_counter);
147+
atomic_inc(&ring->adev->gpu_reset_counter);
166148
drm_sched_wqueue_start(&ring->sched);
167149
dev_err(adev->dev, "Ring %s reset succeeded\n",
168150
ring->sched.name);
@@ -173,8 +155,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
173155
dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
174156
}
175157

176-
if (!set_error)
177-
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
158+
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
178159

179160
if (amdgpu_device_should_recover_gpu(ring->adev)) {
180161
struct amdgpu_reset_context reset_context;

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,6 @@ struct amdgpu_ring_funcs {
271271
int (*reset)(struct amdgpu_ring *ring, unsigned int vmid,
272272
struct amdgpu_fence *timedout_fence);
273273
void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
274-
bool (*is_guilty)(struct amdgpu_ring *ring);
275274
};
276275

277276
/**

drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

Lines changed: 1 addition & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,30 +1656,10 @@ static bool sdma_v4_4_2_is_queue_selected(struct amdgpu_device *adev, uint32_t i
16561656
return (context_status & SDMA_GFX_CONTEXT_STATUS__SELECTED_MASK) != 0;
16571657
}
16581658

1659-
static bool sdma_v4_4_2_ring_is_guilty(struct amdgpu_ring *ring)
1660-
{
1661-
struct amdgpu_device *adev = ring->adev;
1662-
uint32_t instance_id = ring->me;
1663-
1664-
return sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
1665-
}
1666-
1667-
static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring)
1668-
{
1669-
struct amdgpu_device *adev = ring->adev;
1670-
uint32_t instance_id = ring->me;
1671-
1672-
if (!adev->sdma.has_page_queue)
1673-
return false;
1674-
1675-
return sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
1676-
}
1677-
16781659
static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring,
16791660
unsigned int vmid,
16801661
struct amdgpu_fence *timedout_fence)
16811662
{
1682-
bool is_guilty = ring->funcs->is_guilty(ring);
16831663
struct amdgpu_device *adev = ring->adev;
16841664
u32 id = ring->me;
16851665
int r;
@@ -1690,13 +1670,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring,
16901670
amdgpu_amdkfd_suspend(adev, true);
16911671
r = amdgpu_sdma_reset_engine(adev, id);
16921672
amdgpu_amdkfd_resume(adev, true);
1693-
if (r)
1694-
return r;
1695-
1696-
if (is_guilty)
1697-
amdgpu_fence_driver_force_completion(ring);
1698-
1699-
return 0;
1673+
return r;
17001674
}
17011675

17021676
static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
@@ -2181,7 +2155,6 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
21812155
.emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
21822156
.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
21832157
.reset = sdma_v4_4_2_reset_queue,
2184-
.is_guilty = sdma_v4_4_2_ring_is_guilty,
21852158
};
21862159

21872160
static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
@@ -2214,7 +2187,6 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
22142187
.emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
22152188
.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
22162189
.reset = sdma_v4_4_2_reset_queue,
2217-
.is_guilty = sdma_v4_4_2_page_ring_is_guilty,
22182190
};
22192191

22202192
static void sdma_v4_4_2_set_ring_funcs(struct amdgpu_device *adev)

0 commit comments

Comments (0)