Skip to content

Commit bb34bc2

Browse files
Ma Jun authored and alexdeucher committed
drm/amdgpu: Fix the warning info in mode1 reset
Fix the warning info below during mode1 reset.

[ +0.000004] Call Trace:
[ +0.000004] <TASK>
[ +0.000006] ? show_regs+0x6e/0x80
[ +0.000011] ? __flush_work.isra.0+0x2e8/0x390
[ +0.000005] ? __warn+0x91/0x150
[ +0.000009] ? __flush_work.isra.0+0x2e8/0x390
[ +0.000006] ? report_bug+0x19d/0x1b0
[ +0.000013] ? handle_bug+0x46/0x80
[ +0.000012] ? exc_invalid_op+0x1d/0x80
[ +0.000011] ? asm_exc_invalid_op+0x1f/0x30
[ +0.000014] ? __flush_work.isra.0+0x2e8/0x390
[ +0.000007] ? __flush_work.isra.0+0x208/0x390
[ +0.000007] ? _prb_read_valid+0x216/0x290
[ +0.000008] __cancel_work_timer+0x11d/0x1a0
[ +0.000007] ? try_to_grab_pending+0xe8/0x190
[ +0.000012] cancel_work_sync+0x14/0x20
[ +0.000008] amddrm_sched_stop+0x3c/0x1d0 [amd_sched]
[ +0.000032] amdgpu_device_gpu_recover+0x29a/0xe90 [amdgpu]

This warning info was printed after applying the patch "drm/sched: Convert drm scheduler to use a work queue rather than kthread". The root cause is that the amdgpu driver tries to use the uninitialized work_struct in the struct drm_gpu_scheduler.

v2:
- Rename the function to amdgpu_ring_sched_ready and move it to amdgpu_ring.c (Alex)

v3:
- Fix a few more checks based on Vitaly's patch (Alex)

v4:
- squash in fix noticed by Bert in https://gitlab.freedesktop.org/drm/amd/-/issues/3139

Fixes: 11b3b9f ("drm/sched: Check scheduler ready before calling timeout handling")
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com>
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent faf51b2 commit bb34bc2

5 files changed

Lines changed: 24 additions & 12 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -290,7 +290,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus
290290
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
291291
struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
292292

293-
if (!(ring && drm_sched_wqueue_ready(&ring->sched)))
293+
if (!amdgpu_ring_sched_ready(ring))
294294
continue;
295295

296296
/* stop secheduler and drain ring. */

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1678,7 +1678,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
16781678
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
16791679
struct amdgpu_ring *ring = adev->rings[i];
16801680

1681-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
1681+
if (!amdgpu_ring_sched_ready(ring))
16821682
continue;
16831683
drm_sched_wqueue_stop(&ring->sched);
16841684
}
@@ -1694,7 +1694,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
16941694
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
16951695
struct amdgpu_ring *ring = adev->rings[i];
16961696

1697-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
1697+
if (!amdgpu_ring_sched_ready(ring))
16981698
continue;
16991699
drm_sched_wqueue_start(&ring->sched);
17001700
}
@@ -1916,8 +1916,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
19161916

19171917
ring = adev->rings[val];
19181918

1919-
if (!ring || !ring->funcs->preempt_ib ||
1920-
!drm_sched_wqueue_ready(&ring->sched))
1919+
if (!amdgpu_ring_sched_ready(ring) ||
1920+
!ring->funcs->preempt_ib)
19211921
return -EINVAL;
19221922

19231923
/* the last preemption failed */

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5021,7 +5021,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
50215021
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
50225022
struct amdgpu_ring *ring = adev->rings[i];
50235023

5024-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
5024+
if (!amdgpu_ring_sched_ready(ring))
50255025
continue;
50265026

50275027
spin_lock(&ring->sched.job_list_lock);
@@ -5160,7 +5160,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
51605160
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
51615161
struct amdgpu_ring *ring = adev->rings[i];
51625162

5163-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
5163+
if (!amdgpu_ring_sched_ready(ring))
51645164
continue;
51655165

51665166
/* Clear job fence from fence drv to avoid force_completion
@@ -5627,7 +5627,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56275627
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
56285628
struct amdgpu_ring *ring = tmp_adev->rings[i];
56295629

5630-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
5630+
if (!amdgpu_ring_sched_ready(ring))
56315631
continue;
56325632

56335633
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5696,7 +5696,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56965696
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
56975697
struct amdgpu_ring *ring = tmp_adev->rings[i];
56985698

5699-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
5699+
if (!amdgpu_ring_sched_ready(ring))
57005700
continue;
57015701

57025702
drm_sched_start(&ring->sched, true);
@@ -6051,7 +6051,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
60516051
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
60526052
struct amdgpu_ring *ring = adev->rings[i];
60536053

6054-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
6054+
if (!amdgpu_ring_sched_ready(ring))
60556055
continue;
60566056

60576057
drm_sched_stop(&ring->sched, NULL);
@@ -6179,7 +6179,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
61796179
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
61806180
struct amdgpu_ring *ring = adev->rings[i];
61816181

6182-
if (!ring || !drm_sched_wqueue_ready(&ring->sched))
6182+
if (!amdgpu_ring_sched_ready(ring))
61836183
continue;
61846184

61856185
drm_sched_start(&ring->sched, true);

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -635,6 +635,7 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
635635
ring->name);
636636

637637
ring->sched.ready = !r;
638+
638639
return r;
639640
}
640641

@@ -717,3 +718,14 @@ void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring)
717718
if (ring->is_sw_ring)
718719
amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_DE);
719720
}
721+
722+
bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
723+
{
724+
if (!ring)
725+
return false;
726+
727+
if (ring->no_scheduler || !drm_sched_wqueue_ready(&ring->sched))
728+
return false;
729+
730+
return true;
731+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -450,5 +450,5 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
450450
int amdgpu_ib_pool_init(struct amdgpu_device *adev);
451451
void amdgpu_ib_pool_fini(struct amdgpu_device *adev);
452452
int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
453-
453+
bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
454454
#endif

0 commit comments

Comments
 (0)