Skip to content

Commit fb1c93c

Browse files
ChristianKoenigAMD authored and alexdeucher committed
drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2"
Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the HW in an active state and is an unbalanced use of the IP callbacks. Using the IP callbacks like this can lead to memory leaks, double frees and imbalanced reference counters. Leaving the HW in an active state can lead to DMA accesses to memory now freed by the driver. Both are a complete no-go for driver unload, so completely revert the workaround for now. This reverts commit f5c7e77. Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Cc: stable@vger.kernel.org
1 parent 3c4e4eb commit fb1c93c

4 files changed

Lines changed: 1 addition & 65 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 1 addition & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -5246,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52465246
struct amdgpu_device *tmp_adev = NULL;
52475247
bool need_full_reset, skip_hw_reset, vram_lost = false;
52485248
int r = 0;
5249-
bool gpu_reset_for_dev_remove = 0;
52505249

52515250
/* Try reset handler method first */
52525251
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5266,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
52665265
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
52675266
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
52685267

5269-
gpu_reset_for_dev_remove =
5270-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5271-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5272-
52735268
/*
52745269
* ASIC reset has to be done on all XGMI hive nodes ASAP
52755270
* to allow proper links negotiation in FW (within 1 sec)
@@ -5312,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
53125307
amdgpu_ras_intr_cleared();
53135308
}
53145309

5315-
/* Since the mode1 reset affects base ip blocks, the
5316-
* phase1 ip blocks need to be resumed. Otherwise there
5317-
* will be a BIOS signature error and the psp bootloader
5318-
* can't load kdb on the next amdgpu install.
5319-
*/
5320-
if (gpu_reset_for_dev_remove) {
5321-
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
5322-
amdgpu_device_ip_resume_phase1(tmp_adev);
5323-
5324-
goto end;
5325-
}
5326-
53275310
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
53285311
if (need_full_reset) {
53295312
/* post card */
@@ -5560,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
55605543
int i, r = 0;
55615544
bool need_emergency_restart = false;
55625545
bool audio_suspended = false;
5563-
bool gpu_reset_for_dev_remove = false;
5564-
5565-
gpu_reset_for_dev_remove =
5566-
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5567-
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
55685546

55695547
/*
55705548
* Special case: RAS triggered and full reset isn't supported
@@ -5602,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56025580
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
56035581
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
56045582
list_add_tail(&tmp_adev->reset_list, &device_list);
5605-
if (gpu_reset_for_dev_remove && adev->shutdown)
5583+
if (adev->shutdown)
56065584
tmp_adev->shutdown = true;
56075585
}
56085586
if (!list_is_first(&adev->reset_list, &device_list))
@@ -5687,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
56875665

56885666
retry: /* Rest of adevs pre asic reset from XGMI hive. */
56895667
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5690-
if (gpu_reset_for_dev_remove) {
5691-
/* Workaroud for ASICs need to disable SMC first */
5692-
amdgpu_device_smu_fini_early(tmp_adev);
5693-
}
56945668
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
56955669
/*TODO Should we stop ?*/
56965670
if (r) {
@@ -5722,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57225696
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
57235697
if (r && r == -EAGAIN)
57245698
goto retry;
5725-
5726-
if (!r && gpu_reset_for_dev_remove)
5727-
goto recover_end;
57285699
}
57295700

57305701
skip_hw_reset:
@@ -5780,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
57805751
amdgpu_ras_set_error_query_ready(tmp_adev, true);
57815752
}
57825753

5783-
recover_end:
57845754
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
57855755
reset_list);
57865756
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 0 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -2337,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
23372337
pm_runtime_forbid(dev->dev);
23382338
}
23392339

2340-
if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
2341-
!amdgpu_sriov_vf(adev)) {
2342-
bool need_to_reset_gpu = false;
2343-
2344-
if (adev->gmc.xgmi.num_physical_nodes > 1) {
2345-
struct amdgpu_hive_info *hive;
2346-
2347-
hive = amdgpu_get_xgmi_hive(adev);
2348-
if (hive->device_remove_count == 0)
2349-
need_to_reset_gpu = true;
2350-
hive->device_remove_count++;
2351-
amdgpu_put_xgmi_hive(hive);
2352-
} else {
2353-
need_to_reset_gpu = true;
2354-
}
2355-
2356-
/* Workaround for ASICs need to reset SMU.
2357-
* Called only when the first device is removed.
2358-
*/
2359-
if (need_to_reset_gpu) {
2360-
struct amdgpu_reset_context reset_context;
2361-
2362-
adev->shutdown = true;
2363-
memset(&reset_context, 0, sizeof(reset_context));
2364-
reset_context.method = AMD_RESET_METHOD_NONE;
2365-
reset_context.reset_req_dev = adev;
2366-
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2367-
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
2368-
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
2369-
}
2370-
}
2371-
23722340
amdgpu_driver_unload_kms(dev);
23732341

23742342
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,6 @@ enum AMDGPU_RESET_FLAGS {
3232

3333
AMDGPU_NEED_FULL_RESET = 0,
3434
AMDGPU_SKIP_HW_RESET = 1,
35-
AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,
3635
};
3736

3837
struct amdgpu_reset_context {

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,6 @@ struct amdgpu_hive_info {
4343
} pstate;
4444

4545
struct amdgpu_reset_domain *reset_domain;
46-
uint32_t device_remove_count;
4746
atomic_t ras_recovery;
4847
};
4948

0 commit comments

Comments
 (0)