Skip to content

Commit b57c4ec

Browse files
Lijo Lazar authored and alexdeucher committed
drm/amdgpu: Fix error handling in slot reset
If the device has not recovered after slot reset is called, execution goes to the out label for error handling. There it could make a decision based on an uninitialized hive pointer, which could result in accessing an uninitialized list. Initialize the list and hive properly so that the error situation is handled and the reset domain lock, which is acquired during the error_detected callback, is also released.

Fixes: 732c6ce ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset")
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Ce Sun <cesun102@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit bb71362)
1 parent a5fe1a5 commit b57c4ec

1 file changed

Lines changed: 10 additions & 7 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -7059,6 +7059,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
70597059
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
70607060

70617061
memset(&reset_context, 0, sizeof(reset_context));
7062+
INIT_LIST_HEAD(&device_list);
7063+
hive = amdgpu_get_xgmi_hive(adev);
7064+
if (hive) {
7065+
mutex_lock(&hive->hive_lock);
7066+
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
7067+
list_add_tail(&tmp_adev->reset_list, &device_list);
7068+
} else {
7069+
list_add_tail(&adev->reset_list, &device_list);
7070+
}
70627071

70637072
if (adev->pcie_reset_ctx.swus)
70647073
link_dev = adev->pcie_reset_ctx.swus;
@@ -7099,19 +7108,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
70997108
reset_context.reset_req_dev = adev;
71007109
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
71017110
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
7102-
INIT_LIST_HEAD(&device_list);
71037111

7104-
hive = amdgpu_get_xgmi_hive(adev);
71057112
if (hive) {
7106-
mutex_lock(&hive->hive_lock);
71077113
reset_context.hive = hive;
7108-
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
7114+
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
71097115
tmp_adev->pcie_reset_ctx.in_link_reset = true;
7110-
list_add_tail(&tmp_adev->reset_list, &device_list);
7111-
}
71127116
} else {
71137117
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
7114-
list_add_tail(&adev->reset_list, &device_list);
71157118
}
71167119

71177120
r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);

0 commit comments

Comments
 (0)