Skip to content

Commit d95ca7f

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: suspend ras module before gpu reset
During gpu reset, all GPU-related resources are inaccessible. To avoid affecting ras functionality, suspend ras module before gpu reset and resume it after gpu reset is complete. V2: Rename functions to avoid misunderstanding. V3: Move flush_delayed_work to amdgpu_ras_process_pause, Move schedule_delayed_work to amdgpu_ras_process_unpause. V4: Rename functions. V5: Move the function to amdgpu_ras.c. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Acked-by: Lijo Lazar <lijo.lazar@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent d4432f1 commit d95ca7f

10 files changed

Lines changed: 148 additions & 2 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171

7272
#include "amdgpu_xgmi.h"
7373
#include "amdgpu_ras.h"
74+
#include "amdgpu_ras_mgr.h"
7475
#include "amdgpu_pmu.h"
7576
#include "amdgpu_fru_eeprom.h"
7677
#include "amdgpu_reset.h"
@@ -6660,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
66606661
goto end_reset;
66616662
}
66626663

6664+
/* Cannot be called after locking reset domain */
6665+
amdgpu_ras_pre_reset(adev, &device_list);
6666+
66636667
/* We need to lock reset domain only once both for XGMI and single device */
66646668
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
66656669

@@ -6691,6 +6695,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
66916695
reset_unlock:
66926696
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
66936697
end_reset:
6698+
amdgpu_ras_post_reset(adev, &device_list);
66946699
if (hive) {
66956700
mutex_unlock(&hive->hive_lock);
66966701
amdgpu_put_xgmi_hive(hive);

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2921,8 +2921,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
29212921
type = amdgpu_ras_get_fatal_error_event(adev);
29222922
list_for_each_entry(remote_adev,
29232923
device_list_handle, gmc.xgmi.head) {
2924-
amdgpu_ras_query_err_status(remote_adev);
2925-
amdgpu_ras_log_on_err_counter(remote_adev, type);
2924+
if (amdgpu_uniras_enabled(remote_adev)) {
2925+
amdgpu_ras_mgr_update_ras_ecc(remote_adev);
2926+
} else {
2927+
amdgpu_ras_query_err_status(remote_adev);
2928+
amdgpu_ras_log_on_err_counter(remote_adev, type);
2929+
}
29262930
}
29272931

29282932
}
@@ -5673,3 +5677,25 @@ bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr
56735677

56745678
return ret;
56755679
}
5680+
5681+
void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
5682+
struct list_head *device_list)
5683+
{
5684+
struct amdgpu_device *tmp_adev = NULL;
5685+
5686+
list_for_each_entry(tmp_adev, device_list, reset_list) {
5687+
if (amdgpu_uniras_enabled(tmp_adev))
5688+
amdgpu_ras_mgr_pre_reset(tmp_adev);
5689+
}
5690+
}
5691+
5692+
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
5693+
struct list_head *device_list)
5694+
{
5695+
struct amdgpu_device *tmp_adev = NULL;
5696+
5697+
list_for_each_entry(tmp_adev, device_list, reset_list) {
5698+
if (amdgpu_uniras_enabled(tmp_adev))
5699+
amdgpu_ras_mgr_post_reset(tmp_adev);
5700+
}
5701+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,4 +1039,9 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
10391039
const char *fmt, ...);
10401040

10411041
bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
1042+
1043+
void amdgpu_ras_pre_reset(struct amdgpu_device *adev,
1044+
struct list_head *device_list);
1045+
void amdgpu_ras_post_reset(struct amdgpu_device *adev,
1046+
struct list_head *device_list);
10421047
#endif

drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,3 +624,25 @@ int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
624624

625625
return ret;
626626
}
627+
628+
int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
629+
{
630+
if (!amdgpu_ras_mgr_is_ready(adev)) {
631+
RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
632+
return -EPERM;
633+
}
634+
635+
amdgpu_ras_process_pre_reset(adev);
636+
return 0;
637+
}
638+
639+
int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
640+
{
641+
if (!amdgpu_ras_mgr_is_ready(adev)) {
642+
RAS_DEV_ERR(adev, "Invalid ras resume!\n");
643+
return -EPERM;
644+
}
645+
646+
amdgpu_ras_process_post_reset(adev);
647+
return 0;
648+
}

drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ struct amdgpu_ras_mgr {
5252
struct ras_event_manager ras_event_mgr;
5353
uint64_t last_poison_consumption_seqno;
5454
bool ras_is_ready;
55+
56+
bool is_paused;
57+
struct completion ras_event_done;
5558
};
5659

5760
extern const struct amdgpu_ip_block_version ras_v1_0_ip_block;
@@ -75,4 +78,6 @@ bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
7578
int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
7679
uint32_t cmd_id, void *input, uint32_t input_size,
7780
void *output, uint32_t out_size);
81+
int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev);
82+
int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev);
7883
#endif

drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "amdgpu_ras_process.h"
3030

3131
#define RAS_MGR_RETIRE_PAGE_INTERVAL 100
32+
#define RAS_EVENT_PROCESS_TIMEOUT 1200
3233

3334
static void ras_process_retire_page_dwork(struct work_struct *work)
3435
{
@@ -57,6 +58,9 @@ int amdgpu_ras_process_init(struct amdgpu_device *adev)
5758
{
5859
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
5960

61+
ras_mgr->is_paused = false;
62+
init_completion(&ras_mgr->ras_event_done);
63+
6064
INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);
6165

6266
return 0;
@@ -66,6 +70,7 @@ int amdgpu_ras_process_fini(struct amdgpu_device *adev)
6670
{
6771
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
6872

73+
ras_mgr->is_paused = false;
6974
/* Save all cached bad pages to eeprom */
7075
flush_delayed_work(&ras_mgr->retire_page_dwork);
7176
cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
@@ -124,3 +129,62 @@ int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
124129

125130
return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
126131
}
132+
133+
int amdgpu_ras_process_begin(struct amdgpu_device *adev)
134+
{
135+
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
136+
137+
if (ras_mgr->is_paused)
138+
return -EAGAIN;
139+
140+
reinit_completion(&ras_mgr->ras_event_done);
141+
return 0;
142+
}
143+
144+
int amdgpu_ras_process_end(struct amdgpu_device *adev)
145+
{
146+
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
147+
148+
complete(&ras_mgr->ras_event_done);
149+
return 0;
150+
}
151+
152+
int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
153+
{
154+
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
155+
long rc;
156+
157+
if (!ras_mgr || !ras_mgr->ras_core)
158+
return -EINVAL;
159+
160+
if (!ras_mgr->ras_core->is_initialized)
161+
return -EPERM;
162+
163+
ras_mgr->is_paused = true;
164+
165+
/* Wait for RAS event processing to complete */
166+
rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
167+
msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
168+
if (rc <= 0)
169+
RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n",
170+
rc ? "interrupted" : "timeout");
171+
172+
flush_delayed_work(&ras_mgr->retire_page_dwork);
173+
return 0;
174+
}
175+
176+
int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
177+
{
178+
struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
179+
180+
if (!ras_mgr || !ras_mgr->ras_core)
181+
return -EINVAL;
182+
183+
if (!ras_mgr->ras_core->is_initialized)
184+
return -EPERM;
185+
186+
ras_mgr->is_paused = false;
187+
188+
schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
189+
return 0;
190+
}

drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,8 @@ int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev,
3434
void *data);
3535
int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
3636
void *data);
37+
int amdgpu_ras_process_begin(struct amdgpu_device *adev);
38+
int amdgpu_ras_process_end(struct amdgpu_device *adev);
39+
int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev);
40+
int amdgpu_ras_process_post_reset(struct amdgpu_device *adev);
3741
#endif

drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,12 @@ static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
142142
case RAS_EVENT_ID__RESET_GPU:
143143
ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
144144
break;
145+
case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
146+
ret = amdgpu_ras_process_begin(ras_core->dev);
147+
break;
148+
case RAS_EVENT_ID__RAS_EVENT_PROC_END:
149+
ret = amdgpu_ras_process_end(ras_core->dev);
150+
break;
145151
default:
146152
RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
147153
break;

drivers/gpu/drm/amd/ras/rascore/ras.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ enum ras_notify_event {
115115
RAS_EVENT_ID__FATAL_ERROR_DETECTED,
116116
RAS_EVENT_ID__RESET_GPU,
117117
RAS_EVENT_ID__RESET_VF,
118+
RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN,
119+
RAS_EVENT_ID__RAS_EVENT_PROC_END,
118120
};
119121

120122
enum ras_gpu_status {

drivers/gpu/drm/amd/ras/rascore/ras_process.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,11 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
162162
uint32_t umc_event_count;
163163
int ret;
164164

165+
ret = ras_core_event_notify(ras_core,
166+
RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
167+
if (ret)
168+
return ret;
169+
165170
ras_aca_clear_fatal_flag(ras_core);
166171
ras_umc_log_pending_bad_bank(ras_core);
167172

@@ -185,6 +190,8 @@ int ras_process_handle_ras_event(struct ras_core_context *ras_core)
185190
atomic_set(&ras_proc->umc_interrupt_count, 0);
186191
}
187192

193+
ras_core_event_notify(ras_core,
194+
RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
188195
return ret;
189196
}
190197

0 commit comments

Comments
 (0)