Skip to content

Commit 4d33e0f

Browse files
Tao Zhou authored and alexdeucher committed
drm/amdgpu: exclude duplicate pages from UMC RAS UE count
If a UMC bad page is reserved but not freed by an application, the application may trigger uncorrectable errors repeatedly by accessing the page. v2: add a specific function to do the check. v3: remove duplicate pages, calculate the number of newly added bad pages. v4: reuse save_bad_pages to calculate the number of newly added bad pages. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent e69c785 commit 4d33e0f

3 files changed

Lines changed: 17 additions & 6 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
176176
if (amdgpu_bad_page_threshold != 0) {
177177
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
178178
err_data.err_addr_cnt);
179-
amdgpu_ras_save_bad_pages(adev);
179+
amdgpu_ras_save_bad_pages(adev, NULL);
180180
}
181181

182182
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
@@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
20842084
/*
20852085
* write error record array to eeprom, the function should be
20862086
* protected by recovery_lock
2087+
* new_cnt: new added UE count, excluding reserved bad pages, can be NULL
20872088
*/
2088-
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
2089+
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
2090+
unsigned long *new_cnt)
20892091
{
20902092
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
20912093
struct ras_err_handler_data *data;
20922094
struct amdgpu_ras_eeprom_control *control;
20932095
int save_count;
20942096

2095-
if (!con || !con->eh_data)
2097+
if (!con || !con->eh_data) {
2098+
if (new_cnt)
2099+
*new_cnt = 0;
2100+
20962101
return 0;
2102+
}
20972103

20982104
mutex_lock(&con->recovery_lock);
20992105
control = &con->eeprom_control;
21002106
data = con->eh_data;
21012107
save_count = data->count - control->ras_num_recs;
21022108
mutex_unlock(&con->recovery_lock);
2109+
2110+
if (new_cnt)
2111+
*new_cnt = save_count / adev->umc.retire_unit;
2112+
21032113
/* only new entries are saved */
21042114
if (save_count > 0) {
21052115
if (amdgpu_ras_eeprom_append(control,

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
547547
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
548548
struct eeprom_table_record *bps, int pages);
549549

550-
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
550+
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
551+
unsigned long *new_cnt);
551552

552553
static inline enum ta_ras_block
553554
amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
6868
if (amdgpu_bad_page_threshold != 0) {
6969
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
7070
err_data.err_addr_cnt);
71-
amdgpu_ras_save_bad_pages(adev);
71+
amdgpu_ras_save_bad_pages(adev, NULL);
7272
}
7373

7474
out:
@@ -147,7 +147,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
147147
err_data->err_addr_cnt) {
148148
amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
149149
err_data->err_addr_cnt);
150-
amdgpu_ras_save_bad_pages(adev);
150+
amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
151151

152152
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
153153

0 commit comments

Comments
 (0)