Skip to content

Commit 22106ed

Browse files
Tao Zhou authored and alexdeucher committed
drm/amdgpu: add bad_page_threshold check in ras_eeprom_check_err
bad_page_threshold controls page retirement behavior, so it should also be checked. v2: simplify the condition of the bad page handling path. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent f3cbe70 commit 22106ed

1 file changed

Lines changed: 14 additions & 5 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -417,7 +417,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
417417
{
418418
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
419419

420-
if (!__is_ras_eeprom_supported(adev))
420+
if (!__is_ras_eeprom_supported(adev) ||
421+
!amdgpu_bad_page_threshold)
421422
return false;
422423

423424
/* skip check eeprom table for VEGA20 Gaming */
@@ -428,10 +429,18 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
428429
return false;
429430

430431
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
431-
dev_warn(adev->dev, "This GPU is in BAD status.");
432-
dev_warn(adev->dev, "Please retire it or set a larger "
433-
"threshold value when reloading driver.\n");
434-
return true;
432+
if (amdgpu_bad_page_threshold == -1) {
433+
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
434+
con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
435+
dev_warn(adev->dev,
436+
"But GPU can be operated due to bad_page_threshold = -1.\n");
437+
return false;
438+
} else {
439+
dev_warn(adev->dev, "This GPU is in BAD status.");
440+
dev_warn(adev->dev, "Please retire it or set a larger "
441+
"threshold value when reloading driver.\n");
442+
return true;
443+
}
435444
}
436445

437446
return false;

0 commit comments

Comments
 (0)