Skip to content

Commit 0bc3137

Browse files
Stanley.Yang authored and
alexdeucher committed
drm/amdgpu: Set EEPROM ras info
Set EEPROM ras info: rma status, health percent and bad page threshold. Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 7c2551f commit 0bc3137

2 files changed

Lines changed: 29 additions & 0 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
406406
{
407407
struct amdgpu_device *adev = to_amdgpu_device(control);
408408
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
409+
struct amdgpu_ras_eeprom_table_ras_info *rai = &control->tbl_rai;
409410
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
410411
u8 csum;
411412
int res;
@@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
423424
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
424425
hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
425426
RAS_TABLE_V2_1_INFO_SIZE;
427+
rai->rma_status = GPU_HEALTH_USABLE;
428+
/**
429+
* GPU health represented as a percentage.
430+
* 0 means worst health, 100 means fully healthy.
431+
*/
432+
rai->health_percent = 100;
433+
/* ecc_page_threshold = 0 means disable bad page retirement */
434+
rai->ecc_page_threshold = con->bad_page_cnt_threshold;
426435
} else {
427436
hdr->first_rec_offset = RAS_RECORD_START;
428437
hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
@@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
712721
"Saved bad pages %d reaches threshold value %d\n",
713722
control->ras_num_recs, ras->bad_page_cnt_threshold);
714723
control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
724+
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
725+
control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
726+
control->tbl_rai.health_percent = 0;
727+
}
715728
}
716729

717730
if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
@@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
749762
goto Out;
750763
}
751764

765+
/**
766+
* bad page records have been stored in eeprom,
767+
* now calculate gpu health percent
768+
*/
769+
if (amdgpu_bad_page_threshold != 0 &&
770+
control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
771+
control->ras_num_recs < ras->bad_page_cnt_threshold)
772+
control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
773+
control->ras_num_recs) * 100) /
774+
ras->bad_page_cnt_threshold;
775+
752776
/* Recalc the checksum.
753777
*/
754778
csum = 0;

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@
3131

3232
struct amdgpu_device;
3333

34+
enum amdgpu_ras_gpu_health_status { /* values assigned to the rma_status field of the table RAS info (tbl_rai) */
35+
GPU_HEALTH_USABLE = 0, /* set when the EEPROM table is reset for a V2.1 table */
36+
GPU_RETIRED__ECC_REACH_THRESHOLD = 2, /* set when the saved bad page count reaches the threshold; value 1 is not defined here */
37+
};
38+
3439
enum amdgpu_ras_eeprom_err_type {
3540
AMDGPU_RAS_EEPROM_ERR_NA,
3641
AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,

0 commit comments

Comments
 (0)