@@ -406,6 +406,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
406406{
407407 struct amdgpu_device * adev = to_amdgpu_device (control );
408408 struct amdgpu_ras_eeprom_table_header * hdr = & control -> tbl_hdr ;
409+ struct amdgpu_ras_eeprom_table_ras_info * rai = & control -> tbl_rai ;
409410 struct amdgpu_ras * con = amdgpu_ras_get_context (adev );
410411 u8 csum ;
411412 int res ;
@@ -423,6 +424,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
423424 hdr -> first_rec_offset = RAS_RECORD_START_V2_1 ;
424425 hdr -> tbl_size = RAS_TABLE_HEADER_SIZE +
425426 RAS_TABLE_V2_1_INFO_SIZE ;
427+ rai -> rma_status = GPU_HEALTH_USABLE ;
428+ /**
429+ * GPU health represented as a percentage.
430+ * 0 means worst health, 100 means fully healthy.
431+ */
432+ rai -> health_percent = 100 ;
433+ /* ecc_page_threshold = 0 means disable bad page retirement */
434+ rai -> ecc_page_threshold = con -> bad_page_cnt_threshold ;
426435 } else {
427436 hdr -> first_rec_offset = RAS_RECORD_START ;
428437 hdr -> tbl_size = RAS_TABLE_HEADER_SIZE ;
@@ -712,6 +721,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
712721 "Saved bad pages %d reaches threshold value %d\n" ,
713722 control -> ras_num_recs , ras -> bad_page_cnt_threshold );
714723 control -> tbl_hdr .header = RAS_TABLE_HDR_BAD ;
724+ if (control -> tbl_hdr .version == RAS_TABLE_VER_V2_1 ) {
725+ control -> tbl_rai .rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD ;
726+ control -> tbl_rai .health_percent = 0 ;
727+ }
715728 }
716729
717730 if (control -> tbl_hdr .version == RAS_TABLE_VER_V2_1 )
@@ -749,6 +762,17 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
749762 goto Out ;
750763 }
751764
765+ /**
766+ * bad page records have been stored in eeprom,
767+ * now calculate gpu health percent
768+ */
769+ if (amdgpu_bad_page_threshold != 0 &&
770+ control -> tbl_hdr .version == RAS_TABLE_VER_V2_1 &&
771+ control -> ras_num_recs < ras -> bad_page_cnt_threshold )
772+ control -> tbl_rai .health_percent = ((ras -> bad_page_cnt_threshold -
773+ control -> ras_num_recs ) * 100 ) /
774+ ras -> bad_page_cnt_threshold ;
775+
752776 /* Recalc the checksum.
753777 */
754778 csum = 0 ;
0 commit comments