@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
176176 if (amdgpu_bad_page_threshold != 0 ) {
177177 amdgpu_ras_add_bad_pages (adev , err_data .err_addr ,
178178 err_data .err_addr_cnt );
179- amdgpu_ras_save_bad_pages (adev );
179+ amdgpu_ras_save_bad_pages (adev , NULL );
180180 }
181181
182182 dev_warn (adev -> dev , "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n" );
@@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
20842084/*
20852085 * write error record array to eeprom, the function should be
20862086 * protected by recovery_lock
2087+ * new_cnt: newly added UE count, excluding reserved bad pages; may be NULL
20872088 */
2088- int amdgpu_ras_save_bad_pages (struct amdgpu_device * adev )
2089+ int amdgpu_ras_save_bad_pages (struct amdgpu_device * adev ,
2090+ unsigned long * new_cnt )
20892091{
20902092 struct amdgpu_ras * con = amdgpu_ras_get_context (adev );
20912093 struct ras_err_handler_data * data ;
20922094 struct amdgpu_ras_eeprom_control * control ;
20932095 int save_count ;
20942096
2095- if (!con || !con -> eh_data )
2097+ if (!con || !con -> eh_data ) {
2098+ if (new_cnt )
2099+ * new_cnt = 0 ;
2100+
20962101 return 0 ;
2102+ }
20972103
20982104 mutex_lock (& con -> recovery_lock );
20992105 control = & con -> eeprom_control ;
21002106 data = con -> eh_data ;
21012107 save_count = data -> count - control -> ras_num_recs ;
21022108 mutex_unlock (& con -> recovery_lock );
2109+
2110+ if (new_cnt )
2111+ * new_cnt = save_count / adev -> umc .retire_unit ;
2112+
21032113 /* only new entries are saved */
21042114 if (save_count > 0 ) {
21052115 if (amdgpu_ras_eeprom_append (control ,
@@ -2186,11 +2196,12 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
21862196 /*
21872197 * Justification of value bad_page_cnt_threshold in ras structure
21882198 *
2189- * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
2190- * in eeprom, and introduce two scenarios accordingly.
2199+ * Generally, either 0 <= amdgpu_bad_page_threshold <= max record
2200+ * length in eeprom, or amdgpu_bad_page_threshold == -2; two
2201+ * scenarios are introduced accordingly.
21912202 *
21922203 * Bad page retirement enablement:
2193- * - If amdgpu_bad_page_threshold = -1 ,
2204+ * - If amdgpu_bad_page_threshold = -2 ,
21942205 * bad_page_cnt_threshold = typical value by formula.
21952206 *
21962207 * - When the value from user is 0 < amdgpu_bad_page_threshold <
0 commit comments