@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
124124
125125#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
126126
/*
 * Initial value of the retry counter that bounds the flush/re-schedule loop
 * over page_retirement_dwork in amdgpu_ras_recovery_fini().
 */
127+ #define MAX_FLUSH_RETIRE_DWORK_TIMES 100
128+
127129enum amdgpu_ras_retire_page_reservation {
128130 AMDGPU_RAS_RETIRE_PAGE_RESERVED ,
129131 AMDGPU_RAS_RETIRE_PAGE_PENDING ,
@@ -2907,6 +2909,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
29072909 ecc_log -> prev_de_queried_count = 0 ;
29082910}
29092911
2912+ static bool amdgpu_ras_schedule_retirement_dwork (struct amdgpu_ras * con ,
2913+ uint32_t delayed_ms )
2914+ {
2915+ int ret ;
2916+
2917+ mutex_lock (& con -> umc_ecc_log .lock );
2918+ ret = radix_tree_tagged (& con -> umc_ecc_log .de_page_tree ,
2919+ UMC_ECC_NEW_DETECTED_TAG );
2920+ mutex_unlock (& con -> umc_ecc_log .lock );
2921+
2922+ if (ret )
2923+ schedule_delayed_work (& con -> page_retirement_dwork ,
2924+ msecs_to_jiffies (delayed_ms ));
2925+
2926+ return ret ? true : false;
2927+ }
2928+
29102929static void amdgpu_ras_do_page_retirement (struct work_struct * work )
29112930{
29122931 struct amdgpu_ras * con = container_of (work , struct amdgpu_ras ,
@@ -2928,12 +2947,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
29282947 if (err_cnt && con -> is_rma )
29292948 amdgpu_ras_reset_gpu (adev );
29302949
2931- mutex_lock (& con -> umc_ecc_log .lock );
2932- if (radix_tree_tagged (& con -> umc_ecc_log .de_page_tree ,
2933- UMC_ECC_NEW_DETECTED_TAG ))
2934- schedule_delayed_work (& con -> page_retirement_dwork ,
2935- msecs_to_jiffies (AMDGPU_RAS_RETIRE_PAGE_INTERVAL ));
2936- mutex_unlock (& con -> umc_ecc_log .lock );
2950+ amdgpu_ras_schedule_retirement_dwork (con ,
2951+ AMDGPU_RAS_RETIRE_PAGE_INTERVAL );
29372952}
29382953
29392954static int amdgpu_ras_poison_creation_handler (struct amdgpu_device * adev ,
@@ -3237,11 +3252,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
32373252{
32383253 struct amdgpu_ras * con = amdgpu_ras_get_context (adev );
32393254 struct ras_err_handler_data * data = con -> eh_data ;
3255+ int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES ;
3256+ bool ret ;
32403257
32413258 /* recovery_init failed to init it, fini is useless */
32423259 if (!data )
32433260 return 0 ;
32443261
3262+ /* Save all cached bad pages to eeprom */
3263+ do {
3264+ flush_delayed_work (& con -> page_retirement_dwork );
3265+ ret = amdgpu_ras_schedule_retirement_dwork (con , 0 );
3266+ } while (ret && max_flush_timeout -- );
3267+
32453268 if (con -> page_retirement_thread )
32463269 kthread_stop (con -> page_retirement_thread );
32473270
0 commit comments