@@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
60196019 return ret ;
60206020}
60216021
6022- static int amdgpu_device_halt_activities (struct amdgpu_device * adev ,
6023- struct amdgpu_job * job ,
6024- struct amdgpu_reset_context * reset_context ,
6025- struct list_head * device_list ,
6026- struct amdgpu_hive_info * hive ,
6027- bool need_emergency_restart )
6022+ static int amdgpu_device_recovery_prepare (struct amdgpu_device * adev ,
6023+ struct list_head * device_list ,
6024+ struct amdgpu_hive_info * hive )
60286025{
6029- struct list_head * device_list_handle = NULL ;
60306026 struct amdgpu_device * tmp_adev = NULL ;
6031- int i , r = 0 ;
6027+ int r ;
60326028
60336029 /*
60346030 * Build list of devices to reset.
@@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
60456041 }
60466042 if (!list_is_first (& adev -> reset_list , device_list ))
60476043 list_rotate_to_front (& adev -> reset_list , device_list );
6048- device_list_handle = device_list ;
60496044 } else {
60506045 list_add_tail (& adev -> reset_list , device_list );
6051- device_list_handle = device_list ;
60526046 }
60536047
60546048 if (!amdgpu_sriov_vf (adev ) && (!adev -> pcie_reset_ctx .occurs_dpc )) {
6055- r = amdgpu_device_health_check (device_list_handle );
6049+ r = amdgpu_device_health_check (device_list );
60566050 if (r )
60576051 return r ;
60586052 }
60596053
6060- /* We need to lock reset domain only once both for XGMI and single device */
6061- tmp_adev = list_first_entry (device_list_handle , struct amdgpu_device ,
6062- reset_list );
6054+ return 0 ;
6055+ }
6056+
6057+ static void amdgpu_device_recovery_get_reset_lock (struct amdgpu_device * adev ,
6058+ struct list_head * device_list )
6059+ { /* Lock the reset domain of the first device on device_list; adev is not referenced in this body. */
6060+ struct amdgpu_device * tmp_adev = NULL ;
6061+
6062+ if (list_empty (device_list )) /* empty list: there is no device whose reset domain could be locked */
6063+ return ;
6064+ tmp_adev =
6065+ list_first_entry (device_list , struct amdgpu_device , reset_list ); /* the head entry supplies the reset domain */
60636066 amdgpu_device_lock_reset_domain (tmp_adev -> reset_domain );
6067+ }
60646068
6065- /* block all schedulers and reset given job's ring */
6066- list_for_each_entry (tmp_adev , device_list_handle , reset_list ) {
6069+ static void amdgpu_device_recovery_put_reset_lock (struct amdgpu_device * adev ,
6070+ struct list_head * device_list )
6071+ { /* Unlock the reset domain of the first device on device_list; adev is not referenced in this body. */
6072+ struct amdgpu_device * tmp_adev = NULL ;
60676073
6074+ if (list_empty (device_list )) /* empty list: there is no device whose reset domain could be unlocked */
6075+ return ;
6076+ tmp_adev =
6077+ list_first_entry (device_list , struct amdgpu_device , reset_list ); /* same head-entry selection as the get_reset_lock helper */
6078+ amdgpu_device_unlock_reset_domain (tmp_adev -> reset_domain );
6079+ }
6080+
6081+ static int amdgpu_device_halt_activities (
6082+ struct amdgpu_device * adev , struct amdgpu_job * job ,
6083+ struct amdgpu_reset_context * reset_context ,
6084+ struct list_head * device_list , struct amdgpu_hive_info * hive ,
6085+ bool need_emergency_restart )
6086+ {
6087+ struct amdgpu_device * tmp_adev = NULL ;
6088+ int i , r = 0 ;
6089+
6090+ /* block all schedulers and reset given job's ring */
6091+ list_for_each_entry (tmp_adev , device_list , reset_list ) {
60686092 amdgpu_device_set_mp1_state (tmp_adev );
60696093
60706094 /*
@@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
62526276 amdgpu_ras_set_error_query_ready (tmp_adev , true);
62536277
62546278 }
6255-
6256- tmp_adev = list_first_entry (device_list , struct amdgpu_device ,
6257- reset_list );
6258- amdgpu_device_unlock_reset_domain (tmp_adev -> reset_domain );
6259-
62606279}
62616280
62626281
@@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63246343 reset_context -> hive = hive ;
63256344 INIT_LIST_HEAD (& device_list );
63266345
6346+ if (amdgpu_device_recovery_prepare (adev , & device_list , hive ))
6347+ goto end_reset ;
6348+
6349+ /* The reset domain must be locked only once, for both the XGMI and the single-device case */
6350+ amdgpu_device_recovery_get_reset_lock (adev , & device_list );
6351+
63276352 r = amdgpu_device_halt_activities (adev , job , reset_context , & device_list ,
63286353 hive , need_emergency_restart );
63296354 if (r )
6330- goto end_reset ;
6355+ goto reset_unlock ;
63316356
63326357 if (need_emergency_restart )
63336358 goto skip_sched_resume ;
@@ -6337,21 +6362,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63376362 *
63386363 * job->base holds a reference to parent fence
63396364 */
6340- if (job && dma_fence_is_signaled (& job -> hw_fence )) {
6365+ if (job && dma_fence_is_signaled (& job -> hw_fence . base )) {
63416366 job_signaled = true;
63426367 dev_info (adev -> dev , "Guilty job already signaled, skipping HW reset" );
63436368 goto skip_hw_reset ;
63446369 }
63456370
63466371 r = amdgpu_device_asic_reset (adev , & device_list , reset_context );
63476372 if (r )
6348- goto end_reset ;
6373+ goto reset_unlock ;
63496374skip_hw_reset :
63506375 r = amdgpu_device_sched_resume (& device_list , reset_context , job_signaled );
63516376 if (r )
6352- goto end_reset ;
6377+ goto reset_unlock ;
63536378skip_sched_resume :
63546379 amdgpu_device_gpu_resume (adev , & device_list , need_emergency_restart );
6380+ reset_unlock :
6381+ amdgpu_device_recovery_put_reset_lock (adev , & device_list );
63556382end_reset :
63566383 if (hive ) {
63576384 mutex_unlock (& hive -> hive_lock );
@@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
67636790 memset (& reset_context , 0 , sizeof (reset_context ));
67646791 INIT_LIST_HEAD (& device_list );
67656792
6793+ amdgpu_device_recovery_prepare (adev , & device_list , hive );
6794+ amdgpu_device_recovery_get_reset_lock (adev , & device_list );
67666795 r = amdgpu_device_halt_activities (adev , NULL , & reset_context , & device_list ,
67676796 hive , false);
67686797 if (hive ) {
@@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
68806909 if (hive ) {
68816910 list_for_each_entry (tmp_adev , & device_list , reset_list )
68826911 amdgpu_device_unset_mp1_state (tmp_adev );
6883- amdgpu_device_unlock_reset_domain (adev -> reset_domain );
68846912 }
6913+ amdgpu_device_recovery_put_reset_lock (adev , & device_list );
68856914 }
68866915
68876916 if (hive ) {
@@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
69276956
69286957 amdgpu_device_sched_resume (& device_list , NULL , NULL );
69296958 amdgpu_device_gpu_resume (adev , & device_list , false);
6959+ amdgpu_device_recovery_put_reset_lock (adev , & device_list );
69306960 adev -> pcie_reset_ctx .occurs_dpc = false;
69316961
69326962 if (hive ) {
0 commit comments