7171
7272#include "amdgpu_xgmi.h"
7373#include "amdgpu_ras.h"
74+ #include "amdgpu_ras_mgr.h"
7475#include "amdgpu_pmu.h"
7576#include "amdgpu_fru_eeprom.h"
7677#include "amdgpu_reset.h"
@@ -179,6 +180,10 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
179180 BIT (AMD_IP_BLOCK_TYPE_PSP )
180181};
181182
183+ static int amdgpu_device_ip_resume_phase1 (struct amdgpu_device * adev );
184+ static int amdgpu_device_ip_resume_phase2 (struct amdgpu_device * adev );
185+ static int amdgpu_device_ip_resume_phase3 (struct amdgpu_device * adev );
186+
182187static void amdgpu_device_load_switch_state (struct amdgpu_device * adev );
183188
184189static inline bool amdgpu_ip_member_of_hwini (struct amdgpu_device * adev ,
@@ -2494,6 +2499,7 @@ static const char *ip_block_names[] = {
24942499 [AMD_IP_BLOCK_TYPE_VPE ] = "vpe" ,
24952500 [AMD_IP_BLOCK_TYPE_UMSCH_MM ] = "umsch_mm" ,
24962501 [AMD_IP_BLOCK_TYPE_ISP ] = "isp" ,
2502+ [AMD_IP_BLOCK_TYPE_RAS ] = "ras" ,
24972503};
24982504
24992505static const char * ip_block_name (struct amdgpu_device * adev , enum amd_ip_block_type type )
@@ -3784,7 +3790,7 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
37843790 */
37853791static int amdgpu_device_ip_suspend_phase1 (struct amdgpu_device * adev )
37863792{
3787- int i , r ;
3793+ int i , r , rec ;
37883794
37893795 amdgpu_device_set_pg_state (adev , AMD_PG_STATE_UNGATE );
37903796 amdgpu_device_set_cg_state (adev , AMD_CG_STATE_UNGATE );
@@ -3807,10 +3813,23 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
38073813
38083814 r = amdgpu_ip_block_suspend (& adev -> ip_blocks [i ]);
38093815 if (r )
3810- return r ;
3816+ goto unwind ;
38113817 }
38123818
38133819 return 0 ;
3820+ unwind :
3821+ rec = amdgpu_device_ip_resume_phase3 (adev );
3822+ if (rec )
3823+ dev_err (adev -> dev ,
3824+ "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n" ,
3825+ rec );
3826+
3827+ amdgpu_dpm_set_df_cstate (adev , DF_CSTATE_ALLOW );
3828+
3829+ amdgpu_device_set_pg_state (adev , AMD_PG_STATE_GATE );
3830+ amdgpu_device_set_cg_state (adev , AMD_CG_STATE_GATE );
3831+
3832+ return r ;
38143833}
38153834
38163835/**
@@ -3826,7 +3845,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
38263845 */
38273846static int amdgpu_device_ip_suspend_phase2 (struct amdgpu_device * adev )
38283847{
3829- int i , r ;
3848+ int i , r , rec ;
38303849
38313850 if (adev -> in_s0ix )
38323851 amdgpu_dpm_gfx_state_change (adev , sGpuChangeState_D3Entry );
@@ -3889,7 +3908,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
38893908
38903909 r = amdgpu_ip_block_suspend (& adev -> ip_blocks [i ]);
38913910 if (r )
3892- return r ;
3911+ goto unwind ;
38933912
38943913 /* handle putting the SMC in the appropriate state */
38953914 if (!amdgpu_sriov_vf (adev )) {
@@ -3899,13 +3918,40 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
38993918 dev_err (adev -> dev ,
39003919 "SMC failed to set mp1 state %d, %d\n" ,
39013920 adev -> mp1_state , r );
3902- return r ;
3921+ goto unwind ;
39033922 }
39043923 }
39053924 }
39063925 }
39073926
39083927 return 0 ;
3928+ unwind :
3929+ /* undo suspend phase 2: resume phase 1, reload firmware, then resume phase 2 */
3930+ rec = amdgpu_device_ip_resume_phase1 (adev );
3931+ if (rec ) {
3932+ dev_err (adev -> dev ,
3933+ "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n" ,
3934+ rec );
3935+ return r ;
3936+ }
3937+
3938+ rec = amdgpu_device_fw_loading (adev );
3939+ if (rec ) {
3940+ dev_err (adev -> dev ,
3941+ "amdgpu_device_fw_loading failed during unwind: %d\n" ,
3942+ rec );
3943+ return r ;
3944+ }
3945+
3946+ rec = amdgpu_device_ip_resume_phase2 (adev );
3947+ if (rec ) {
3948+ dev_err (adev -> dev ,
3949+ "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n" ,
3950+ rec );
3951+ return r ;
3952+ }
3953+
3954+ return r ;
39093955}
39103956
39113957/**
@@ -4607,6 +4653,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
46074653 }
46084654
46094655 INIT_WORK (& adev -> xgmi_reset_work , amdgpu_device_xgmi_reset_func );
4656+ INIT_WORK (& adev -> userq_reset_work , amdgpu_userq_reset_work );
46104657
46114658 adev -> gfx .gfx_off_req_count = 1 ;
46124659 adev -> gfx .gfx_off_residency = 0 ;
@@ -5229,7 +5276,7 @@ void amdgpu_device_complete(struct drm_device *dev)
52295276int amdgpu_device_suspend (struct drm_device * dev , bool notify_clients )
52305277{
52315278 struct amdgpu_device * adev = drm_to_adev (dev );
5232- int r = 0 ;
5279+ int r , rec ;
52335280
52345281 if (dev -> switch_power_state == DRM_SWITCH_POWER_OFF )
52355282 return 0 ;
@@ -5245,8 +5292,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
52455292 return r ;
52465293 }
52475294
5248- if (amdgpu_acpi_smart_shift_update (adev , AMDGPU_SS_DEV_D3 ))
5249- dev_warn (adev -> dev , "smart shift update failed\n" );
5295+ r = amdgpu_acpi_smart_shift_update (adev , AMDGPU_SS_DEV_D3 );
5296+ if (r )
5297+ goto unwind_sriov ;
52505298
52515299 if (notify_clients )
52525300 drm_client_dev_suspend (adev_to_drm (adev ));
@@ -5257,33 +5305,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
52575305
52585306 r = amdgpu_device_ip_suspend_phase1 (adev );
52595307 if (r )
5260- return r ;
5308+ goto unwind_smartshift ;
52615309
52625310 amdgpu_amdkfd_suspend (adev , !amdgpu_sriov_vf (adev ) && !adev -> in_runpm );
52635311 r = amdgpu_userq_suspend (adev );
52645312 if (r )
5265- return r ;
5313+ goto unwind_ip_phase1 ;
52665314
52675315 r = amdgpu_device_evict_resources (adev );
52685316 if (r )
5269- return r ;
5317+ goto unwind_userq ;
52705318
52715319 amdgpu_ttm_set_buffer_funcs_status (adev , false);
52725320
52735321 amdgpu_fence_driver_hw_fini (adev );
52745322
52755323 r = amdgpu_device_ip_suspend_phase2 (adev );
52765324 if (r )
5277- return r ;
5325+ goto unwind_evict ;
52785326
52795327 if (amdgpu_sriov_vf (adev ))
52805328 amdgpu_virt_release_full_gpu (adev , false);
52815329
5282- r = amdgpu_dpm_notify_rlc_state (adev , false);
5283- if (r )
5330+ return 0 ;
5331+
5332+ unwind_evict :
5333+ if (adev -> mman .buffer_funcs_ring -> sched .ready )
5334+ amdgpu_ttm_set_buffer_funcs_status (adev , true);
5335+ amdgpu_fence_driver_hw_init (adev );
5336+
5337+ unwind_userq :
5338+ rec = amdgpu_userq_resume (adev );
5339+ if (rec ) {
5340+ dev_warn (adev -> dev , "failed to re-initialize user queues: %d\n" , rec );
5341+ return r ;
5342+ }
5343+ rec = amdgpu_amdkfd_resume (adev , !amdgpu_sriov_vf (adev ) && !adev -> in_runpm );
5344+ if (rec ) {
5345+ dev_warn (adev -> dev , "failed to re-initialize kfd: %d\n" , rec );
52845346 return r ;
5347+ }
52855348
5286- return 0 ;
5349+ unwind_ip_phase1 :
5350+ /* undo suspend phase 1 by running resume phase 3 */
5351+ rec = amdgpu_device_ip_resume_phase3 (adev );
5352+ if (rec ) {
5353+ dev_warn (adev -> dev , "failed to re-initialize IPs phase1: %d\n" , rec );
5354+ return r ;
5355+ }
5356+
5357+ unwind_smartshift :
5358+ rec = amdgpu_acpi_smart_shift_update (adev , AMDGPU_SS_DEV_D0 );
5359+ if (rec ) {
5360+ dev_warn (adev -> dev , "failed to re-update smart shift: %d\n" , rec );
5361+ return r ;
5362+ }
5363+
5364+ if (notify_clients )
5365+ drm_client_dev_resume (adev_to_drm (adev ));
5366+
5367+ amdgpu_ras_resume (adev );
5368+
5369+ unwind_sriov :
5370+ if (amdgpu_sriov_vf (adev )) {
5371+ rec = amdgpu_virt_request_full_gpu (adev , true);
5372+ if (rec ) {
5373+ dev_warn (adev -> dev , "failed to reinitialize sriov: %d\n" , rec );
5374+ return r ;
5375+ }
5376+ }
5377+
5378+ adev -> in_suspend = adev -> in_s0ix = adev -> in_s3 = false;
5379+
5380+ return r ;
52875381}
52885382
52895383static inline int amdgpu_virt_resume (struct amdgpu_device * adev )
@@ -5989,6 +6083,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
59896083 if (r )
59906084 goto out ;
59916085
6086+ r = amdgpu_userq_post_reset (tmp_adev , vram_lost );
6087+ if (r )
6088+ goto out ;
6089+
59926090 drm_client_dev_resume (adev_to_drm (tmp_adev ));
59936091
59946092 /*
@@ -6211,6 +6309,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
62116309 if (!amdgpu_sriov_vf (adev ))
62126310 cancel_work (& adev -> reset_work );
62136311#endif
6312+ cancel_work (& adev -> userq_reset_work );
62146313
62156314 if (adev -> kfd .dev )
62166315 cancel_work (& adev -> kfd .reset_work );
@@ -6331,6 +6430,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
63316430 amdgpu_device_ip_need_full_reset (tmp_adev ))
63326431 amdgpu_ras_suspend (tmp_adev );
63336432
6433+ amdgpu_userq_pre_reset (tmp_adev );
6434+
63346435 for (i = 0 ; i < AMDGPU_MAX_RINGS ; ++ i ) {
63356436 struct amdgpu_ring * ring = tmp_adev -> rings [i ];
63366437
@@ -6560,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
65606661 goto end_reset ;
65616662 }
65626663
6664+ /* Cannot be called after locking reset domain */
6665+ amdgpu_ras_pre_reset (adev , & device_list );
6666+
65636667 /* We need to lock reset domain only once both for XGMI and single device */
65646668 amdgpu_device_recovery_get_reset_lock (adev , & device_list );
65656669
@@ -6590,6 +6694,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
65906694 amdgpu_device_gpu_resume (adev , & device_list , need_emergency_restart );
65916695reset_unlock :
65926696 amdgpu_device_recovery_put_reset_lock (adev , & device_list );
6697+ amdgpu_ras_post_reset (adev , & device_list );
65936698end_reset :
65946699 if (hive ) {
65956700 mutex_unlock (& hive -> hive_lock );
@@ -7327,7 +7432,7 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
73277432 return ;
73287433 }
73297434
7330- amdgpu_asic_flush_hdp (adev , ring );
7435+ amdgpu_hdp_flush (adev , ring );
73317436}
73327437
73337438void amdgpu_device_invalidate_hdp (struct amdgpu_device * adev ,
@@ -7340,7 +7445,7 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
73407445 if (adev -> gmc .xgmi .connected_to_cpu )
73417446 return ;
73427447
7343- amdgpu_asic_invalidate_hdp (adev , ring );
7448+ amdgpu_hdp_invalidate (adev , ring );
73447449}
73457450
73467451int amdgpu_in_reset (struct amdgpu_device * adev )
0 commit comments