Skip to content

Commit 2a084f4

Browse files
committed
Merge tag 'amd-drm-next-6.19-2025-11-07' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.19-2025-11-07: amdgpu: - Misc fixes - HMM cleanup - HDP flush rework - RAS updates - SMU 13.x updates - SI DPM cleanup - Suspend rework - UQ reset support - Replay/PSR fixes - HDCP updates - DC PMO fixes - DC pstate fixes - DCN4 fixes - GPUVM fixes - SMU 13 partition metrics - Fix possible fence leak in job cleanup - Hibernation fix - MST fix amdkfd: - HMM cleanup - Process cleanup fix Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patch.msgid.link/20251107145938.26669-1-alexander.deucher@amd.com
2 parents e237dfe + 2e640e8 commit 2a084f4

106 files changed

Lines changed: 2438 additions & 1134 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

MAINTAINERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1080,7 +1080,7 @@ M: Austin Zheng <austin.zheng@amd.com>
10801080
M: Jun Lei <jun.lei@amd.com>
10811081
S: Supported
10821082
F: drivers/gpu/drm/amd/display/dc/dml/
1083-
F: drivers/gpu/drm/amd/display/dc/dml2/
1083+
F: drivers/gpu/drm/amd/display/dc/dml2_0/
10841084

10851085
AMD FAM15H PROCESSOR POWER MONITORING DRIVER
10861086
M: Huang Rui <ray.huang@amd.com>

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1316,6 +1316,7 @@ struct amdgpu_device {
13161316
bool apu_prefer_gtt;
13171317

13181318
bool userq_halt_for_enforce_isolation;
1319+
struct work_struct userq_reset_work;
13191320
struct amdgpu_uid *uid_info;
13201321

13211322
/* KFD
@@ -1539,11 +1540,6 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
15391540
#define amdgpu_asic_read_bios_from_rom(adev, b, l) (adev)->asic_funcs->read_bios_from_rom((adev), (b), (l))
15401541
#define amdgpu_asic_read_register(adev, se, sh, offset, v)((adev)->asic_funcs->read_register((adev), (se), (sh), (offset), (v)))
15411542
#define amdgpu_asic_get_config_memsize(adev) (adev)->asic_funcs->get_config_memsize((adev))
1542-
#define amdgpu_asic_flush_hdp(adev, r) \
1543-
((adev)->asic_funcs->flush_hdp ? (adev)->asic_funcs->flush_hdp((adev), (r)) : (adev)->hdp.funcs->flush_hdp((adev), (r)))
1544-
#define amdgpu_asic_invalidate_hdp(adev, r) \
1545-
((adev)->asic_funcs->invalidate_hdp ? (adev)->asic_funcs->invalidate_hdp((adev), (r)) : \
1546-
((adev)->hdp.funcs->invalidate_hdp ? (adev)->hdp.funcs->invalidate_hdp((adev), (r)) : (void)0))
15471543
#define amdgpu_asic_need_full_reset(adev) (adev)->asic_funcs->need_full_reset((adev))
15481544
#define amdgpu_asic_init_doorbell_index(adev) (adev)->asic_funcs->init_doorbell_index((adev))
15491545
#define amdgpu_asic_get_pcie_usage(adev, cnt0, cnt1) ((adev)->asic_funcs->get_pcie_usage((adev), (cnt0), (cnt1)))

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
12741274

12751275
(void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
12761276

1277+
/* VM entity stopped if process killed, don't clear freed pt bo */
1278+
if (!amdgpu_vm_ready(vm))
1279+
return 0;
1280+
12771281
(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
12781282

12791283
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 122 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171

7272
#include "amdgpu_xgmi.h"
7373
#include "amdgpu_ras.h"
74+
#include "amdgpu_ras_mgr.h"
7475
#include "amdgpu_pmu.h"
7576
#include "amdgpu_fru_eeprom.h"
7677
#include "amdgpu_reset.h"
@@ -179,6 +180,10 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
179180
BIT(AMD_IP_BLOCK_TYPE_PSP)
180181
};
181182

183+
static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
184+
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
185+
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);
186+
182187
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
183188

184189
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
@@ -2494,6 +2499,7 @@ static const char *ip_block_names[] = {
24942499
[AMD_IP_BLOCK_TYPE_VPE] = "vpe",
24952500
[AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm",
24962501
[AMD_IP_BLOCK_TYPE_ISP] = "isp",
2502+
[AMD_IP_BLOCK_TYPE_RAS] = "ras",
24972503
};
24982504

24992505
static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type)
@@ -3784,7 +3790,7 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
37843790
*/
37853791
static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
37863792
{
3787-
int i, r;
3793+
int i, r, rec;
37883794

37893795
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
37903796
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
@@ -3807,10 +3813,23 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
38073813

38083814
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
38093815
if (r)
3810-
return r;
3816+
goto unwind;
38113817
}
38123818

38133819
return 0;
3820+
unwind:
3821+
rec = amdgpu_device_ip_resume_phase3(adev);
3822+
if (rec)
3823+
dev_err(adev->dev,
3824+
"amdgpu_device_ip_resume_phase3 failed during unwind: %d\n",
3825+
rec);
3826+
3827+
amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW);
3828+
3829+
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3830+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3831+
3832+
return r;
38143833
}
38153834

38163835
/**
@@ -3826,7 +3845,7 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
38263845
*/
38273846
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
38283847
{
3829-
int i, r;
3848+
int i, r, rec;
38303849

38313850
if (adev->in_s0ix)
38323851
amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
@@ -3889,7 +3908,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
38893908

38903909
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
38913910
if (r)
3892-
return r;
3911+
goto unwind;
38933912

38943913
/* handle putting the SMC in the appropriate state */
38953914
if (!amdgpu_sriov_vf(adev)) {
@@ -3899,13 +3918,40 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
38993918
dev_err(adev->dev,
39003919
"SMC failed to set mp1 state %d, %d\n",
39013920
adev->mp1_state, r);
3902-
return r;
3921+
goto unwind;
39033922
}
39043923
}
39053924
}
39063925
}
39073926

39083927
return 0;
3928+
unwind:
3929+
/* suspend phase 2 = resume phase 1 + resume phase 2 */
3930+
rec = amdgpu_device_ip_resume_phase1(adev);
3931+
if (rec) {
3932+
dev_err(adev->dev,
3933+
"amdgpu_device_ip_resume_phase1 failed during unwind: %d\n",
3934+
rec);
3935+
return r;
3936+
}
3937+
3938+
rec = amdgpu_device_fw_loading(adev);
3939+
if (rec) {
3940+
dev_err(adev->dev,
3941+
"amdgpu_device_fw_loading failed during unwind: %d\n",
3942+
rec);
3943+
return r;
3944+
}
3945+
3946+
rec = amdgpu_device_ip_resume_phase2(adev);
3947+
if (rec) {
3948+
dev_err(adev->dev,
3949+
"amdgpu_device_ip_resume_phase2 failed during unwind: %d\n",
3950+
rec);
3951+
return r;
3952+
}
3953+
3954+
return r;
39093955
}
39103956

39113957
/**
@@ -4607,6 +4653,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
46074653
}
46084654

46094655
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4656+
INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
46104657

46114658
adev->gfx.gfx_off_req_count = 1;
46124659
adev->gfx.gfx_off_residency = 0;
@@ -5229,7 +5276,7 @@ void amdgpu_device_complete(struct drm_device *dev)
52295276
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
52305277
{
52315278
struct amdgpu_device *adev = drm_to_adev(dev);
5232-
int r = 0;
5279+
int r, rec;
52335280

52345281
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
52355282
return 0;
@@ -5245,8 +5292,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
52455292
return r;
52465293
}
52475294

5248-
if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
5249-
dev_warn(adev->dev, "smart shift update failed\n");
5295+
r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
5296+
if (r)
5297+
goto unwind_sriov;
52505298

52515299
if (notify_clients)
52525300
drm_client_dev_suspend(adev_to_drm(adev));
@@ -5257,33 +5305,79 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
52575305

52585306
r = amdgpu_device_ip_suspend_phase1(adev);
52595307
if (r)
5260-
return r;
5308+
goto unwind_smartshift;
52615309

52625310
amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
52635311
r = amdgpu_userq_suspend(adev);
52645312
if (r)
5265-
return r;
5313+
goto unwind_ip_phase1;
52665314

52675315
r = amdgpu_device_evict_resources(adev);
52685316
if (r)
5269-
return r;
5317+
goto unwind_userq;
52705318

52715319
amdgpu_ttm_set_buffer_funcs_status(adev, false);
52725320

52735321
amdgpu_fence_driver_hw_fini(adev);
52745322

52755323
r = amdgpu_device_ip_suspend_phase2(adev);
52765324
if (r)
5277-
return r;
5325+
goto unwind_evict;
52785326

52795327
if (amdgpu_sriov_vf(adev))
52805328
amdgpu_virt_release_full_gpu(adev, false);
52815329

5282-
r = amdgpu_dpm_notify_rlc_state(adev, false);
5283-
if (r)
5330+
return 0;
5331+
5332+
unwind_evict:
5333+
if (adev->mman.buffer_funcs_ring->sched.ready)
5334+
amdgpu_ttm_set_buffer_funcs_status(adev, true);
5335+
amdgpu_fence_driver_hw_init(adev);
5336+
5337+
unwind_userq:
5338+
rec = amdgpu_userq_resume(adev);
5339+
if (rec) {
5340+
dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
5341+
return r;
5342+
}
5343+
rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
5344+
if (rec) {
5345+
dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
52845346
return r;
5347+
}
52855348

5286-
return 0;
5349+
unwind_ip_phase1:
5350+
/* suspend phase 1 = resume phase 3 */
5351+
rec = amdgpu_device_ip_resume_phase3(adev);
5352+
if (rec) {
5353+
dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
5354+
return r;
5355+
}
5356+
5357+
unwind_smartshift:
5358+
rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
5359+
if (rec) {
5360+
dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
5361+
return r;
5362+
}
5363+
5364+
if (notify_clients)
5365+
drm_client_dev_resume(adev_to_drm(adev));
5366+
5367+
amdgpu_ras_resume(adev);
5368+
5369+
unwind_sriov:
5370+
if (amdgpu_sriov_vf(adev)) {
5371+
rec = amdgpu_virt_request_full_gpu(adev, true);
5372+
if (rec) {
5373+
dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
5374+
return r;
5375+
}
5376+
}
5377+
5378+
adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
5379+
5380+
return r;
52875381
}
52885382

52895383
static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
@@ -5989,6 +6083,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
59896083
if (r)
59906084
goto out;
59916085

6086+
r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
6087+
if (r)
6088+
goto out;
6089+
59926090
drm_client_dev_resume(adev_to_drm(tmp_adev));
59936091

59946092
/*
@@ -6211,6 +6309,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
62116309
if (!amdgpu_sriov_vf(adev))
62126310
cancel_work(&adev->reset_work);
62136311
#endif
6312+
cancel_work(&adev->userq_reset_work);
62146313

62156314
if (adev->kfd.dev)
62166315
cancel_work(&adev->kfd.reset_work);
@@ -6331,6 +6430,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
63316430
amdgpu_device_ip_need_full_reset(tmp_adev))
63326431
amdgpu_ras_suspend(tmp_adev);
63336432

6433+
amdgpu_userq_pre_reset(tmp_adev);
6434+
63346435
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
63356436
struct amdgpu_ring *ring = tmp_adev->rings[i];
63366437

@@ -6560,6 +6661,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
65606661
goto end_reset;
65616662
}
65626663

6664+
/* Cannot be called after locking reset domain */
6665+
amdgpu_ras_pre_reset(adev, &device_list);
6666+
65636667
/* We need to lock reset domain only once both for XGMI and single device */
65646668
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
65656669

@@ -6590,6 +6694,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
65906694
amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
65916695
reset_unlock:
65926696
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
6697+
amdgpu_ras_post_reset(adev, &device_list);
65936698
end_reset:
65946699
if (hive) {
65956700
mutex_unlock(&hive->hive_lock);
@@ -7327,7 +7432,7 @@ void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
73277432
return;
73287433
}
73297434

7330-
amdgpu_asic_flush_hdp(adev, ring);
7435+
amdgpu_hdp_flush(adev, ring);
73317436
}
73327437

73337438
void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
@@ -7340,7 +7445,7 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
73407445
if (adev->gmc.xgmi.connected_to_cpu)
73417446
return;
73427447

7343-
amdgpu_asic_invalidate_hdp(adev, ring);
7448+
amdgpu_hdp_invalidate(adev, ring);
73447449
}
73457450

73467451
int amdgpu_in_reset(struct amdgpu_device *adev)

drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,20 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
8181
struct drm_gem_object *obj = dmabuf->priv;
8282
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
8383
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
84+
int r;
8485

8586
if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) &&
8687
pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
8788
attach->peer2peer = false;
8889

90+
r = dma_resv_lock(bo->tbo.base.resv, NULL);
91+
if (r)
92+
return r;
93+
8994
amdgpu_vm_bo_update_shared(bo);
9095

96+
dma_resv_unlock(bo->tbo.base.resv);
97+
9198
return 0;
9299
}
93100

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2625,9 +2625,14 @@ static int amdgpu_pmops_suspend_noirq(struct device *dev)
26252625
{
26262626
struct drm_device *drm_dev = dev_get_drvdata(dev);
26272627
struct amdgpu_device *adev = drm_to_adev(drm_dev);
2628+
int r;
26282629

2629-
if (amdgpu_acpi_should_gpu_reset(adev))
2630-
return amdgpu_asic_reset(adev);
2630+
if (amdgpu_acpi_should_gpu_reset(adev)) {
2631+
amdgpu_device_lock_reset_domain(adev->reset_domain);
2632+
r = amdgpu_asic_reset(adev);
2633+
amdgpu_device_unlock_reset_domain(adev->reset_domain);
2634+
return r;
2635+
}
26312636

26322637
return 0;
26332638
}

drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,19 @@ void amdgpu_hdp_generic_flush(struct amdgpu_device *adev,
6666
0);
6767
}
6868
}
69+
70+
void amdgpu_hdp_invalidate(struct amdgpu_device *adev, struct amdgpu_ring *ring)
71+
{
72+
if (adev->asic_funcs && adev->asic_funcs->invalidate_hdp)
73+
adev->asic_funcs->invalidate_hdp(adev, ring);
74+
else if (adev->hdp.funcs && adev->hdp.funcs->invalidate_hdp)
75+
adev->hdp.funcs->invalidate_hdp(adev, ring);
76+
}
77+
78+
void amdgpu_hdp_flush(struct amdgpu_device *adev, struct amdgpu_ring *ring)
79+
{
80+
if (adev->asic_funcs && adev->asic_funcs->flush_hdp)
81+
adev->asic_funcs->flush_hdp(adev, ring);
82+
else if (adev->hdp.funcs && adev->hdp.funcs->flush_hdp)
83+
adev->hdp.funcs->flush_hdp(adev, ring);
84+
}

0 commit comments

Comments
 (0)