Skip to content

Commit 218da12

Browse files
committed
Merge tag 'amd-drm-next-6.20-2026-02-06' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.20-2026-02-06:

amdgpu:
- DML 2.1 fixes
- Panel replay fixes
- Display writeback fixes
- MES 11 old firmware compat fix
- DC CRC improvements
- DPIA fixes
- XGMI fixes
- ASPM fix
- SMU feature bit handling fixes
- DC LUT fixes
- RAS fixes
- Misc memory leak in error path fixes
- SDMA queue reset fixes
- PG handling fixes
- 5 level GPUVM page table fix
- SR-IOV fix
- Queue reset fix

amdkfd:
- Fix possible double deletion of validate list
- Event setup fix
- Device disconnect regression fix

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patch.msgid.link/20260206192706.59396-1-alexander.deucher@amd.com
2 parents 2f5db9b + 5028a24 commit 218da12

108 files changed

Lines changed: 1297 additions & 538 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1186,8 +1186,10 @@ int amdgpu_acpi_enumerate_xcc(void)
11861186
if (!dev_info)
11871187
ret = amdgpu_acpi_dev_init(&dev_info, xcc_info, sbdf);
11881188

1189-
if (ret == -ENOMEM)
1189+
if (ret == -ENOMEM) {
1190+
kfree(xcc_info);
11901191
return ret;
1192+
}
11911193

11921194
if (!dev_info) {
11931195
kfree(xcc_info);

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -317,8 +317,7 @@ int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev)
317317
void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
318318
{
319319
if (amdgpu_device_should_recover_gpu(adev))
320-
amdgpu_reset_domain_schedule(adev->reset_domain,
321-
&adev->kfd.reset_work);
320+
(void)amdgpu_reset_domain_schedule(adev->reset_domain, &adev->kfd.reset_work);
322321
}
323322

324323
int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
@@ -720,9 +719,8 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
720719
if (gfx_block != NULL)
721720
gfx_block->version->funcs->set_powergating_state((void *)gfx_block, state);
722721
}
723-
amdgpu_dpm_switch_power_profile(adev,
724-
PP_SMC_POWER_PROFILE_COMPUTE,
725-
!idle);
722+
(void)amdgpu_dpm_switch_power_profile(adev, PP_SMC_POWER_PROFILE_COMPUTE, !idle);
723+
726724
}
727725

728726
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid)

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1924,21 +1924,21 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
19241924

19251925
/* Make sure restore workers don't access the BO any more */
19261926
mutex_lock(&process_info->lock);
1927-
list_del(&mem->validate_list);
1927+
if (!list_empty(&mem->validate_list))
1928+
list_del_init(&mem->validate_list);
19281929
mutex_unlock(&process_info->lock);
19291930

1931+
ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);
1932+
if (unlikely(ret))
1933+
return ret;
1934+
19301935
/* Cleanup user pages and MMU notifiers */
19311936
if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
19321937
amdgpu_hmm_unregister(mem->bo);
1933-
mutex_lock(&process_info->notifier_lock);
19341938
amdgpu_hmm_range_free(mem->range);
1935-
mutex_unlock(&process_info->notifier_lock);
1939+
mem->range = NULL;
19361940
}
19371941

1938-
ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);
1939-
if (unlikely(ret))
1940-
return ret;
1941-
19421942
amdgpu_amdkfd_remove_eviction_fence(mem->bo,
19431943
process_info->eviction_fence);
19441944
pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -892,8 +892,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
892892
struct amdgpu_bo *bo = e->bo;
893893

894894
e->range = amdgpu_hmm_range_alloc(NULL);
895-
if (unlikely(!e->range))
896-
return -ENOMEM;
895+
if (unlikely(!e->range)) {
896+
r = -ENOMEM;
897+
goto out_free_user_pages;
898+
}
897899

898900
r = amdgpu_ttm_tt_get_user_pages(bo, e->range);
899901
if (r)

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3504,9 +3504,6 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
35043504
}
35053505
}
35063506

3507-
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3508-
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3509-
35103507
amdgpu_amdkfd_suspend(adev, true);
35113508
amdgpu_amdkfd_teardown_processes(adev);
35123509
amdgpu_userq_suspend(adev);
@@ -4902,6 +4899,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
49024899
amdgpu_virt_fini_data_exchange(adev);
49034900
}
49044901

4902+
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
4903+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
4904+
49054905
/* disable all interrupts */
49064906
amdgpu_irq_disable_all(adev);
49074907
if (adev->mode_info.mode_config_initialized) {
@@ -4924,7 +4924,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
49244924
* before ip_fini_early to prevent kfd locking refcount issues by calling
49254925
* amdgpu_amdkfd_suspend()
49264926
*/
4927-
if (drm_dev_is_unplugged(adev_to_drm(adev)))
4927+
if (pci_dev_is_disconnected(adev->pdev))
49284928
amdgpu_amdkfd_device_fini_sw(adev);
49294929

49304930
amdgpu_device_ip_fini_early(adev);
@@ -4936,7 +4936,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
49364936

49374937
amdgpu_gart_dummy_page_fini(adev);
49384938

4939-
if (drm_dev_is_unplugged(adev_to_drm(adev)))
4939+
if (pci_dev_is_disconnected(adev->pdev))
49404940
amdgpu_device_unmap_mmio(adev);
49414941

49424942
}
@@ -5733,6 +5733,9 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
57335733
/* enable mmio access after mode 1 reset completed */
57345734
adev->no_hw_access = false;
57355735

5736+
/* ensure no_hw_access is updated before we access hw */
5737+
smp_mb();
5738+
57365739
amdgpu_device_load_pci_state(adev->pdev);
57375740
ret = amdgpu_psp_wait_for_bootloader(adev);
57385741
if (ret)
@@ -7357,6 +7360,9 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
73577360
amdgpu_xcp_dev_unplug(adev);
73587361
drm_dev_unplug(ddev);
73597362

7363+
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
7364+
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
7365+
73607366
amdgpu_irq_disable_all(adev);
73617367

73627368
amdgpu_fence_driver_hw_fini(adev);

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2405,9 +2405,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
24052405
return -ENODEV;
24062406
}
24072407

2408-
if (amdgpu_aspm == -1 && !pcie_aspm_enabled(pdev))
2409-
amdgpu_aspm = 0;
2410-
24112408
if (amdgpu_virtual_display ||
24122409
amdgpu_device_asic_has_dc_support(pdev, flags & AMD_ASIC_MASK))
24132410
supports_atomic = true;

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1436,7 +1436,7 @@ int amdgpu_gmc_get_nps_memranges(struct amdgpu_device *adev,
14361436
if (!*exp_ranges)
14371437
*exp_ranges = range_cnt;
14381438
err:
1439-
kfree(ranges);
1439+
kvfree(ranges);
14401440

14411441
return ret;
14421442
}

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
9292
struct drm_wedge_task_info *info = NULL;
9393
struct amdgpu_task_info *ti = NULL;
9494
struct amdgpu_device *adev = ring->adev;
95+
enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET;
9596
int idx, r;
9697

9798
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
@@ -135,13 +136,19 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
135136
ring->funcs->reset) {
136137
dev_err(adev->dev, "Starting %s ring reset\n",
137138
s_job->sched->name);
139+
/* Stop the scheduler to prevent anybody else from touching the ring buffer. */
140+
drm_sched_wqueue_stop(&ring->sched);
138141
r = amdgpu_ring_reset(ring, job->vmid, job->hw_fence);
139142
if (!r) {
143+
/* Start the scheduler again */
144+
drm_sched_wqueue_start(&ring->sched);
140145
atomic_inc(&ring->adev->gpu_reset_counter);
141146
dev_err(adev->dev, "Ring %s reset succeeded\n",
142147
ring->sched.name);
143148
drm_dev_wedged_event(adev_to_drm(adev),
144149
DRM_WEDGE_RECOVERY_NONE, info);
150+
/* This is needed to add the job back to the pending list */
151+
status = DRM_GPU_SCHED_STAT_NO_HANG;
145152
goto exit;
146153
}
147154
dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
@@ -177,7 +184,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
177184
exit:
178185
amdgpu_vm_put_task_info(ti);
179186
drm_dev_exit(idx);
180-
return DRM_GPU_SCHED_STAT_RESET;
187+
return status;
181188
}
182189

183190
int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4352,7 +4352,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
43524352
* to handle fatal error */
43534353
r = amdgpu_nbio_ras_sw_init(adev);
43544354
if (r)
4355-
return r;
4355+
goto release_con;
43564356

43574357
if (adev->nbio.ras &&
43584358
adev->nbio.ras->init_ras_controller_interrupt) {
@@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
46504650
amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
46514651
}
46524652

4653+
amdgpu_ras_check_bad_page_status(adev);
4654+
46534655
return 0;
46544656
}
46554657

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
17121712
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
17131713
control->ras_num_bad_pages,
17141714
ras->bad_page_cnt_threshold);
1715-
if (amdgpu_bad_page_threshold != 0 &&
1716-
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
1717-
amdgpu_dpm_send_rma_reason(adev);
1718-
17191715
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
17201716
amdgpu_bad_page_threshold != 0) {
17211717
if (hdr->version >= RAS_TABLE_VER_V2_1) {
@@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
19321928
result);
19331929
return -EOPNOTSUPP;
19341930
}
1931+
1932+
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
1933+
{
1934+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1935+
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1936+
1937+
if (!control || amdgpu_bad_page_threshold == 0)
1938+
return;
1939+
1940+
if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
1941+
if (amdgpu_dpm_send_rma_reason(adev))
1942+
dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");
1943+
else
1944+
dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
1945+
1946+
if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
1947+
if (amdgpu_cper_generate_bp_threshold_record(adev))
1948+
dev_warn(adev->dev, "Unable to send in-band RMA CPER");
1949+
else
1950+
dev_dbg(adev->dev, "Sent in-band RMA CPER");
1951+
}
1952+
}
1953+
}

0 commit comments

Comments
 (0)