Skip to content

Commit 5028a24

Browse files
kentrussell authored and alexdeucher committed
drm/amdgpu: Send applicable RMA CPERs at end of RAS init
Firmware and monitoring tools may not be ready to receive a CPER when we read the bad pages, so send the CPERs at the end of RAS initialization to ensure that the FW is ready to receive and process the CPER. This removes the previous CPER submission that was added during bad page load, and sends both the in-band and out-of-band CPERs at the same time.

Signed-off-by: Kent Russell <kent.russell@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent f7afda7 commit 5028a24

3 files changed

Lines changed: 27 additions & 4 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
46504650
amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
46514651
}
46524652

4653+
amdgpu_ras_check_bad_page_status(adev);
4654+
46534655
return 0;
46544656
}
46554657

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
17121712
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
17131713
control->ras_num_bad_pages,
17141714
ras->bad_page_cnt_threshold);
1715-
if (amdgpu_bad_page_threshold != 0 &&
1716-
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
1717-
amdgpu_dpm_send_rma_reason(adev);
1718-
17191715
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
17201716
amdgpu_bad_page_threshold != 0) {
17211717
if (hdr->version >= RAS_TABLE_VER_V2_1) {
@@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
19321928
result);
19331929
return -EOPNOTSUPP;
19341930
}
1931+
1932+
/**
 * amdgpu_ras_check_bad_page_status - send RMA CPERs once the bad page
 * threshold has been reached
 * @adev: amdgpu device whose RAS EEPROM bad page count is checked
 *
 * Does nothing when the device has no RAS context or when
 * amdgpu_bad_page_threshold is 0. Otherwise, if the recorded bad page count
 * (control->ras_num_bad_pages) has reached ras->bad_page_cnt_threshold, the
 * out-of-band RMA CPER is requested through amdgpu_dpm_send_rma_reason(), and
 * the in-band one through amdgpu_cper_generate_bp_threshold_record() when
 * CPER is enabled and uniras is not.
 *
 * The in-band attempt is made regardless of the out-of-band result. Failures
 * are only logged; nothing is reported back to the caller.
 */
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
1933+
{
1934+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1935+
/* control stays NULL when there is no RAS context, so the guard below covers both. */
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
1936+
1937+
/* NOTE(review): a threshold of 0 presumably means the feature is disabled - confirm. */
if (!control || amdgpu_bad_page_threshold == 0)
1938+
return;
1939+
1940+
if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
1941+
/* Out-of-band path: a non-zero return is treated as failure. */
if (amdgpu_dpm_send_rma_reason(adev))
1942+
dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");
1943+
else
1944+
dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
1945+
1946+
/* In-band path: only when CPER is enabled and uniras is not in use. */
if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
1947+
if (amdgpu_cper_generate_bp_threshold_record(adev))
1948+
dev_warn(adev->dev, "Unable to send in-band RMA CPER");
1949+
else
1950+
dev_dbg(adev->dev, "Sent in-band RMA CPER");
1951+
}
1952+
}
1953+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
193193

194194
int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
195195

196+
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
197+
196198
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
197199
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
198200

0 commit comments

Comments
 (0)