Skip to content

Commit b1e9a71

Browse files
candicelicy authored and alexdeucher committed
drm/amdgpu: Add ecc info query interface for umc v8_10
Support ecc info query for umc v8_10. v2: Simplified by convert_error_address. v3: Remove unused variable and invalid checking. Signed-off-by: Candice Li <candice.li@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 2d53b57 commit b1e9a71

1 file changed

Lines changed: 134 additions & 0 deletions

File tree

drivers/gpu/drm/amd/amdgpu/umc_v8_10.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,138 @@ static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev)
360360
return true;
361361
}
362362

363+
static void umc_v8_10_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
364+
uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst,
365+
unsigned long *error_count)
366+
{
367+
uint64_t mc_umc_status;
368+
uint32_t eccinfo_table_idx;
369+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
370+
371+
eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
372+
adev->umc.channel_inst_num +
373+
umc_inst * adev->umc.channel_inst_num +
374+
ch_inst;
375+
376+
/* check the MCUMC_STATUS */
377+
mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
378+
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
379+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) {
380+
*error_count += 1;
381+
}
382+
}
383+
384+
static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev,
385+
uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst,
386+
unsigned long *error_count)
387+
{
388+
uint64_t mc_umc_status;
389+
uint32_t eccinfo_table_idx;
390+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
391+
392+
eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
393+
adev->umc.channel_inst_num +
394+
umc_inst * adev->umc.channel_inst_num +
395+
ch_inst;
396+
397+
/* check the MCUMC_STATUS */
398+
mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
399+
if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
400+
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
401+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
402+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
403+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
404+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) {
405+
*error_count += 1;
406+
}
407+
}
408+
409+
static void umc_v8_10_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
410+
void *ras_error_status)
411+
{
412+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
413+
414+
uint32_t node_inst = 0;
415+
uint32_t umc_inst = 0;
416+
uint32_t ch_inst = 0;
417+
418+
/* TODO: driver needs to toggle DF Cstate to ensure
419+
* safe access of UMC registers. Will add the protection
420+
*/
421+
LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
422+
umc_v8_10_ecc_info_query_correctable_error_count(adev,
423+
node_inst, umc_inst, ch_inst,
424+
&(err_data->ce_count));
425+
umc_v8_10_ecc_info_query_uncorrectable_error_count(adev,
426+
node_inst, umc_inst, ch_inst,
427+
&(err_data->ue_count));
428+
}
429+
}
430+
431+
static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev,
432+
struct ras_err_data *err_data,
433+
uint32_t ch_inst,
434+
uint32_t umc_inst,
435+
uint32_t node_inst)
436+
{
437+
uint32_t eccinfo_table_idx, channel_index;
438+
uint64_t mc_umc_status, err_addr;
439+
440+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
441+
442+
eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
443+
adev->umc.channel_inst_num +
444+
umc_inst * adev->umc.channel_inst_num +
445+
ch_inst;
446+
channel_index =
447+
adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
448+
adev->umc.channel_inst_num +
449+
umc_inst * adev->umc.channel_inst_num +
450+
ch_inst];
451+
452+
mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
453+
454+
if (mc_umc_status == 0)
455+
return;
456+
457+
if (!err_data->err_addr)
458+
return;
459+
460+
/* calculate error address if ue error is detected */
461+
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
462+
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 &&
463+
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)) {
464+
465+
err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr;
466+
err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
467+
468+
umc_v8_10_convert_error_address(adev, err_data, err_addr,
469+
ch_inst, umc_inst, node_inst, mc_umc_status);
470+
}
471+
}
472+
473+
static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
474+
void *ras_error_status)
475+
{
476+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
477+
478+
uint32_t node_inst = 0;
479+
uint32_t umc_inst = 0;
480+
uint32_t ch_inst = 0;
481+
482+
/* TODO: driver needs to toggle DF Cstate to ensure
483+
* safe access of UMC resgisters. Will add the protection
484+
* when firmware interface is ready
485+
*/
486+
LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
487+
umc_v8_10_ecc_info_query_error_address(adev,
488+
err_data,
489+
ch_inst,
490+
umc_inst,
491+
node_inst);
492+
}
493+
}
494+
363495
const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
364496
.query_ras_error_count = umc_v8_10_query_ras_error_count,
365497
.query_ras_error_address = umc_v8_10_query_ras_error_address,
@@ -371,4 +503,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
371503
},
372504
.err_cnt_init = umc_v8_10_err_cnt_init,
373505
.query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
506+
.ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count,
507+
.ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address,
374508
};

0 commit comments

Comments
 (0)