@@ -360,6 +360,138 @@ static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev)
360360 return true;
361361}
362362
363+ static void umc_v8_10_ecc_info_query_correctable_error_count (struct amdgpu_device * adev ,
364+ uint32_t node_inst , uint32_t umc_inst , uint32_t ch_inst ,
365+ unsigned long * error_count )
366+ {
367+ uint64_t mc_umc_status ;
368+ uint32_t eccinfo_table_idx ;
369+ struct amdgpu_ras * ras = amdgpu_ras_get_context (adev );
370+
371+ eccinfo_table_idx = node_inst * adev -> umc .umc_inst_num *
372+ adev -> umc .channel_inst_num +
373+ umc_inst * adev -> umc .channel_inst_num +
374+ ch_inst ;
375+
376+ /* check the MCUMC_STATUS */
377+ mc_umc_status = ras -> umc_ecc .ecc [eccinfo_table_idx ].mca_umc_status ;
378+ if (REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , Val ) == 1 &&
379+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , CECC ) == 1 ) {
380+ * error_count += 1 ;
381+ }
382+ }
383+
384+ static void umc_v8_10_ecc_info_query_uncorrectable_error_count (struct amdgpu_device * adev ,
385+ uint32_t node_inst , uint32_t umc_inst , uint32_t ch_inst ,
386+ unsigned long * error_count )
387+ {
388+ uint64_t mc_umc_status ;
389+ uint32_t eccinfo_table_idx ;
390+ struct amdgpu_ras * ras = amdgpu_ras_get_context (adev );
391+
392+ eccinfo_table_idx = node_inst * adev -> umc .umc_inst_num *
393+ adev -> umc .channel_inst_num +
394+ umc_inst * adev -> umc .channel_inst_num +
395+ ch_inst ;
396+
397+ /* check the MCUMC_STATUS */
398+ mc_umc_status = ras -> umc_ecc .ecc [eccinfo_table_idx ].mca_umc_status ;
399+ if ((REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , Val ) == 1 ) &&
400+ (REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , Deferred ) == 1 ||
401+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , UECC ) == 1 ||
402+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , PCC ) == 1 ||
403+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , UC ) == 1 ||
404+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , TCC ) == 1 )) {
405+ * error_count += 1 ;
406+ }
407+ }
408+
409+ static void umc_v8_10_ecc_info_query_ras_error_count (struct amdgpu_device * adev ,
410+ void * ras_error_status )
411+ {
412+ struct ras_err_data * err_data = (struct ras_err_data * )ras_error_status ;
413+
414+ uint32_t node_inst = 0 ;
415+ uint32_t umc_inst = 0 ;
416+ uint32_t ch_inst = 0 ;
417+
418+ /* TODO: driver needs to toggle DF Cstate to ensure
419+ * safe access of UMC registers. Will add the protection
420+ */
421+ LOOP_UMC_EACH_NODE_INST_AND_CH (node_inst , umc_inst , ch_inst ) {
422+ umc_v8_10_ecc_info_query_correctable_error_count (adev ,
423+ node_inst , umc_inst , ch_inst ,
424+ & (err_data -> ce_count ));
425+ umc_v8_10_ecc_info_query_uncorrectable_error_count (adev ,
426+ node_inst , umc_inst , ch_inst ,
427+ & (err_data -> ue_count ));
428+ }
429+ }
430+
431+ static void umc_v8_10_ecc_info_query_error_address (struct amdgpu_device * adev ,
432+ struct ras_err_data * err_data ,
433+ uint32_t ch_inst ,
434+ uint32_t umc_inst ,
435+ uint32_t node_inst )
436+ {
437+ uint32_t eccinfo_table_idx , channel_index ;
438+ uint64_t mc_umc_status , err_addr ;
439+
440+ struct amdgpu_ras * ras = amdgpu_ras_get_context (adev );
441+
442+ eccinfo_table_idx = node_inst * adev -> umc .umc_inst_num *
443+ adev -> umc .channel_inst_num +
444+ umc_inst * adev -> umc .channel_inst_num +
445+ ch_inst ;
446+ channel_index =
447+ adev -> umc .channel_idx_tbl [node_inst * adev -> umc .umc_inst_num *
448+ adev -> umc .channel_inst_num +
449+ umc_inst * adev -> umc .channel_inst_num +
450+ ch_inst ];
451+
452+ mc_umc_status = ras -> umc_ecc .ecc [eccinfo_table_idx ].mca_umc_status ;
453+
454+ if (mc_umc_status == 0 )
455+ return ;
456+
457+ if (!err_data -> err_addr )
458+ return ;
459+
460+ /* calculate error address if ue error is detected */
461+ if (REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , Val ) == 1 &&
462+ REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , AddrV ) == 1 &&
463+ (REG_GET_FIELD (mc_umc_status , MCA_UMC_UMC0_MCUMC_STATUST0 , UECC ) == 1 )) {
464+
465+ err_addr = ras -> umc_ecc .ecc [eccinfo_table_idx ].mca_umc_addr ;
466+ err_addr = REG_GET_FIELD (err_addr , MCA_UMC_UMC0_MCUMC_ADDRT0 , ErrorAddr );
467+
468+ umc_v8_10_convert_error_address (adev , err_data , err_addr ,
469+ ch_inst , umc_inst , node_inst , mc_umc_status );
470+ }
471+ }
472+
473+ static void umc_v8_10_ecc_info_query_ras_error_address (struct amdgpu_device * adev ,
474+ void * ras_error_status )
475+ {
476+ struct ras_err_data * err_data = (struct ras_err_data * )ras_error_status ;
477+
478+ uint32_t node_inst = 0 ;
479+ uint32_t umc_inst = 0 ;
480+ uint32_t ch_inst = 0 ;
481+
482+ /* TODO: driver needs to toggle DF Cstate to ensure
483+ * safe access of UMC resgisters. Will add the protection
484+ * when firmware interface is ready
485+ */
486+ LOOP_UMC_EACH_NODE_INST_AND_CH (node_inst , umc_inst , ch_inst ) {
487+ umc_v8_10_ecc_info_query_error_address (adev ,
488+ err_data ,
489+ ch_inst ,
490+ umc_inst ,
491+ node_inst );
492+ }
493+ }
494+
363495const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
364496 .query_ras_error_count = umc_v8_10_query_ras_error_count ,
365497 .query_ras_error_address = umc_v8_10_query_ras_error_address ,
@@ -371,4 +503,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
371503 },
372504 .err_cnt_init = umc_v8_10_err_cnt_init ,
373505 .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode ,
506+ .ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count ,
507+ .ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address ,
374508};
0 commit comments