@@ -113,6 +113,43 @@ static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
113113 smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
114114};
115115
/*
 * SMN base addresses of the MCA banks used for XGMI PCS error reporting on
 * XGMI v6.4.0.  Each entry is one bank; the query/reset helpers below read
 * and clear MCA_REG_IDX_STATUS relative to these bases for every AID.
 * NOTE(review): the two banks are assumed to cover the two XGMI PCS
 * instances per AID - confirm against the ASIC register spec.
 */
static const u64 xgmi_v6_4_0_mca_base_array[] = {
	0x11a09200,
	0x11b09200,
};
120+
/*
 * Human-readable names for XGMI PCS RAS errors on v6.4.0, indexed by the
 * extended error code extracted from the MCA status register
 * (MCA_REG__STATUS__ERRORCODEEXT).  Unlisted indices stay NULL and are
 * silently skipped by the logging code.
 */
static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
	[0x00] = "XGMI PCS DataLossErr",
	[0x01] = "XGMI PCS TrainingErr",
	[0x02] = "XGMI PCS FlowCtrlAckErr",
	[0x03] = "XGMI PCS RxFifoUnderflowErr",
	[0x04] = "XGMI PCS RxFifoOverflowErr",
	[0x05] = "XGMI PCS CRCErr",
	[0x06] = "XGMI PCS BERExceededErr",
	[0x07] = "XGMI PCS TxMetaDataErr",
	[0x08] = "XGMI PCS ReplayBufParityErr",
	[0x09] = "XGMI PCS DataParityErr",
	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
	[0x0d] = "XGMI PCS DeskewErr",
	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
	[0x0f] = "XGMI PCS DataStartupLimitErr",
	[0x10] = "XGMI PCS FCInitTimeoutErr",
	[0x11] = "XGMI PCS RecoveryTimeoutErr",
	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
	[0x13] = "XGMI PCS ReadySerialAttemptErr",
	[0x14] = "XGMI PCS RecoveryAttemptErr",
	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
	[0x16] = "XGMI PCS ReplayAttemptErr",
	[0x17] = "XGMI PCS SyncHdrErr",
	[0x18] = "XGMI PCS TxReplayTimeoutErr",
	[0x19] = "XGMI PCS RxReplayTimeoutErr",
	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
	[0x1c] = "XGMI PCS RxCMDPktErr",
};
152+
116153static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields [] = {
117154 {"XGMI PCS DataLossErr" ,
118155 SOC15_REG_FIELD (XGMI0_PCS_GOPX16_PCS_ERROR_STATUS , DataLossErr )},
@@ -936,7 +973,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg
936973 WREG32_PCIE (pcs_status_reg , 0 );
937974}
938975
939- static void amdgpu_xgmi_reset_ras_error_count (struct amdgpu_device * adev )
976+ static void amdgpu_xgmi_legacy_reset_ras_error_count (struct amdgpu_device * adev )
940977{
941978 uint32_t i ;
942979
@@ -974,6 +1011,39 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
9741011 }
9751012}
9761013
1014+ static void __xgmi_v6_4_0_reset_error_count (struct amdgpu_device * adev , int xgmi_inst , u64 mca_base )
1015+ {
1016+ WREG64_MCA (xgmi_inst , mca_base , MCA_REG_IDX_STATUS , 0ULL );
1017+ }
1018+
1019+ static void xgmi_v6_4_0_reset_error_count (struct amdgpu_device * adev , int xgmi_inst )
1020+ {
1021+ int i ;
1022+
1023+ for (i = 0 ; i < ARRAY_SIZE (xgmi_v6_4_0_mca_base_array ); i ++ )
1024+ __xgmi_v6_4_0_reset_error_count (adev , xgmi_inst , xgmi_v6_4_0_mca_base_array [i ]);
1025+ }
1026+
1027+ static void xgmi_v6_4_0_reset_ras_error_count (struct amdgpu_device * adev )
1028+ {
1029+ int i ;
1030+
1031+ for_each_inst (i , adev -> aid_mask )
1032+ xgmi_v6_4_0_reset_error_count (adev , i );
1033+ }
1034+
1035+ static void amdgpu_xgmi_reset_ras_error_count (struct amdgpu_device * adev )
1036+ {
1037+ switch (amdgpu_ip_version (adev , XGMI_HWIP , 0 )) {
1038+ case IP_VERSION (6 , 4 , 0 ):
1039+ xgmi_v6_4_0_reset_ras_error_count (adev );
1040+ break ;
1041+ default :
1042+ amdgpu_xgmi_legacy_reset_ras_error_count (adev );
1043+ break ;
1044+ }
1045+ }
1046+
9771047static int amdgpu_xgmi_query_pcs_error_status (struct amdgpu_device * adev ,
9781048 uint32_t value ,
9791049 uint32_t mask_value ,
@@ -1025,8 +1095,8 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
10251095 return 0 ;
10261096}
10271097
1028- static void amdgpu_xgmi_query_ras_error_count (struct amdgpu_device * adev ,
1029- void * ras_error_status )
1098+ static void amdgpu_xgmi_legacy_query_ras_error_count (struct amdgpu_device * adev ,
1099+ void * ras_error_status )
10301100{
10311101 struct ras_err_data * err_data = (struct ras_err_data * )ras_error_status ;
10321102 int i , supported = 1 ;
@@ -1121,6 +1191,88 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
11211191 err_data -> ce_count += ce_cnt ;
11221192}
11231193
1194+ static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type (struct amdgpu_device * adev , u64 status )
1195+ {
1196+ const char * error_str ;
1197+ int ext_error_code ;
1198+
1199+ ext_error_code = MCA_REG__STATUS__ERRORCODEEXT (status );
1200+
1201+ error_str = ext_error_code < ARRAY_SIZE (xgmi_v6_4_0_ras_error_code_ext ) ?
1202+ xgmi_v6_4_0_ras_error_code_ext [ext_error_code ] : NULL ;
1203+ if (error_str )
1204+ dev_info (adev -> dev , "%s detected\n" , error_str );
1205+
1206+ switch (ext_error_code ) {
1207+ case 0 :
1208+ return AMDGPU_MCA_ERROR_TYPE_UE ;
1209+ case 6 :
1210+ return AMDGPU_MCA_ERROR_TYPE_CE ;
1211+ default :
1212+ return - EINVAL ;
1213+ }
1214+
1215+ return - EINVAL ;
1216+ }
1217+
1218+ static void __xgmi_v6_4_0_query_error_count (struct amdgpu_device * adev , struct amdgpu_smuio_mcm_config_info * mcm_info ,
1219+ u64 mca_base , struct ras_err_data * err_data )
1220+ {
1221+ int xgmi_inst = mcm_info -> die_id ;
1222+ u64 status = 0 ;
1223+
1224+ status = RREG64_MCA (xgmi_inst , mca_base , MCA_REG_IDX_STATUS );
1225+ if (!MCA_REG__STATUS__VAL (status ))
1226+ return ;
1227+
1228+ switch (xgmi_v6_4_0_pcs_mca_get_error_type (adev , status )) {
1229+ case AMDGPU_MCA_ERROR_TYPE_UE :
1230+ amdgpu_ras_error_statistic_ue_count (err_data , mcm_info , 1ULL );
1231+ break ;
1232+ case AMDGPU_MCA_ERROR_TYPE_CE :
1233+ amdgpu_ras_error_statistic_ce_count (err_data , mcm_info , 1ULL );
1234+ break ;
1235+ default :
1236+ break ;
1237+ }
1238+
1239+ WREG64_MCA (xgmi_inst , mca_base , MCA_REG_IDX_STATUS , 0ULL );
1240+ }
1241+
1242+ static void xgmi_v6_4_0_query_error_count (struct amdgpu_device * adev , int xgmi_inst , struct ras_err_data * err_data )
1243+ {
1244+ struct amdgpu_smuio_mcm_config_info mcm_info = {
1245+ .socket_id = adev -> smuio .funcs -> get_socket_id (adev ),
1246+ .die_id = xgmi_inst ,
1247+ };
1248+ int i ;
1249+
1250+ for (i = 0 ; i < ARRAY_SIZE (xgmi_v6_4_0_mca_base_array ); i ++ )
1251+ __xgmi_v6_4_0_query_error_count (adev , & mcm_info , xgmi_v6_4_0_mca_base_array [i ], err_data );
1252+ }
1253+
1254+ static void xgmi_v6_4_0_query_ras_error_count (struct amdgpu_device * adev , void * ras_error_status )
1255+ {
1256+ struct ras_err_data * err_data = (struct ras_err_data * )ras_error_status ;
1257+ int i ;
1258+
1259+ for_each_inst (i , adev -> aid_mask )
1260+ xgmi_v6_4_0_query_error_count (adev , i , err_data );
1261+ }
1262+
1263+ static void amdgpu_xgmi_query_ras_error_count (struct amdgpu_device * adev ,
1264+ void * ras_error_status )
1265+ {
1266+ switch (amdgpu_ip_version (adev , XGMI_HWIP , 0 )) {
1267+ case IP_VERSION (6 , 4 , 0 ):
1268+ xgmi_v6_4_0_query_ras_error_count (adev , ras_error_status );
1269+ break ;
1270+ default :
1271+ amdgpu_xgmi_legacy_query_ras_error_count (adev , ras_error_status );
1272+ break ;
1273+ }
1274+ }
1275+
11241276/* Trigger XGMI/WAFL error */
11251277static int amdgpu_ras_error_inject_xgmi (struct amdgpu_device * adev ,
11261278 void * inject_if , uint32_t instance_mask )
0 commit comments