@@ -524,6 +524,8 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
524524 adev -> unique_id =
525525 ((struct amd_sriov_msg_pf2vf_info * )pf2vf_info )-> uuid ;
526526 adev -> virt .ras_en_caps .all = ((struct amd_sriov_msg_pf2vf_info * )pf2vf_info )-> ras_en_caps .all ;
527+ adev -> virt .ras_telemetry_en_caps .all =
528+ ((struct amd_sriov_msg_pf2vf_info * )pf2vf_info )-> ras_telemetry_en_caps .all ;
527529 break ;
528530 default :
529531 dev_err (adev -> dev , "invalid pf2vf version: 0x%x\n" , pf2vf_info -> version );
@@ -704,13 +706,17 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
704706 adev -> virt .fw_reserve .p_vf2pf =
705707 (struct amd_sriov_msg_vf2pf_info_header * )
706708 (adev -> mman .fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10 ));
709+ adev -> virt .fw_reserve .ras_telemetry =
710+ (adev -> mman .fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10 ));
707711 } else if (adev -> mman .drv_vram_usage_va ) {
708712 adev -> virt .fw_reserve .p_pf2vf =
709713 (struct amd_sriov_msg_pf2vf_info_header * )
710714 (adev -> mman .drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10 ));
711715 adev -> virt .fw_reserve .p_vf2pf =
712716 (struct amd_sriov_msg_vf2pf_info_header * )
713717 (adev -> mman .drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10 ));
718+ adev -> virt .fw_reserve .ras_telemetry =
719+ (adev -> mman .drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10 ));
714720 }
715721
716722 amdgpu_virt_read_pf2vf_data (adev );
@@ -1197,3 +1203,133 @@ bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev)
11971203
11981204 return true;
11991205}
1206+
1207+ static inline enum amd_sriov_ras_telemetry_gpu_block
1208+ amdgpu_ras_block_to_sriov (struct amdgpu_device * adev , enum amdgpu_ras_block block ) {
1209+ switch (block ) {
1210+ case AMDGPU_RAS_BLOCK__UMC :
1211+ return RAS_TELEMETRY_GPU_BLOCK_UMC ;
1212+ case AMDGPU_RAS_BLOCK__SDMA :
1213+ return RAS_TELEMETRY_GPU_BLOCK_SDMA ;
1214+ case AMDGPU_RAS_BLOCK__GFX :
1215+ return RAS_TELEMETRY_GPU_BLOCK_GFX ;
1216+ case AMDGPU_RAS_BLOCK__MMHUB :
1217+ return RAS_TELEMETRY_GPU_BLOCK_MMHUB ;
1218+ case AMDGPU_RAS_BLOCK__ATHUB :
1219+ return RAS_TELEMETRY_GPU_BLOCK_ATHUB ;
1220+ case AMDGPU_RAS_BLOCK__PCIE_BIF :
1221+ return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF ;
1222+ case AMDGPU_RAS_BLOCK__HDP :
1223+ return RAS_TELEMETRY_GPU_BLOCK_HDP ;
1224+ case AMDGPU_RAS_BLOCK__XGMI_WAFL :
1225+ return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL ;
1226+ case AMDGPU_RAS_BLOCK__DF :
1227+ return RAS_TELEMETRY_GPU_BLOCK_DF ;
1228+ case AMDGPU_RAS_BLOCK__SMN :
1229+ return RAS_TELEMETRY_GPU_BLOCK_SMN ;
1230+ case AMDGPU_RAS_BLOCK__SEM :
1231+ return RAS_TELEMETRY_GPU_BLOCK_SEM ;
1232+ case AMDGPU_RAS_BLOCK__MP0 :
1233+ return RAS_TELEMETRY_GPU_BLOCK_MP0 ;
1234+ case AMDGPU_RAS_BLOCK__MP1 :
1235+ return RAS_TELEMETRY_GPU_BLOCK_MP1 ;
1236+ case AMDGPU_RAS_BLOCK__FUSE :
1237+ return RAS_TELEMETRY_GPU_BLOCK_FUSE ;
1238+ case AMDGPU_RAS_BLOCK__MCA :
1239+ return RAS_TELEMETRY_GPU_BLOCK_MCA ;
1240+ case AMDGPU_RAS_BLOCK__VCN :
1241+ return RAS_TELEMETRY_GPU_BLOCK_VCN ;
1242+ case AMDGPU_RAS_BLOCK__JPEG :
1243+ return RAS_TELEMETRY_GPU_BLOCK_JPEG ;
1244+ case AMDGPU_RAS_BLOCK__IH :
1245+ return RAS_TELEMETRY_GPU_BLOCK_IH ;
1246+ case AMDGPU_RAS_BLOCK__MPIO :
1247+ return RAS_TELEMETRY_GPU_BLOCK_MPIO ;
1248+ default :
1249+ dev_err (adev -> dev , "Unsupported SRIOV RAS telemetry block 0x%x\n" , block );
1250+ return RAS_TELEMETRY_GPU_BLOCK_COUNT ;
1251+ }
1252+ }
1253+
1254+ static int amdgpu_virt_cache_host_error_counts (struct amdgpu_device * adev ,
1255+ struct amdsriov_ras_telemetry * host_telemetry )
1256+ {
1257+ struct amd_sriov_ras_telemetry_error_count * tmp = NULL ;
1258+ uint32_t checksum , used_size ;
1259+
1260+ checksum = host_telemetry -> header .checksum ;
1261+ used_size = host_telemetry -> header .used_size ;
1262+
1263+ if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10 ))
1264+ return 0 ;
1265+
1266+ tmp = kmalloc (used_size , GFP_KERNEL );
1267+ if (!tmp )
1268+ return - ENOMEM ;
1269+
1270+ memcpy (tmp , & host_telemetry -> body .error_count , used_size );
1271+
1272+ if (checksum != amd_sriov_msg_checksum (tmp , used_size , 0 , 0 ))
1273+ goto out ;
1274+
1275+ memcpy (& adev -> virt .count_cache , tmp ,
1276+ min (used_size , sizeof (adev -> virt .count_cache )));
1277+ out :
1278+ kfree (tmp );
1279+
1280+ return 0 ;
1281+ }
1282+
1283+ static int amdgpu_virt_req_ras_err_count_internal (struct amdgpu_device * adev , bool force_update )
1284+ {
1285+ struct amdgpu_virt * virt = & adev -> virt ;
1286+
1287+ /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host
1288+ * will ignore incoming guest messages. Ratelimit the guest messages to
1289+ * prevent guest self DOS.
1290+ */
1291+ if (__ratelimit (& adev -> virt .ras_telemetry_rs ) || force_update ) {
1292+ if (!virt -> ops -> req_ras_err_count (adev ))
1293+ amdgpu_virt_cache_host_error_counts (adev ,
1294+ adev -> virt .fw_reserve .ras_telemetry );
1295+ }
1296+
1297+ return 0 ;
1298+ }
1299+
1300+ /* Bypass ACA interface and query ECC counts directly from host */
1301+ int amdgpu_virt_req_ras_err_count (struct amdgpu_device * adev , enum amdgpu_ras_block block ,
1302+ struct ras_err_data * err_data )
1303+ {
1304+ enum amd_sriov_ras_telemetry_gpu_block sriov_block ;
1305+
1306+ sriov_block = amdgpu_ras_block_to_sriov (adev , block );
1307+
1308+ if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT ||
1309+ !amdgpu_sriov_ras_telemetry_block_en (adev , sriov_block ))
1310+ return - EOPNOTSUPP ;
1311+
1312+ /* Host Access may be lost during reset, just return last cached data. */
1313+ if (down_read_trylock (& adev -> reset_domain -> sem )) {
1314+ amdgpu_virt_req_ras_err_count_internal (adev , false);
1315+ up_read (& adev -> reset_domain -> sem );
1316+ }
1317+
1318+ err_data -> ue_count = adev -> virt .count_cache .block [sriov_block ].ue_count ;
1319+ err_data -> ce_count = adev -> virt .count_cache .block [sriov_block ].ce_count ;
1320+ err_data -> de_count = adev -> virt .count_cache .block [sriov_block ].de_count ;
1321+
1322+ return 0 ;
1323+ }
1324+
1325+ int amdgpu_virt_ras_telemetry_post_reset (struct amdgpu_device * adev )
1326+ {
1327+ unsigned long ue_count , ce_count ;
1328+
1329+ if (amdgpu_sriov_ras_telemetry_en (adev )) {
1330+ amdgpu_virt_req_ras_err_count_internal (adev , true);
1331+ amdgpu_ras_query_error_count (adev , & ce_count , & ue_count , NULL );
1332+ }
1333+
1334+ return 0 ;
1335+ }
0 commit comments