Skip to content

Commit 20238a2

Browse files
Tao Zhou authored and alexdeucher committed
drm/amdgpu: add RAS reset/query operations for XGMI v6_4
Reset/query RAS error status and count.

v2: use XGMI IP version instead of WAFL version.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 61fe553 commit 20238a2

1 file changed

Lines changed: 43 additions & 3 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

Lines changed: 43 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,16 @@ static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = {
103103
smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
104104
};
105105

106+
static const int xgmi3x16_pcs_err_status_reg_v6_4[] = {
107+
smnPCS_XGMI3X16_PCS_ERROR_STATUS,
108+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000
109+
};
110+
111+
static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
112+
smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK,
113+
smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
114+
};
115+
106116
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
107117
{"XGMI PCS DataLossErr",
108118
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -952,6 +962,16 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
952962
default:
953963
break;
954964
}
965+
966+
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
967+
case IP_VERSION(6, 4, 0):
968+
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++)
969+
pcs_clear_status(adev,
970+
xgmi3x16_pcs_err_status_reg_v6_4[i]);
971+
break;
972+
default:
973+
break;
974+
}
955975
}
956976

957977
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
@@ -969,7 +989,9 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
969989

970990
if (is_xgmi_pcs) {
971991
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
972-
IP_VERSION(6, 1, 0)) {
992+
IP_VERSION(6, 1, 0) ||
993+
amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
994+
IP_VERSION(6, 4, 0)) {
973995
pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0];
974996
field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields);
975997
} else {
@@ -1007,7 +1029,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
10071029
void *ras_error_status)
10081030
{
10091031
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
1010-
int i;
1032+
int i, supported = 1;
10111033
uint32_t data, mask_data = 0;
10121034
uint32_t ue_cnt = 0, ce_cnt = 0;
10131035

@@ -1071,7 +1093,25 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
10711093
}
10721094
break;
10731095
default:
1074-
dev_warn(adev->dev, "XGMI RAS error query not supported");
1096+
supported = 0;
1097+
break;
1098+
}
1099+
1100+
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
1101+
case IP_VERSION(6, 4, 0):
1102+
/* check xgmi3x16 pcs error */
1103+
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_v6_4); i++) {
1104+
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_v6_4[i]);
1105+
mask_data =
1106+
RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[i]);
1107+
if (data)
1108+
amdgpu_xgmi_query_pcs_error_status(adev, data,
1109+
mask_data, &ue_cnt, &ce_cnt, true, true);
1110+
}
1111+
break;
1112+
default:
1113+
if (!supported)
1114+
dev_warn(adev->dev, "XGMI RAS error query not supported");
10751115
break;
10761116
}
10771117

0 commit comments

Comments
 (0)