Skip to content

Commit f1fdeb3

Browse files
Xiang Liualexdeucher
authored andcommitted
drm/amdgpu: Introduce VF critical region check for RAS poison injection
The SRIOV guest sends a request to the host via the mailbox to check whether the poison injection address is in the VF critical region or not. Signed-off-by: Xiang Liu <xiang.liu@amd.com> Reviewed-by: Shravan Kumar Gande <Shravankumar.Gande@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 18f769f commit f1fdeb3

5 files changed

Lines changed: 79 additions & 0 deletions

File tree

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -828,11 +828,14 @@ static void amdgpu_virt_init_ras(struct amdgpu_device *adev)
828828
{
829829
ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1);
830830
ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1);
831+
ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1);
831832

832833
ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs,
833834
RATELIMIT_MSG_ON_RELEASE);
834835
ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs,
835836
RATELIMIT_MSG_ON_RELEASE);
837+
ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs,
838+
RATELIMIT_MSG_ON_RELEASE);
836839

837840
mutex_init(&adev->virt.ras.ras_telemetry_mutex);
838841

@@ -1501,3 +1504,55 @@ void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev)
15011504
if (virt->ops && virt->ops->req_bad_pages)
15021505
virt->ops->req_bad_pages(adev);
15031506
}
1507+
1508+
static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev,
1509+
struct amdsriov_ras_telemetry *host_telemetry,
1510+
bool *hit)
1511+
{
1512+
struct amd_sriov_ras_chk_criti *tmp = NULL;
1513+
uint32_t checksum, used_size;
1514+
1515+
checksum = host_telemetry->header.checksum;
1516+
used_size = host_telemetry->header.used_size;
1517+
1518+
if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
1519+
return 0;
1520+
1521+
tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL);
1522+
if (!tmp)
1523+
return -ENOMEM;
1524+
1525+
if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0))
1526+
goto out;
1527+
1528+
if (hit)
1529+
*hit = tmp->hit ? true : false;
1530+
1531+
out:
1532+
kfree(tmp);
1533+
1534+
return 0;
1535+
}
1536+
1537+
/*
 * Ask whether @addr falls inside the VF critical region.
 *
 * @adev: device to query
 * @addr: address to check
 * @hit: output forwarded to the telemetry parser, which stores the result
 *
 * Returns -EOPNOTSUPP when the virt ops do not provide req_ras_chk_criti,
 * -EPERM when the request is rate limited or the request callback reports
 * failure, otherwise the return value of amdgpu_virt_cache_chk_criti_hit().
 */
int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit)
{
	struct amdgpu_virt *virt = &adev->virt;
	int ret;

	if (!virt->ops || !virt->ops->req_ras_chk_criti)
		return -EOPNOTSUPP;

	/*
	 * Host allows 15 ras telemetry requests per 60 seconds, after which
	 * the Host will ignore incoming guest messages. Ratelimit the guest
	 * messages to prevent guest self DOS.
	 */
	if (!__ratelimit(&virt->ras.ras_chk_criti_rs))
		return -EPERM;

	/* The request and the read of its reply share the telemetry mutex. */
	mutex_lock(&virt->ras.ras_telemetry_mutex);
	if (virt->ops->req_ras_chk_criti(adev, addr))
		ret = -EPERM;
	else
		ret = amdgpu_virt_cache_chk_criti_hit(adev,
						      virt->fw_reserve.ras_telemetry,
						      hit);
	mutex_unlock(&virt->ras.ras_telemetry_mutex);

	return ret;
}

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ struct amdgpu_virt_ops {
9898
int (*req_ras_err_count)(struct amdgpu_device *adev);
9999
int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr);
100100
int (*req_bad_pages)(struct amdgpu_device *adev);
101+
int (*req_ras_chk_criti)(struct amdgpu_device *adev, u64 addr);
101102
};
102103

103104
/*
@@ -252,6 +253,7 @@ struct amdgpu_virt_ras_err_handler_data {
252253
struct amdgpu_virt_ras {
253254
struct ratelimit_state ras_error_cnt_rs;
254255
struct ratelimit_state ras_cper_dump_rs;
256+
struct ratelimit_state ras_chk_criti_rs;
255257
struct mutex ras_telemetry_mutex;
256258
uint64_t cper_rptr;
257259
};
@@ -453,4 +455,5 @@ int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev);
453455
bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev,
454456
enum amdgpu_ras_block block);
455457
void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev);
458+
int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, u64 addr, bool *hit);
456459
#endif

drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,12 +405,17 @@ struct amd_sriov_ras_cper_dump {
405405
uint32_t buf[];
406406
};
407407

408+
struct amd_sriov_ras_chk_criti {
409+
uint32_t hit;
410+
};
411+
408412
struct amdsriov_ras_telemetry {
409413
struct amd_sriov_ras_telemetry_header header;
410414

411415
union {
412416
struct amd_sriov_ras_telemetry_error_count error_count;
413417
struct amd_sriov_ras_cper_dump cper_dump;
418+
struct amd_sriov_ras_chk_criti chk_criti;
414419
} body;
415420
};
416421

drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev,
202202
case IDH_REQ_RAS_CPER_DUMP:
203203
event = IDH_RAS_CPER_DUMP_READY;
204204
break;
205+
case IDH_REQ_RAS_CHK_CRITI:
206+
event = IDH_REQ_RAS_CHK_CRITI_READY;
207+
break;
205208
default:
206209
break;
207210
}
@@ -556,6 +559,16 @@ static int xgpu_nv_req_ras_bad_pages(struct amdgpu_device *adev)
556559
return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_BAD_PAGES);
557560
}
558561

562+
/*
 * Send an IDH_REQ_RAS_CHK_CRITI request for @addr.
 *
 * The 64-bit address is split into two 32-bit halves and passed as the
 * first (upper half) and second (lower half) request parameters; the third
 * parameter is 0.
 *
 * Returns the result of xgpu_nv_send_access_requests_with_param().
 */
static int xgpu_nv_check_vf_critical_region(struct amdgpu_device *adev, u64 addr)
{
	const uint32_t upper = (uint32_t)(addr >> 32);
	const uint32_t lower = (uint32_t)(addr & 0xFFFFFFFF);

	return xgpu_nv_send_access_requests_with_param(adev,
						       IDH_REQ_RAS_CHK_CRITI,
						       upper, lower, 0);
}
571+
559572
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
560573
.req_full_gpu = xgpu_nv_request_full_gpu_access,
561574
.rel_full_gpu = xgpu_nv_release_full_gpu_access,
@@ -569,4 +582,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
569582
.req_ras_err_count = xgpu_nv_req_ras_err_count,
570583
.req_ras_cper_dump = xgpu_nv_req_ras_cper_dump,
571584
.req_bad_pages = xgpu_nv_req_ras_bad_pages,
585+
.req_ras_chk_criti = xgpu_nv_check_vf_critical_region
572586
};

drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ enum idh_request {
4343
IDH_REQ_RAS_ERROR_COUNT = 203,
4444
IDH_REQ_RAS_CPER_DUMP = 204,
4545
IDH_REQ_RAS_BAD_PAGES = 205,
46+
IDH_REQ_RAS_CHK_CRITI = 206
4647
};
4748

4849
enum idh_event {
@@ -62,6 +63,7 @@ enum idh_event {
6263
IDH_RAS_BAD_PAGES_READY = 15,
6364
IDH_RAS_BAD_PAGES_NOTIFICATION = 16,
6465
IDH_UNRECOV_ERR_NOTIFICATION = 17,
66+
IDH_REQ_RAS_CHK_CRITI_READY = 18,
6567

6668
IDH_TEXT_MESSAGE = 255,
6769
};

0 commit comments

Comments
 (0)