Skip to content

Commit e568dc3

Browse files
committed
accel/amdxdna: Add IOCTL parameter for telemetry data
Extend DRM_IOCTL_AMDXDNA_GET_INFO to include additional parameters that allow collection of telemetry data.

Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Link: https://patch.msgid.link/20251104062546.833771-3-lizhi.hou@amd.com
1 parent 1556c17 commit e568dc3

7 files changed

Lines changed: 173 additions & 10 deletions

File tree

drivers/accel/amdxdna/aie2_message.c

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
4747
ndev->mgmt_chann = NULL;
4848
}
4949

50-
if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) {
50+
if (!ret && *hdl->status != AIE2_STATUS_SUCCESS) {
5151
XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
5252
msg->opcode, *hdl->data);
5353
ret = -EINVAL;
@@ -336,11 +336,6 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
336336
goto fail;
337337
}
338338

339-
if (resp.status != AIE2_STATUS_SUCCESS) {
340-
XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status);
341-
ret = -EINVAL;
342-
goto fail;
343-
}
344339
XDNA_DBG(xdna, "Query NPU status completed");
345340

346341
if (size < resp.size) {
@@ -362,6 +357,55 @@ int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf,
362357
return ret;
363358
}
364359

360+
int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
361+
char __user *buf, u32 size,
362+
struct amdxdna_drm_query_telemetry_header *header)
363+
{
364+
DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY);
365+
struct amdxdna_dev *xdna = ndev->xdna;
366+
dma_addr_t dma_addr;
367+
u8 *addr;
368+
int ret;
369+
370+
if (header->type >= MAX_TELEMETRY_TYPE)
371+
return -EINVAL;
372+
373+
addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr,
374+
DMA_FROM_DEVICE, GFP_KERNEL);
375+
if (!addr)
376+
return -ENOMEM;
377+
378+
req.buf_addr = dma_addr;
379+
req.buf_size = size;
380+
req.type = header->type;
381+
382+
drm_clflush_virt_range(addr, size); /* device can access */
383+
ret = aie2_send_mgmt_msg_wait(ndev, &msg);
384+
if (ret) {
385+
XDNA_ERR(xdna, "Query telemetry failed, status %d", ret);
386+
goto free_buf;
387+
}
388+
389+
if (size < resp.size) {
390+
ret = -EINVAL;
391+
XDNA_ERR(xdna, "Bad buffer size. Available: %u. Needs: %u", size, resp.size);
392+
goto free_buf;
393+
}
394+
395+
if (copy_to_user(buf, addr, resp.size)) {
396+
ret = -EFAULT;
397+
XDNA_ERR(xdna, "Failed to copy telemetry to user space");
398+
goto free_buf;
399+
}
400+
401+
header->major = resp.major;
402+
header->minor = resp.minor;
403+
404+
free_buf:
405+
dma_free_noncoherent(xdna->ddev.dev, size, addr, dma_addr, DMA_FROM_DEVICE);
406+
return ret;
407+
}
408+
365409
int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
366410
void *handle, int (*cb)(void*, void __iomem *, size_t))
367411
{

drivers/accel/amdxdna/aie2_msg_priv.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
enum aie2_msg_opcode {
1010
MSG_OP_CREATE_CONTEXT = 0x2,
1111
MSG_OP_DESTROY_CONTEXT = 0x3,
12-
MSG_OP_SYNC_BO = 0x7,
12+
MSG_OP_GET_TELEMETRY = 0x4,
13+
MSG_OP_SYNC_BO = 0x7,
1314
MSG_OP_EXECUTE_BUFFER_CF = 0xC,
1415
MSG_OP_QUERY_COL_STATUS = 0xD,
1516
MSG_OP_QUERY_AIE_TILE_INFO = 0xE,
@@ -137,6 +138,28 @@ struct destroy_ctx_resp {
137138
enum aie2_msg_status status;
138139
} __packed;
139140

141+
/*
 * Telemetry categories carried in get_telemetry_req.type.
 * MAX_TELEMETRY_TYPE is not a category: aie2_query_telemetry() rejects any
 * type that is >= MAX_TELEMETRY_TYPE, so it acts as the exclusive upper bound.
 */
enum telemetry_type {
142+
TELEMETRY_TYPE_DISABLED,
143+
TELEMETRY_TYPE_HEALTH,
144+
TELEMETRY_TYPE_ERROR_INFO,
145+
TELEMETRY_TYPE_PROFILING,
146+
TELEMETRY_TYPE_DEBUG,
147+
MAX_TELEMETRY_TYPE
148+
};
149+
150+
/*
 * Request payload used with MSG_OP_GET_TELEMETRY.
 * aie2_query_telemetry() fills it as follows:
 *   type     - telemetry category taken from the caller's header
 *   buf_addr - DMA address of the buffer the device writes into
 *   buf_size - size of that buffer in bytes
 */
struct get_telemetry_req {
151+
enum telemetry_type type;
152+
__u64 buf_addr;
153+
__u32 buf_size;
154+
} __packed;
155+
156+
/*
 * Response payload for MSG_OP_GET_TELEMETRY.
 *   major, minor - copied by aie2_query_telemetry() into the caller's
 *                  telemetry header
 *   size         - byte count compared against the caller's buffer size and
 *                  then used as the length copied to user space
 *   status       - not the first member here, which is why the message
 *                  helper reaches it through a dedicated status pointer
 *                  (.status = &resp.status) instead of the start of the
 *                  response data
 */
struct get_telemetry_resp {
157+
__u32 major;
158+
__u32 minor;
159+
__u32 size;
160+
enum aie2_msg_status status;
161+
} __packed;
162+
140163
struct execute_buffer_req {
141164
__u32 cu_idx;
142165
__u32 payload[19];

drivers/accel/amdxdna/aie2_pci.c

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,76 @@ static int aie2_query_resource_info(struct amdxdna_client *client,
862862
return 0;
863863
}
864864

865+
static int aie2_fill_hwctx_map(struct amdxdna_hwctx *hwctx, void *arg)
866+
{
867+
struct amdxdna_dev *xdna = hwctx->client->xdna;
868+
u32 *map = arg;
869+
870+
if (hwctx->fw_ctx_id >= xdna->dev_handle->priv->hwctx_limit) {
871+
XDNA_ERR(xdna, "Invalid fw ctx id %d/%d ", hwctx->fw_ctx_id,
872+
xdna->dev_handle->priv->hwctx_limit);
873+
return -EINVAL;
874+
}
875+
876+
map[hwctx->fw_ctx_id] = hwctx->id;
877+
return 0;
878+
}
879+
880+
static int aie2_get_telemetry(struct amdxdna_client *client,
881+
struct amdxdna_drm_get_info *args)
882+
{
883+
struct amdxdna_drm_query_telemetry_header *header __free(kfree) = NULL;
884+
u32 telemetry_data_sz, header_sz, elem_num;
885+
struct amdxdna_dev *xdna = client->xdna;
886+
struct amdxdna_client *tmp_client;
887+
int ret;
888+
889+
elem_num = xdna->dev_handle->priv->hwctx_limit;
890+
header_sz = struct_size(header, map, elem_num);
891+
if (args->buffer_size <= header_sz) {
892+
XDNA_ERR(xdna, "Invalid buffer size");
893+
return -EINVAL;
894+
}
895+
896+
telemetry_data_sz = args->buffer_size - header_sz;
897+
if (telemetry_data_sz > SZ_4M) {
898+
XDNA_ERR(xdna, "Buffer size is too big, %d", telemetry_data_sz);
899+
return -EINVAL;
900+
}
901+
902+
header = kzalloc(header_sz, GFP_KERNEL);
903+
if (!header)
904+
return -ENOMEM;
905+
906+
if (copy_from_user(header, u64_to_user_ptr(args->buffer), sizeof(*header))) {
907+
XDNA_ERR(xdna, "Failed to copy telemetry header from user");
908+
return -EFAULT;
909+
}
910+
911+
header->map_num_elements = elem_num;
912+
list_for_each_entry(tmp_client, &xdna->client_list, node) {
913+
ret = amdxdna_hwctx_walk(tmp_client, &header->map,
914+
aie2_fill_hwctx_map);
915+
if (ret)
916+
return ret;
917+
}
918+
919+
ret = aie2_query_telemetry(xdna->dev_handle,
920+
u64_to_user_ptr(args->buffer + header_sz),
921+
telemetry_data_sz, header);
922+
if (ret) {
923+
XDNA_ERR(xdna, "Query telemetry failed ret %d", ret);
924+
return ret;
925+
}
926+
927+
if (copy_to_user(u64_to_user_ptr(args->buffer), header, header_sz)) {
928+
XDNA_ERR(xdna, "Copy header failed");
929+
return -EFAULT;
930+
}
931+
932+
return 0;
933+
}
934+
865935
static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args)
866936
{
867937
struct amdxdna_dev *xdna = client->xdna;
@@ -896,6 +966,9 @@ static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_i
896966
case DRM_AMDXDNA_GET_POWER_MODE:
897967
ret = aie2_get_power_mode(client, args);
898968
break;
969+
case DRM_AMDXDNA_QUERY_TELEMETRY:
970+
ret = aie2_get_telemetry(client, args);
971+
break;
899972
case DRM_AMDXDNA_QUERY_RESOURCE_INFO:
900973
ret = aie2_query_resource_info(client, args);
901974
break;

drivers/accel/amdxdna/aie2_pci.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,9 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
305305
int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx);
306306
int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size);
307307
int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled);
308+
int aie2_query_telemetry(struct amdxdna_dev_hdl *ndev,
309+
char __user *buf, u32 size,
310+
struct amdxdna_drm_query_telemetry_header *header);
308311
int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
309312
void *handle, int (*cb)(void*, void __iomem *, size_t));
310313
int aie2_config_cu(struct amdxdna_hwctx *hwctx,

drivers/accel/amdxdna/amdxdna_mailbox_helper.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,18 @@ struct xdna_notify {
1616
u32 *data;
1717
size_t size;
1818
int error;
19+
u32 *status;
1920
};
2021

21-
#define DECLARE_XDNA_MSG_COMMON(name, op, status) \
22+
#define DECLARE_XDNA_MSG_COMMON(name, op, s) \
2223
struct name##_req req = { 0 }; \
23-
struct name##_resp resp = { status }; \
24+
struct name##_resp resp = { .status = s }; \
2425
struct xdna_notify hdl = { \
2526
.error = 0, \
2627
.data = (u32 *)&resp, \
2728
.size = sizeof(resp), \
2829
.comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp), \
30+
.status = (u32 *)&resp.status, \
2931
}; \
3032
struct xdna_mailbox_msg msg = { \
3133
.send_data = (u8 *)&req, \

drivers/accel/amdxdna/amdxdna_pci_drv.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
3030
* 0.2: Support getting last error hardware error
3131
* 0.3: Support firmware debug buffer
3232
* 0.4: Support getting resource information
33+
* 0.5: Support getting telemetry data
3334
*/
3435
#define AMDXDNA_DRIVER_MAJOR 0
35-
#define AMDXDNA_DRIVER_MINOR 4
36+
#define AMDXDNA_DRIVER_MINOR 5
3637

3738
/*
3839
* Bind the driver base on (vendor_id, device_id) pair and later use the

include/uapi/drm/amdxdna_accel.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ enum amdxdna_drm_get_param {
442442
DRM_AMDXDNA_QUERY_HW_CONTEXTS,
443443
DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8,
444444
DRM_AMDXDNA_GET_POWER_MODE,
445+
DRM_AMDXDNA_QUERY_TELEMETRY,
445446
DRM_AMDXDNA_QUERY_RESOURCE_INFO = 12,
446447
};
447448

@@ -461,6 +462,22 @@ struct amdxdna_drm_get_resource_info {
461462
__u64 npu_task_curr;
462463
};
463464

465+
/**
466+
* struct amdxdna_drm_query_telemetry_header - Telemetry data header placed at the start of the DRM_AMDXDNA_QUERY_TELEMETRY buffer; the telemetry payload follows the map
467+
*/
468+
struct amdxdna_drm_query_telemetry_header {
469+
/** @major: Firmware telemetry interface major version number, filled in by the driver from the firmware response */
470+
__u32 major;
471+
/** @minor: Firmware telemetry interface minor version number, filled in by the driver from the firmware response */
472+
__u32 minor;
473+
/** @type: Telemetry query type, supplied by user space */
474+
__u32 type;
475+
/** @map_num_elements: Total number of elements in the map table, set by the driver */
476+
__u32 map_num_elements;
477+
/** @map: Element map; the driver stores a hardware context ID at the index given by that context's firmware context ID */
478+
__u32 map[];
479+
};
480+
464481
/**
465482
* struct amdxdna_drm_get_info - Get some information from the AIE hardware.
466483
* @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed in the buffer.

0 commit comments

Comments
 (0)