Skip to content

Commit a3fd283

Browse files
Alon Mizrahi authored and ogabbay committed
habanalabs: add validation cs counter, fix misplaced counters
Until now, validation errors were counted in the parsing_drop_cnt field of the cs_counters struct. Add a dedicated validation_drop_cnt counter and increment it wherever a command submission is dropped due to a validation error. In addition, some locations updated only one of the two counters (ctx or aggregate); update the second one there as well.

Signed-off-by: Alon Mizrahi <amizrahi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 98e8781 commit a3fd283

4 files changed

Lines changed: 68 additions & 18 deletions

File tree

drivers/misc/habanalabs/common/command_submission.c

Lines changed: 57 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -472,8 +472,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
472472
cntr = &hdev->aggregated_cs_counters;
473473

474474
cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
475-
if (!cs)
475+
if (!cs) {
476+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
477+
atomic64_inc(&cntr->out_of_mem_drop_cnt);
476478
return -ENOMEM;
479+
}
477480

478481
cs->ctx = ctx;
479482
cs->submitted = false;
@@ -486,6 +489,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
486489

487490
cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
488491
if (!cs_cmpl) {
492+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
493+
atomic64_inc(&cntr->out_of_mem_drop_cnt);
489494
rc = -ENOMEM;
490495
goto free_cs;
491496
}
@@ -513,6 +518,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
513518
cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
514519
sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
515520
if (!cs->jobs_in_queue_cnt) {
521+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
522+
atomic64_inc(&cntr->out_of_mem_drop_cnt);
516523
rc = -ENOMEM;
517524
goto free_fence;
518525
}
@@ -764,11 +771,14 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
764771

765772
static int hl_cs_copy_chunk_array(struct hl_device *hdev,
766773
struct hl_cs_chunk **cs_chunk_array,
767-
void __user *chunks, u32 num_chunks)
774+
void __user *chunks, u32 num_chunks,
775+
struct hl_ctx *ctx)
768776
{
769777
u32 size_to_copy;
770778

771779
if (num_chunks > HL_MAX_JOBS_PER_CS) {
780+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
781+
atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
772782
dev_err(hdev->dev,
773783
"Number of chunks can NOT be larger than %d\n",
774784
HL_MAX_JOBS_PER_CS);
@@ -777,11 +787,16 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
777787

778788
*cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
779789
GFP_ATOMIC);
780-
if (!*cs_chunk_array)
790+
if (!*cs_chunk_array) {
791+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
792+
atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
781793
return -ENOMEM;
794+
}
782795

783796
size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
784797
if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
798+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
799+
atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
785800
dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
786801
kfree(*cs_chunk_array);
787802
return -EFAULT;
@@ -797,6 +812,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
797812
struct hl_device *hdev = hpriv->hdev;
798813
struct hl_cs_chunk *cs_chunk_array;
799814
struct hl_cs_counters_atomic *cntr;
815+
struct hl_ctx *ctx = hpriv->ctx;
800816
struct hl_cs_job *job;
801817
struct hl_cs *cs;
802818
struct hl_cb *cb;
@@ -805,7 +821,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
805821
cntr = &hdev->aggregated_cs_counters;
806822
*cs_seq = ULLONG_MAX;
807823

808-
rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks);
824+
rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
825+
hpriv->ctx);
809826
if (rc)
810827
goto out;
811828

@@ -832,17 +849,17 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
832849
rc = validate_queue_index(hdev, chunk, &queue_type,
833850
&is_kernel_allocated_cb);
834851
if (rc) {
835-
atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
836-
atomic64_inc(&cntr->parsing_drop_cnt);
852+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
853+
atomic64_inc(&cntr->validation_drop_cnt);
837854
goto free_cs_object;
838855
}
839856

840857
if (is_kernel_allocated_cb) {
841858
cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
842859
if (!cb) {
843860
atomic64_inc(
844-
&hpriv->ctx->cs_counters.parsing_drop_cnt);
845-
atomic64_inc(&cntr->parsing_drop_cnt);
861+
&ctx->cs_counters.validation_drop_cnt);
862+
atomic64_inc(&cntr->validation_drop_cnt);
846863
rc = -EINVAL;
847864
goto free_cs_object;
848865
}
@@ -856,8 +873,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
856873
job = hl_cs_allocate_job(hdev, queue_type,
857874
is_kernel_allocated_cb);
858875
if (!job) {
859-
atomic64_inc(
860-
&hpriv->ctx->cs_counters.out_of_mem_drop_cnt);
876+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
861877
atomic64_inc(&cntr->out_of_mem_drop_cnt);
862878
dev_err(hdev->dev, "Failed to allocate a new job\n");
863879
rc = -ENOMEM;
@@ -891,7 +907,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
891907

892908
rc = cs_parser(hpriv, job);
893909
if (rc) {
894-
atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
910+
atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
895911
atomic64_inc(&cntr->parsing_drop_cnt);
896912
dev_err(hdev->dev,
897913
"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
@@ -901,8 +917,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
901917
}
902918

903919
if (int_queues_only) {
904-
atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
905-
atomic64_inc(&cntr->parsing_drop_cnt);
920+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
921+
atomic64_inc(&cntr->validation_drop_cnt);
906922
dev_err(hdev->dev,
907923
"Reject CS %d.%llu because only internal queues jobs are present\n",
908924
cs->ctx->asid, cs->sequence);
@@ -1042,7 +1058,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
10421058
}
10431059

10441060
static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
1045-
struct hl_cs_chunk *chunk, u64 *signal_seq)
1061+
struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx)
10461062
{
10471063
u64 *signal_seq_arr = NULL;
10481064
u32 size_to_copy, signal_seq_arr_len;
@@ -1052,6 +1068,8 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
10521068

10531069
/* currently only one signal seq is supported */
10541070
if (signal_seq_arr_len != 1) {
1071+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1072+
atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
10551073
dev_err(hdev->dev,
10561074
"Wait for signal CS supports only one signal CS seq\n");
10571075
return -EINVAL;
@@ -1060,13 +1078,18 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
10601078
signal_seq_arr = kmalloc_array(signal_seq_arr_len,
10611079
sizeof(*signal_seq_arr),
10621080
GFP_ATOMIC);
1063-
if (!signal_seq_arr)
1081+
if (!signal_seq_arr) {
1082+
atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1083+
atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
10641084
return -ENOMEM;
1085+
}
10651086

10661087
size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr);
10671088
if (copy_from_user(signal_seq_arr,
10681089
u64_to_user_ptr(chunk->signal_seq_arr),
10691090
size_to_copy)) {
1091+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1092+
atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
10701093
dev_err(hdev->dev,
10711094
"Failed to copy signal seq array from user\n");
10721095
rc = -EFAULT;
@@ -1153,23 +1176,28 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11531176
struct hl_device *hdev = hpriv->hdev;
11541177
struct hl_cs_compl *sig_waitcs_cmpl;
11551178
u32 q_idx, collective_engine_id = 0;
1179+
struct hl_cs_counters_atomic *cntr;
11561180
struct hl_fence *sig_fence = NULL;
11571181
struct hl_ctx *ctx = hpriv->ctx;
11581182
enum hl_queue_type q_type;
11591183
struct hl_cs *cs;
11601184
u64 signal_seq;
11611185
int rc;
11621186

1187+
cntr = &hdev->aggregated_cs_counters;
11631188
*cs_seq = ULLONG_MAX;
11641189

1165-
rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks);
1190+
rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
1191+
ctx);
11661192
if (rc)
11671193
goto out;
11681194

11691195
/* currently it is guaranteed to have only one chunk */
11701196
chunk = &cs_chunk_array[0];
11711197

11721198
if (chunk->queue_index >= hdev->asic_prop.max_queues) {
1199+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1200+
atomic64_inc(&cntr->validation_drop_cnt);
11731201
dev_err(hdev->dev, "Queue index %d is invalid\n",
11741202
chunk->queue_index);
11751203
rc = -EINVAL;
@@ -1181,6 +1209,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11811209
q_type = hw_queue_prop->type;
11821210

11831211
if (!hw_queue_prop->supports_sync_stream) {
1212+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1213+
atomic64_inc(&cntr->validation_drop_cnt);
11841214
dev_err(hdev->dev,
11851215
"Queue index %d does not support sync stream operations\n",
11861216
q_idx);
@@ -1190,6 +1220,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11901220

11911221
if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
11921222
if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
1223+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1224+
atomic64_inc(&cntr->validation_drop_cnt);
11931225
dev_err(hdev->dev,
11941226
"Queue index %d is invalid\n", q_idx);
11951227
rc = -EINVAL;
@@ -1200,12 +1232,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12001232
}
12011233

12021234
if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
1203-
rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq);
1235+
rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx);
12041236
if (rc)
12051237
goto free_cs_chunk_array;
12061238

12071239
sig_fence = hl_ctx_get_fence(ctx, signal_seq);
12081240
if (IS_ERR(sig_fence)) {
1241+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1242+
atomic64_inc(&cntr->validation_drop_cnt);
12091243
dev_err(hdev->dev,
12101244
"Failed to get signal CS with seq 0x%llx\n",
12111245
signal_seq);
@@ -1223,6 +1257,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12231257
container_of(sig_fence, struct hl_cs_compl, base_fence);
12241258

12251259
if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) {
1260+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1261+
atomic64_inc(&cntr->validation_drop_cnt);
12261262
dev_err(hdev->dev,
12271263
"CS seq 0x%llx is not of a signal CS\n",
12281264
signal_seq);
@@ -1270,8 +1306,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12701306
else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
12711307
rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
12721308
cs, q_idx, collective_engine_id);
1273-
else
1309+
else {
1310+
atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1311+
atomic64_inc(&cntr->validation_drop_cnt);
12741312
rc = -EINVAL;
1313+
}
12751314

12761315
if (rc)
12771316
goto free_cs_object;

drivers/misc/habanalabs/common/habanalabs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,13 +1000,15 @@ struct hl_va_range {
10001000
* @queue_full_drop_cnt: dropped due to queue full
10011001
* @device_in_reset_drop_cnt: dropped due to device in reset
10021002
* @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight
1003+
* @validation_drop_cnt: dropped due to error in validation
10031004
*/
10041005
struct hl_cs_counters_atomic {
10051006
atomic64_t out_of_mem_drop_cnt;
10061007
atomic64_t parsing_drop_cnt;
10071008
atomic64_t queue_full_drop_cnt;
10081009
atomic64_t device_in_reset_drop_cnt;
10091010
atomic64_t max_cs_in_flight_drop_cnt;
1011+
atomic64_t validation_drop_cnt;
10101012
};
10111013

10121014
/**

drivers/misc/habanalabs/common/habanalabs_ioctl.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,8 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
335335
atomic64_read(&cntr->device_in_reset_drop_cnt);
336336
cs_counters.total_max_cs_in_flight_drop_cnt =
337337
atomic64_read(&cntr->max_cs_in_flight_drop_cnt);
338+
cs_counters.total_validation_drop_cnt =
339+
atomic64_read(&cntr->validation_drop_cnt);
338340

339341
if (hpriv->ctx) {
340342
cs_counters.ctx_out_of_mem_drop_cnt =
@@ -352,6 +354,9 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
352354
cs_counters.ctx_max_cs_in_flight_drop_cnt =
353355
atomic64_read(
354356
&hpriv->ctx->cs_counters.max_cs_in_flight_drop_cnt);
357+
cs_counters.ctx_validation_drop_cnt =
358+
atomic64_read(
359+
&hpriv->ctx->cs_counters.validation_drop_cnt);
355360
}
356361

357362
return copy_to_user(out, &cs_counters,

include/uapi/misc/habanalabs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,8 @@ struct hl_info_sync_manager {
426426
* @ctx_device_in_reset_drop_cnt: context dropped due to device in reset
427427
* @total_max_cs_in_flight_drop_cnt: total dropped due to maximum CS in-flight
428428
* @ctx_max_cs_in_flight_drop_cnt: context dropped due to maximum CS in-flight
429+
* @total_validation_drop_cnt: total dropped due to validation error
430+
* @ctx_validation_drop_cnt: context dropped due to validation error
429431
*/
430432
struct hl_info_cs_counters {
431433
__u64 total_out_of_mem_drop_cnt;
@@ -438,6 +440,8 @@ struct hl_info_cs_counters {
438440
__u64 ctx_device_in_reset_drop_cnt;
439441
__u64 total_max_cs_in_flight_drop_cnt;
440442
__u64 ctx_max_cs_in_flight_drop_cnt;
443+
__u64 total_validation_drop_cnt;
444+
__u64 ctx_validation_drop_cnt;
441445
};
442446

443447
enum gaudi_dcores {

0 commit comments

Comments
 (0)