@@ -472,8 +472,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
472472 cntr = & hdev -> aggregated_cs_counters ;
473473
474474 cs = kzalloc (sizeof (* cs ), GFP_ATOMIC );
475- if (!cs )
475+ if (!cs ) {
476+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
477+ atomic64_inc (& cntr -> out_of_mem_drop_cnt );
476478 return - ENOMEM ;
479+ }
477480
478481 cs -> ctx = ctx ;
479482 cs -> submitted = false;
@@ -486,6 +489,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
486489
487490 cs_cmpl = kmalloc (sizeof (* cs_cmpl ), GFP_ATOMIC );
488491 if (!cs_cmpl ) {
492+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
493+ atomic64_inc (& cntr -> out_of_mem_drop_cnt );
489494 rc = - ENOMEM ;
490495 goto free_cs ;
491496 }
@@ -513,6 +518,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
513518 cs -> jobs_in_queue_cnt = kcalloc (hdev -> asic_prop .max_queues ,
514519 sizeof (* cs -> jobs_in_queue_cnt ), GFP_ATOMIC );
515520 if (!cs -> jobs_in_queue_cnt ) {
521+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
522+ atomic64_inc (& cntr -> out_of_mem_drop_cnt );
516523 rc = - ENOMEM ;
517524 goto free_fence ;
518525 }
@@ -562,7 +569,7 @@ void hl_cs_rollback_all(struct hl_device *hdev)
562569 for (i = 0 ; i < hdev -> asic_prop .completion_queues_count ; i ++ )
563570 flush_workqueue (hdev -> cq_wq [i ]);
564571
565- /* Make sure we don't have leftovers in the H/W queues mirror list */
572+ /* Make sure we don't have leftovers in the CS mirror list */
566573 list_for_each_entry_safe (cs , tmp , & hdev -> cs_mirror_list , mirror_node ) {
567574 cs_get (cs );
568575 cs -> aborted = true;
@@ -764,11 +771,14 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
764771
765772static int hl_cs_copy_chunk_array (struct hl_device * hdev ,
766773 struct hl_cs_chunk * * cs_chunk_array ,
767- void __user * chunks , u32 num_chunks )
774+ void __user * chunks , u32 num_chunks ,
775+ struct hl_ctx * ctx )
768776{
769777 u32 size_to_copy ;
770778
771779 if (num_chunks > HL_MAX_JOBS_PER_CS ) {
780+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
781+ atomic64_inc (& hdev -> aggregated_cs_counters .validation_drop_cnt );
772782 dev_err (hdev -> dev ,
773783 "Number of chunks can NOT be larger than %d\n" ,
774784 HL_MAX_JOBS_PER_CS );
@@ -777,11 +787,16 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev,
777787
778788 * cs_chunk_array = kmalloc_array (num_chunks , sizeof (* * cs_chunk_array ),
779789 GFP_ATOMIC );
780- if (!* cs_chunk_array )
790+ if (!* cs_chunk_array ) {
791+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
792+ atomic64_inc (& hdev -> aggregated_cs_counters .out_of_mem_drop_cnt );
781793 return - ENOMEM ;
794+ }
782795
783796 size_to_copy = num_chunks * sizeof (struct hl_cs_chunk );
784797 if (copy_from_user (* cs_chunk_array , chunks , size_to_copy )) {
798+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
799+ atomic64_inc (& hdev -> aggregated_cs_counters .validation_drop_cnt );
785800 dev_err (hdev -> dev , "Failed to copy cs chunk array from user\n" );
786801 kfree (* cs_chunk_array );
787802 return - EFAULT ;
@@ -797,6 +812,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
797812 struct hl_device * hdev = hpriv -> hdev ;
798813 struct hl_cs_chunk * cs_chunk_array ;
799814 struct hl_cs_counters_atomic * cntr ;
815+ struct hl_ctx * ctx = hpriv -> ctx ;
800816 struct hl_cs_job * job ;
801817 struct hl_cs * cs ;
802818 struct hl_cb * cb ;
@@ -805,7 +821,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
805821 cntr = & hdev -> aggregated_cs_counters ;
806822 * cs_seq = ULLONG_MAX ;
807823
808- rc = hl_cs_copy_chunk_array (hdev , & cs_chunk_array , chunks , num_chunks );
824+ rc = hl_cs_copy_chunk_array (hdev , & cs_chunk_array , chunks , num_chunks ,
825+ hpriv -> ctx );
809826 if (rc )
810827 goto out ;
811828
@@ -832,17 +849,17 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
832849 rc = validate_queue_index (hdev , chunk , & queue_type ,
833850 & is_kernel_allocated_cb );
834851 if (rc ) {
835- atomic64_inc (& hpriv -> ctx -> cs_counters .parsing_drop_cnt );
836- atomic64_inc (& cntr -> parsing_drop_cnt );
852+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
853+ atomic64_inc (& cntr -> validation_drop_cnt );
837854 goto free_cs_object ;
838855 }
839856
840857 if (is_kernel_allocated_cb ) {
841858 cb = get_cb_from_cs_chunk (hdev , & hpriv -> cb_mgr , chunk );
842859 if (!cb ) {
843860 atomic64_inc (
844- & hpriv -> ctx -> cs_counters .parsing_drop_cnt );
845- atomic64_inc (& cntr -> parsing_drop_cnt );
861+ & ctx -> cs_counters .validation_drop_cnt );
862+ atomic64_inc (& cntr -> validation_drop_cnt );
846863 rc = - EINVAL ;
847864 goto free_cs_object ;
848865 }
@@ -856,8 +873,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
856873 job = hl_cs_allocate_job (hdev , queue_type ,
857874 is_kernel_allocated_cb );
858875 if (!job ) {
859- atomic64_inc (
860- & hpriv -> ctx -> cs_counters .out_of_mem_drop_cnt );
876+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
861877 atomic64_inc (& cntr -> out_of_mem_drop_cnt );
862878 dev_err (hdev -> dev , "Failed to allocate a new job\n" );
863879 rc = - ENOMEM ;
@@ -891,7 +907,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
891907
892908 rc = cs_parser (hpriv , job );
893909 if (rc ) {
894- atomic64_inc (& hpriv -> ctx -> cs_counters .parsing_drop_cnt );
910+ atomic64_inc (& ctx -> cs_counters .parsing_drop_cnt );
895911 atomic64_inc (& cntr -> parsing_drop_cnt );
896912 dev_err (hdev -> dev ,
897913 "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n" ,
@@ -901,8 +917,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
901917 }
902918
903919 if (int_queues_only ) {
904- atomic64_inc (& hpriv -> ctx -> cs_counters .parsing_drop_cnt );
905- atomic64_inc (& cntr -> parsing_drop_cnt );
920+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
921+ atomic64_inc (& cntr -> validation_drop_cnt );
906922 dev_err (hdev -> dev ,
907923 "Reject CS %d.%llu because only internal queues jobs are present\n" ,
908924 cs -> ctx -> asid , cs -> sequence );
@@ -1042,7 +1058,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
10421058}
10431059
10441060static int cs_ioctl_extract_signal_seq (struct hl_device * hdev ,
1045- struct hl_cs_chunk * chunk , u64 * signal_seq )
1061+ struct hl_cs_chunk * chunk , u64 * signal_seq , struct hl_ctx * ctx )
10461062{
10471063 u64 * signal_seq_arr = NULL ;
10481064 u32 size_to_copy , signal_seq_arr_len ;
@@ -1052,6 +1068,8 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
10521068
10531069 /* currently only one signal seq is supported */
10541070 if (signal_seq_arr_len != 1 ) {
1071+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1072+ atomic64_inc (& hdev -> aggregated_cs_counters .validation_drop_cnt );
10551073 dev_err (hdev -> dev ,
10561074 "Wait for signal CS supports only one signal CS seq\n" );
10571075 return - EINVAL ;
@@ -1060,13 +1078,18 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
10601078 signal_seq_arr = kmalloc_array (signal_seq_arr_len ,
10611079 sizeof (* signal_seq_arr ),
10621080 GFP_ATOMIC );
1063- if (!signal_seq_arr )
1081+ if (!signal_seq_arr ) {
1082+ atomic64_inc (& ctx -> cs_counters .out_of_mem_drop_cnt );
1083+ atomic64_inc (& hdev -> aggregated_cs_counters .out_of_mem_drop_cnt );
10641084 return - ENOMEM ;
1085+ }
10651086
10661087 size_to_copy = chunk -> num_signal_seq_arr * sizeof (* signal_seq_arr );
10671088 if (copy_from_user (signal_seq_arr ,
10681089 u64_to_user_ptr (chunk -> signal_seq_arr ),
10691090 size_to_copy )) {
1091+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1092+ atomic64_inc (& hdev -> aggregated_cs_counters .validation_drop_cnt );
10701093 dev_err (hdev -> dev ,
10711094 "Failed to copy signal seq array from user\n" );
10721095 rc = - EFAULT ;
@@ -1153,23 +1176,28 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11531176 struct hl_device * hdev = hpriv -> hdev ;
11541177 struct hl_cs_compl * sig_waitcs_cmpl ;
11551178 u32 q_idx , collective_engine_id = 0 ;
1179+ struct hl_cs_counters_atomic * cntr ;
11561180 struct hl_fence * sig_fence = NULL ;
11571181 struct hl_ctx * ctx = hpriv -> ctx ;
11581182 enum hl_queue_type q_type ;
11591183 struct hl_cs * cs ;
11601184 u64 signal_seq ;
11611185 int rc ;
11621186
1187+ cntr = & hdev -> aggregated_cs_counters ;
11631188 * cs_seq = ULLONG_MAX ;
11641189
1165- rc = hl_cs_copy_chunk_array (hdev , & cs_chunk_array , chunks , num_chunks );
1190+ rc = hl_cs_copy_chunk_array (hdev , & cs_chunk_array , chunks , num_chunks ,
1191+ ctx );
11661192 if (rc )
11671193 goto out ;
11681194
11691195 /* currently it is guaranteed to have only one chunk */
11701196 chunk = & cs_chunk_array [0 ];
11711197
11721198 if (chunk -> queue_index >= hdev -> asic_prop .max_queues ) {
1199+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1200+ atomic64_inc (& cntr -> validation_drop_cnt );
11731201 dev_err (hdev -> dev , "Queue index %d is invalid\n" ,
11741202 chunk -> queue_index );
11751203 rc = - EINVAL ;
@@ -1181,6 +1209,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11811209 q_type = hw_queue_prop -> type ;
11821210
11831211 if (!hw_queue_prop -> supports_sync_stream ) {
1212+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1213+ atomic64_inc (& cntr -> validation_drop_cnt );
11841214 dev_err (hdev -> dev ,
11851215 "Queue index %d does not support sync stream operations\n" ,
11861216 q_idx );
@@ -1190,6 +1220,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
11901220
11911221 if (cs_type == CS_TYPE_COLLECTIVE_WAIT ) {
11921222 if (!(hw_queue_prop -> collective_mode == HL_COLLECTIVE_MASTER )) {
1223+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1224+ atomic64_inc (& cntr -> validation_drop_cnt );
11931225 dev_err (hdev -> dev ,
11941226 "Queue index %d is invalid\n" , q_idx );
11951227 rc = - EINVAL ;
@@ -1200,12 +1232,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12001232 }
12011233
12021234 if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT ) {
1203- rc = cs_ioctl_extract_signal_seq (hdev , chunk , & signal_seq );
1235+ rc = cs_ioctl_extract_signal_seq (hdev , chunk , & signal_seq , ctx );
12041236 if (rc )
12051237 goto free_cs_chunk_array ;
12061238
12071239 sig_fence = hl_ctx_get_fence (ctx , signal_seq );
12081240 if (IS_ERR (sig_fence )) {
1241+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1242+ atomic64_inc (& cntr -> validation_drop_cnt );
12091243 dev_err (hdev -> dev ,
12101244 "Failed to get signal CS with seq 0x%llx\n" ,
12111245 signal_seq );
@@ -1223,6 +1257,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12231257 container_of (sig_fence , struct hl_cs_compl , base_fence );
12241258
12251259 if (sig_waitcs_cmpl -> type != CS_TYPE_SIGNAL ) {
1260+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1261+ atomic64_inc (& cntr -> validation_drop_cnt );
12261262 dev_err (hdev -> dev ,
12271263 "CS seq 0x%llx is not of a signal CS\n" ,
12281264 signal_seq );
@@ -1270,8 +1306,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
12701306 else if (cs_type == CS_TYPE_COLLECTIVE_WAIT )
12711307 rc = hdev -> asic_funcs -> collective_wait_create_jobs (hdev , ctx ,
12721308 cs , q_idx , collective_engine_id );
1273- else
1309+ else {
1310+ atomic64_inc (& ctx -> cs_counters .validation_drop_cnt );
1311+ atomic64_inc (& cntr -> validation_drop_cnt );
12741312 rc = - EINVAL ;
1313+ }
12751314
12761315 if (rc )
12771316 goto free_cs_object ;
0 commit comments