@@ -7744,137 +7744,28 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
77447744 return !!ecc_data -> is_critical ;
77457745}
77467746
7747- /*
7748- * gaudi2_queue_idx_dec - decrement queue index (pi/ci) and handle wrap
7749- *
7750- * @idx: the current pi/ci value
7751- * @q_len: the queue length (power of 2)
7752- *
7753- * @return the cyclically decremented index
7754- */
7755- static inline u32 gaudi2_queue_idx_dec (u32 idx , u32 q_len )
7756- {
7757- u32 mask = q_len - 1 ;
7758-
7759- /*
7760- * modular decrement is equivalent to adding (q_len - 1);
7761- * we then keep only the LSBs to make sure the value is in the
7762- * range [0, q_len - 1]
7763- */
7764- return (idx + q_len - 1 ) & mask ;
7765- }
7766-
7767- /**
7768- * gaudi2_print_sw_config_stream_data - print SW config stream data
7769- *
7770- * @hdev: pointer to the habanalabs device structure
7771- * @stream: the QMAN's stream
7772- * @qman_base: base address of QMAN registers block
7773- */
7774- static void gaudi2_print_sw_config_stream_data (struct hl_device * hdev ,
7775- u32 stream , u64 qman_base )
7747+ static void print_lower_qman_data_on_err (struct hl_device * hdev , u64 qman_base )
77767748{
7777- u64 cq_ptr_lo , cq_ptr_hi , cq_tsize , cq_ptr ;
7778- u32 cq_ptr_lo_off , size ;
7749+ u32 lo , hi , cq_ptr_size , arc_cq_ptr_size ;
7750+ u64 cq_ptr , arc_cq_ptr , cp_current_inst ;
77797751
7780- cq_ptr_lo_off = mmDCORE0_TPC0_QM_CQ_PTR_LO_1 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 ;
7781-
7782- cq_ptr_lo = qman_base + (mmDCORE0_TPC0_QM_CQ_PTR_LO_0 - mmDCORE0_TPC0_QM_BASE ) +
7783- stream * cq_ptr_lo_off ;
7784-
7785- cq_ptr_hi = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_PTR_HI_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 );
7786-
7787- cq_tsize = cq_ptr_lo + (mmDCORE0_TPC0_QM_CQ_TSIZE_0 - mmDCORE0_TPC0_QM_CQ_PTR_LO_0 );
7788-
7789- cq_ptr = (((u64 ) RREG32 (cq_ptr_hi )) << 32 ) | RREG32 (cq_ptr_lo );
7790- size = RREG32 (cq_tsize );
7791- dev_info (hdev -> dev , "stop on err: stream: %u, addr: %#llx, size: %x\n" ,
7792- stream , cq_ptr , size );
7793- }
7794-
7795- /**
7796- * gaudi2_print_last_pqes_on_err - print last PQEs on error
7797- *
7798- * @hdev: pointer to the habanalabs device structure
7799- * @qid_base: first QID of the QMAN (out of 4 streams)
7800- * @stream: the QMAN's stream
7801- * @qman_base: base address of QMAN registers block
7802- * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE)
7803- */
7804- static void gaudi2_print_last_pqes_on_err (struct hl_device * hdev , u32 qid_base , u32 stream ,
7805- u64 qman_base , bool pr_sw_conf )
7806- {
7807- u32 ci , qm_ci_stream_off ;
7808- struct hl_hw_queue * q ;
7809- u64 pq_ci ;
7810- int i ;
7752+ lo = RREG32 (qman_base + QM_CQ_PTR_LO_4_OFFSET );
7753+ hi = RREG32 (qman_base + QM_CQ_PTR_HI_4_OFFSET );
7754+ cq_ptr = ((u64 ) hi ) << 32 | lo ;
7755+ cq_ptr_size = RREG32 (qman_base + QM_CQ_TSIZE_4_OFFSET );
78117756
7812- q = & hdev -> kernel_queues [qid_base + stream ];
7813-
7814- qm_ci_stream_off = mmDCORE0_TPC0_QM_PQ_CI_1 - mmDCORE0_TPC0_QM_PQ_CI_0 ;
7815- pq_ci = qman_base + (mmDCORE0_TPC0_QM_PQ_CI_0 - mmDCORE0_TPC0_QM_BASE ) +
7816- stream * qm_ci_stream_off ;
7817-
7818- hdev -> asic_funcs -> hw_queues_lock (hdev );
7819-
7820- if (pr_sw_conf )
7821- gaudi2_print_sw_config_stream_data (hdev , stream , qman_base );
7822-
7823- ci = RREG32 (pq_ci );
7824-
7825- /* we should start printing from ci - 1 */
7826- ci = gaudi2_queue_idx_dec (ci , HL_QUEUE_LENGTH );
7827-
7828- for (i = 0 ; i < PQ_FETCHER_CACHE_SIZE ; i ++ ) {
7829- struct hl_bd * bd ;
7830- u64 addr ;
7831- u32 len ;
7832-
7833- bd = q -> kernel_address ;
7834- bd += ci ;
7835-
7836- len = le32_to_cpu (bd -> len );
7837- /* len 0 means uninitialized entry- break */
7838- if (!len )
7839- break ;
7840-
7841- addr = le64_to_cpu (bd -> ptr );
7842-
7843- dev_info (hdev -> dev , "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %x\n" ,
7844- stream , ci , addr , len );
7845-
7846- /* get previous ci, wrap if needed */
7847- ci = gaudi2_queue_idx_dec (ci , HL_QUEUE_LENGTH );
7848- }
7849-
7850- hdev -> asic_funcs -> hw_queues_unlock (hdev );
7851- }
7852-
7853- /**
7854- * print_qman_data_on_err - extract QMAN data on error
7855- *
7856- * @hdev: pointer to the habanalabs device structure
7857- * @qid_base: first QID of the QMAN (out of 4 streams)
7858- * @stream: the QMAN's stream
7859- * @qman_base: base address of QMAN registers block
7860- *
7861- * This function attempts to extract as much data as possible on QMAN error.
7862- * On upper CP print the SW config stream data and last 8 PQEs.
7863- * On lower CP print SW config data and last PQEs of ALL 4 upper CPs
7864- */
7865- static void print_qman_data_on_err (struct hl_device * hdev , u32 qid_base , u32 stream , u64 qman_base )
7866- {
7867- u32 i ;
7868-
7869- if (stream != QMAN_STREAMS ) {
7870- gaudi2_print_last_pqes_on_err (hdev , qid_base , stream , qman_base , true);
7871- return ;
7872- }
7757+ lo = RREG32 (qman_base + QM_ARC_CQ_PTR_LO_OFFSET );
7758+ hi = RREG32 (qman_base + QM_ARC_CQ_PTR_HI_OFFSET );
7759+ arc_cq_ptr = ((u64 ) hi ) << 32 | lo ;
7760+ arc_cq_ptr_size = RREG32 (qman_base + QM_ARC_CQ_TSIZE_OFFSET );
78737761
7874- gaudi2_print_sw_config_stream_data (hdev , stream , qman_base );
7762+ lo = RREG32 (qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET );
7763+ hi = RREG32 (qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET );
7764+ cp_current_inst = ((u64 ) hi ) << 32 | lo ;
78757765
7876- for (i = 0 ; i < QMAN_STREAMS ; i ++ )
7877- gaudi2_print_last_pqes_on_err (hdev , qid_base , i , qman_base , false);
7766+ dev_info (hdev -> dev ,
7767+ "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n" ,
7768+ cq_ptr , cq_ptr_size , arc_cq_ptr , arc_cq_ptr_size , cp_current_inst );
78787769}
78797770
78807771static int gaudi2_handle_qman_err_generic (struct hl_device * hdev , u16 event_type ,
@@ -7912,7 +7803,8 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
79127803 error_count ++ ;
79137804 }
79147805
7915- print_qman_data_on_err (hdev , qid_base , i , qman_base );
7806+ if (i == QMAN_STREAMS )
7807+ print_lower_qman_data_on_err (hdev , qman_base );
79167808 }
79177809
79187810 arb_err_val = RREG32 (arb_err_addr );
0 commit comments