Skip to content

Commit ae303d8

Browse files
Tomer Tayar authored and ogabbay committed
accel/habanalabs/gaudi2: get the correct QM CQ info upon an error
Upon a QM error, the address/size from both the CQ and the ARC_CQ are printed, although the instruction that led to the error was received from only one of them.

Moreover, in case of a QM undefined opcode, only one of these address/size sets is captured, based on the value of ARC_CQ_PTR. However, this value can be non-zero even when the CQ is the one currently in use, in case the CQ and ARC_CQ are used alternately.

Under the assumption of having a stop-on-error configuration, use the CP_STS.CUR_CQ field to get the relevant CQ for the QM error.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
1 parent 4b0b1fb commit ae303d8

2 files changed

Lines changed: 23 additions & 22 deletions

File tree

drivers/accel/habanalabs/gaudi2/gaudi2.c

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
78607860

78617861
static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
78627862
{
7863-
u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
7864-
u64 cq_ptr, arc_cq_ptr, cp_current_inst;
7865-
7866-
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
7867-
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
7868-
cq_ptr = ((u64) hi) << 32 | lo;
7869-
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
7870-
7871-
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
7872-
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
7873-
arc_cq_ptr = ((u64) hi) << 32 | lo;
7874-
arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
7863+
u32 lo, hi, cq_ptr_size, cp_sts;
7864+
u64 cq_ptr, cp_current_inst;
7865+
bool is_arc_cq;
7866+
7867+
cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
7868+
is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */
7869+
7870+
if (is_arc_cq) {
7871+
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
7872+
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
7873+
cq_ptr = ((u64) hi) << 32 | lo;
7874+
cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
7875+
} else {
7876+
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
7877+
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
7878+
cq_ptr = ((u64) hi) << 32 | lo;
7879+
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
7880+
}
78757881

78767882
lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
78777883
hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
78787884
cp_current_inst = ((u64) hi) << 32 | lo;
78797885

78807886
dev_info(hdev->dev,
7881-
"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
7882-
cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
7887+
"LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
7888+
is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
78837889

78847890
if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
7885-
if (arc_cq_ptr) {
7886-
hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr;
7887-
hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
7888-
} else {
7889-
hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
7890-
hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
7891-
}
7892-
7891+
hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
7892+
hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
78937893
hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
78947894
}
78957895
}

drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@
250250
#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
251251
#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
252252

253+
#define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE)
253254
#define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
254255
#define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
255256

0 commit comments

Comments
 (0)