Skip to content

Commit b6c7256

Browse files
kadesai16 authored and rleon committed
RDMA/bnxt_re: Add firmware stall check detection
Every completion updates the last_seen value, in units of jiffies. The last_seen field is used to know whether the firmware is alive and is useful for detecting a firmware stall. The non-blocking interface __wait_for_resp has logic to detect a firmware stall: after every 10-second interval, if __wait_for_resp has not received a completion for a given command, it checks for the firmware stall condition. If the current jiffies value is greater than last_seen jiffies + RCFW_FW_STALL_TIMEOUT_SEC * HZ, it is a firmware stall. Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> Link: https://lore.kernel.org/r/1686308514-11996-12-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent 691eb7c commit b6c7256

2 files changed

Lines changed: 39 additions & 10 deletions

File tree

drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,13 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
112112
do {
113113
if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags))
114114
return bnxt_qplib_map_rc(opcode);
115+
if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
116+
return -ETIMEDOUT;
115117

116-
/* Non zero means command completed */
117118
wait_event_timeout(cmdq->waitq,
118119
!test_bit(cbit, cmdq->cmdq_bitmap),
119-
msecs_to_jiffies(10000));
120+
msecs_to_jiffies(RCFW_FW_STALL_TIMEOUT_SEC
121+
* 1000));
120122

121123
if (!test_bit(cbit, cmdq->cmdq_bitmap))
122124
return 0;
@@ -126,6 +128,11 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
126128
if (!test_bit(cbit, cmdq->cmdq_bitmap))
127129
return 0;
128130

131+
/* Firmware stall is detected */
132+
if (time_after(jiffies, cmdq->last_seen +
133+
(RCFW_FW_STALL_TIMEOUT_SEC * HZ)))
134+
return -ENODEV;
135+
129136
} while (true);
130137
};
131138

@@ -154,6 +161,8 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
154161
do {
155162
if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags))
156163
return bnxt_qplib_map_rc(opcode);
164+
if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
165+
return -ETIMEDOUT;
157166

158167
udelay(1);
159168

@@ -184,9 +193,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw,
184193
hwq = &cmdq->hwq;
185194
pdev = rcfw->pdev;
186195

187-
if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags))
188-
return -ETIMEDOUT;
189-
190196
/* Cmdq are in 16-byte units, each request can consume 1 or more
191197
* cmdqe
192198
*/
@@ -285,14 +291,21 @@ static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie,
285291
do {
286292
if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags))
287293
return bnxt_qplib_map_rc(opcode);
294+
if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
295+
return -ETIMEDOUT;
288296

289297
usleep_range(1000, 1001);
290298

291299
bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet);
292300
if (!test_bit(cbit, cmdq->cmdq_bitmap))
293301
return 0;
294-
if (jiffies_to_msecs(jiffies - issue_time) > 10000)
295-
return -ETIMEDOUT;
302+
if (jiffies_to_msecs(jiffies - issue_time) >
303+
(RCFW_FW_STALL_TIMEOUT_SEC * 1000)) {
304+
/* Firmware stall is detected */
305+
if (time_after(jiffies, cmdq->last_seen +
306+
(RCFW_FW_STALL_TIMEOUT_SEC * HZ)))
307+
return -ENODEV;
308+
}
296309
} while (true);
297310
};
298311

@@ -308,6 +321,8 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw,
308321
/* Prevent posting if f/w is not in a state to process */
309322
if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags))
310323
return -ENXIO;
324+
if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags))
325+
return -ETIMEDOUT;
311326

312327
if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
313328
opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
@@ -375,14 +390,15 @@ static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
375390
/* timed out */
376391
dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",
377392
cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
378-
set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags);
379393
return rc;
380394
}
381395

382396
if (rc) {
383397
spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags);
384398
crsqe = &rcfw->crsqe_tbl[cbit];
385399
crsqe->is_waiter_alive = false;
400+
if (rc == -ENODEV)
401+
set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags);
386402
spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags);
387403
return -ETIMEDOUT;
388404
}
@@ -533,6 +549,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
533549
cookie &= RCFW_MAX_COOKIE_VALUE;
534550
cbit = cookie % rcfw->cmdq_depth;
535551
crsqe = &rcfw->crsqe_tbl[cbit];
552+
553+
if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED,
554+
&rcfw->cmdq.flags),
555+
"QPLIB: Unreponsive rcfw channel detected.!!")) {
556+
dev_info(&pdev->dev,
557+
"rcfw timedout: cookie = %#x, free_slots = %d",
558+
cookie, crsqe->free_slots);
559+
spin_unlock_irqrestore(&hwq->lock, flags);
560+
return rc;
561+
}
562+
536563
if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap))
537564
dev_warn(&pdev->dev,
538565
"CMD bit %d was not requested\n", cbit);
@@ -582,6 +609,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t)
582609
* reading any further.
583610
*/
584611
dma_rmb();
612+
rcfw->cmdq.last_seen = jiffies;
585613

586614
type = creqe->type & CREQ_BASE_TYPE_MASK;
587615
switch (type) {

drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151

5252
#define RCFW_DBR_PCI_BAR_REGION 2
5353
#define RCFW_DBR_BASE_PAGE_SHIFT 12
54+
#define RCFW_FW_STALL_TIMEOUT_SEC 40
5455

5556
/* Cmdq contains a fix number of a 16-Byte slots */
5657
struct bnxt_qplib_cmdqe {
@@ -128,7 +129,6 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req)
128129

129130
#define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1)
130131
#define RCFW_CMD_IS_BLOCKING 0x8000
131-
#define RCFW_BLOCKED_CMD_WAIT_COUNT 20000000UL /* 20 sec */
132132

133133
/* Crsq buf is 1024-Byte */
134134
struct bnxt_qplib_crsbe {
@@ -170,7 +170,7 @@ struct bnxt_qplib_qp_node {
170170

171171
#define FIRMWARE_INITIALIZED_FLAG (0)
172172
#define FIRMWARE_FIRST_FLAG (31)
173-
#define FIRMWARE_TIMED_OUT (3)
173+
#define FIRMWARE_STALL_DETECTED (3)
174174
#define ERR_DEVICE_DETACHED (4)
175175

176176
struct bnxt_qplib_cmdq_mbox {
@@ -185,6 +185,7 @@ struct bnxt_qplib_cmdq_ctx {
185185
wait_queue_head_t waitq;
186186
unsigned long flags;
187187
unsigned long *cmdq_bitmap;
188+
unsigned long last_seen;
188189
u32 seq_num;
189190
};
190191

0 commit comments

Comments
 (0)