Skip to content

Commit 84911cf

Browse files
kadesai16rleon
authored andcommitted
RDMA/bnxt_re: post destroy_ah for delayed completion of AH creation
AH create may be called from interrupt context and the driver has a special timeout (8 sec) for this command. This is to avoid soft lockups when the FW command takes more time. The driver returns -ETIMEDOUT and fails create AH, without waiting for the actual completion from firmware. When the FW completion is received, use the is_waiter_alive flag to avoid the regular completion path. If the create_ah opcode is detected in the completion path and it does not have a waiter alive, the driver will fetch ah_id from the successful firmware completion in interrupt context and send a destroy_ah command for the same ah_id. This special post is done in a quick manner using the helper function __send_message_no_waiter. timeout_send is only used for debugging purposes. If the timeout_send value keeps incrementing, it indicates an out-of-sync active AH counter between driver and firmware. This is a limitation, but graceful handling is possible in the future. Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com> Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com> Link: https://lore.kernel.org/r/1686308514-11996-13-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent b6c7256 commit 84911cf

2 files changed

Lines changed: 110 additions & 0 deletions

File tree

drivers/infiniband/hw/bnxt_re/qplib_rcfw.c

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,73 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie, u8 opcode)
175175
return -ETIMEDOUT;
176176
};
177177

178+
/* __send_message_no_waiter - get cookie and post the message.
179+
* @rcfw - rcfw channel instance of rdev
180+
* @msg - qplib message internal
181+
*
182+
* This function will just post and don't bother about completion.
183+
* Current design of this function is -
184+
* user must hold the completion queue hwq->lock.
185+
* user must have used existing completion and free the resources.
186+
* this function will not check queue full condition.
187+
* this function will explicitly set is_waiter_alive=false.
188+
* current use case is - send destroy_ah if create_ah is return
189+
* after waiter of create_ah is lost. It can be extended for other
190+
* use case as well.
191+
*
192+
* Returns: Nothing
193+
*
194+
*/
195+
static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw,
196+
struct bnxt_qplib_cmdqmsg *msg)
197+
{
198+
struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq;
199+
struct bnxt_qplib_hwq *hwq = &cmdq->hwq;
200+
struct bnxt_qplib_crsqe *crsqe;
201+
struct bnxt_qplib_cmdqe *cmdqe;
202+
u32 sw_prod, cmdq_prod;
203+
u16 cookie, cbit;
204+
u32 bsize;
205+
u8 *preq;
206+
207+
cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE;
208+
cbit = cookie % rcfw->cmdq_depth;
209+
210+
set_bit(cbit, cmdq->cmdq_bitmap);
211+
__set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie));
212+
crsqe = &rcfw->crsqe_tbl[cbit];
213+
214+
/* Set cmd_size in terms of 16B slots in req. */
215+
bsize = bnxt_qplib_set_cmd_slots(msg->req);
216+
/* GET_CMD_SIZE would return number of slots in either case of tlv
217+
* and non-tlv commands after call to bnxt_qplib_set_cmd_slots()
218+
*/
219+
crsqe->is_internal_cmd = true;
220+
crsqe->is_waiter_alive = false;
221+
crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
222+
223+
preq = (u8 *)msg->req;
224+
do {
225+
/* Locate the next cmdq slot */
226+
sw_prod = HWQ_CMP(hwq->prod, hwq);
227+
cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL);
228+
/* Copy a segment of the req cmd to the cmdq */
229+
memset(cmdqe, 0, sizeof(*cmdqe));
230+
memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe)));
231+
preq += min_t(u32, bsize, sizeof(*cmdqe));
232+
bsize -= min_t(u32, bsize, sizeof(*cmdqe));
233+
hwq->prod++;
234+
} while (bsize > 0);
235+
cmdq->seq_num++;
236+
237+
cmdq_prod = hwq->prod;
238+
atomic_inc(&rcfw->timeout_send);
239+
/* ring CMDQ DB */
240+
wmb();
241+
writel(cmdq_prod, cmdq->cmdq_mbox.prod);
242+
writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
243+
}
244+
178245
static int __send_message(struct bnxt_qplib_rcfw *rcfw,
179246
struct bnxt_qplib_cmdqmsg *msg)
180247
{
@@ -219,6 +286,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw,
219286
crsqe->free_slots = free_slots;
220287
crsqe->resp = (struct creq_qp_event *)msg->resp;
221288
crsqe->resp->cookie = cpu_to_le16(cookie);
289+
crsqe->is_internal_cmd = false;
222290
crsqe->is_waiter_alive = true;
223291
crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz);
224292
if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) {
@@ -343,6 +411,26 @@ static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw,
343411
return 0;
344412
}
345413

414+
/* This function will just post and do not bother about completion */
415+
static void __destroy_timedout_ah(struct bnxt_qplib_rcfw *rcfw,
416+
struct creq_create_ah_resp *create_ah_resp)
417+
{
418+
struct bnxt_qplib_cmdqmsg msg = {};
419+
struct cmdq_destroy_ah req = {};
420+
421+
bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req,
422+
CMDQ_BASE_OPCODE_DESTROY_AH,
423+
sizeof(req));
424+
req.ah_cid = create_ah_resp->xid;
425+
msg.req = (struct cmdq_base *)&req;
426+
msg.req_sz = sizeof(req);
427+
__send_message_no_waiter(rcfw, &msg);
428+
dev_info_ratelimited(&rcfw->pdev->dev,
429+
"From %s: ah_cid = %d timeout_send %d\n",
430+
__func__, req.ah_cid,
431+
atomic_read(&rcfw->timeout_send));
432+
}
433+
346434
/**
347435
* __bnxt_qplib_rcfw_send_message - qplib interface to send
348436
* and complete rcfw command.
@@ -563,6 +651,8 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
563651
if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap))
564652
dev_warn(&pdev->dev,
565653
"CMD bit %d was not requested\n", cbit);
654+
if (crsqe->is_internal_cmd && !qp_event->status)
655+
atomic_dec(&rcfw->timeout_send);
566656

567657
if (crsqe->is_waiter_alive) {
568658
if (crsqe->resp)
@@ -579,6 +669,24 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
579669
crsqe->resp = NULL;
580670

581671
hwq->cons += req_size;
672+
673+
/* This is a case to handle below scenario -
674+
* Create AH is completed successfully by firmware,
675+
* but completion took more time and driver already lost
676+
* the context of create_ah from caller.
677+
* We have already returned failure for create_ah verbs,
678+
* so let's destroy the same address vector since it is
679+
* no more used in stack. We don't care about completion
680+
* in __send_message_no_waiter.
681+
* If destroy_ah is failed by firmware, there will be AH
682+
* resource leak and relatively not critical + unlikely
683+
* scenario. Current design is not to handle such case.
684+
*/
685+
if (!is_waiter_alive && !qp_event->status &&
686+
qp_event->event == CREQ_QP_EVENT_EVENT_CREATE_AH)
687+
__destroy_timedout_ah(rcfw,
688+
(struct creq_create_ah_resp *)
689+
qp_event);
582690
spin_unlock_irqrestore(&hwq->lock, flags);
583691
}
584692
*num_wait += wait_cmds;

drivers/infiniband/hw/bnxt_re/qplib_rcfw.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ struct bnxt_qplib_crsqe {
153153
/* Free slots at the time of submission */
154154
u32 free_slots;
155155
bool is_waiter_alive;
156+
bool is_internal_cmd;
156157
};
157158

158159
struct bnxt_qplib_rcfw_sbuf {
@@ -225,6 +226,7 @@ struct bnxt_qplib_rcfw {
225226
u32 cmdq_depth;
226227
atomic_t rcfw_intr_enabled;
227228
struct semaphore rcfw_inflight;
229+
atomic_t timeout_send;
228230
};
229231

230232
struct bnxt_qplib_cmdqmsg {

0 commit comments

Comments
 (0)