
Commit afcae7d

chucklever authored and rleon committed
RDMA/core: add rdma_rw_max_send_wr() helper for SQ sizing
svc_rdma_accept() computes sc_sq_depth as the sum of rq_depth and the
number of rdma_rw contexts (ctxts). This value is used to allocate the
Send CQ and to initialize the sc_sq_avail credit pool. However, when
the device uses memory registration for RDMA operations,
rdma_rw_init_qp() inflates the QP's max_send_wr by a factor of three
per context to account for REG and INV work requests. The Send CQ and
credit pool remain sized for only one work request per context,
causing Send Queue exhaustion under heavy NFS WRITE workloads.

Introduce rdma_rw_max_send_wr() to compute the actual number of Send
Queue entries required for a given number of rdma_rw contexts. Upper
layer protocols call this helper before creating a Queue Pair so that
their Send CQs and credit accounting match the QP's true capacity.

Update svc_rdma_accept() to use rdma_rw_max_send_wr() when computing
sc_sq_depth, ensuring the credit pool reflects the work requests that
rdma_rw_init_qp() will reserve.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Fixes: 00bd143 ("RDMA/rw: Support threshold for registration vs scattering to local pages")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20260128005400.25147-5-cel@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent bea28ac commit afcae7d
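To make the mismatch concrete, here is a small userspace model of the
accounting described above. The numbers (rq_depth = 128, ctxts = 64) are
hypothetical, chosen only to show the shortfall on a device where
rdma_rw_can_use_mr() returns true.

#include <stdio.h>

int main(void)
{
	unsigned int rq_depth = 128;	/* hypothetical RQ depth */
	unsigned int ctxts = 64;	/* hypothetical rdma_rw contexts */

	/* Old accounting: Send CQ and sc_sq_avail sized for one
	 * work request per rdma_rw context.
	 */
	unsigned int old_sq_depth = rq_depth + ctxts;

	/* What rdma_rw_init_qp() actually reserves on an MR device:
	 * three WRs per context (the RDMA op itself, plus REG and INV).
	 */
	unsigned int qp_max_send_wr = rq_depth + 3 * ctxts;

	printf("old sc_sq_depth  = %u\n", old_sq_depth);	/* 192 */
	printf("QP max_send_wr   = %u\n", qp_max_send_wr);	/* 320 */
	printf("credit shortfall = %u\n",
	       qp_max_send_wr - old_sq_depth);			/* 128 */
	return 0;
}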

3 files changed

Lines changed: 46 additions & 17 deletions


drivers/infiniband/core/rw.c

Lines changed: 38 additions & 15 deletions
@@ -1071,34 +1071,57 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
 }
 EXPORT_SYMBOL(rdma_rw_mr_factor);
 
+/**
+ * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts
+ * @dev: RDMA device
+ * @port_num: port number
+ * @max_rdma_ctxs: number of rdma_rw_ctx structures
+ * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if
+ *	data integrity will be enabled on the QP)
+ *
+ * Returns the total number of Send Queue entries needed for
+ * @max_rdma_ctxs. The result accounts for memory registration and
+ * invalidation work requests when the device requires them.
+ *
+ * ULPs use this to size Send Queues and Send CQs before creating a
+ * Queue Pair.
+ */
+unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
+				 unsigned int max_rdma_ctxs, u32 create_flags)
+{
+	unsigned int factor = 1;
+	unsigned int result;
+
+	if (create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+	    rdma_rw_can_use_mr(dev, port_num))
+		factor += 2; /* reg + inv */
+
+	if (check_mul_overflow(factor, max_rdma_ctxs, &result))
+		return UINT_MAX;
+	return result;
+}
+EXPORT_SYMBOL(rdma_rw_max_send_wr);
+
 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
 {
-	u32 factor;
+	unsigned int factor = 1;
 
 	WARN_ON_ONCE(attr->port_num == 0);
 
 	/*
-	 * Each context needs at least one RDMA READ or WRITE WR.
-	 *
-	 * For some hardware we might need more, eventually we should ask the
-	 * HCA driver for a multiplier here.
-	 */
-	factor = 1;
-
-	/*
-	 * If the device needs MRs to perform RDMA READ or WRITE operations,
-	 * we'll need two additional MRs for the registrations and the
-	 * invalidation.
+	 * If the device uses MRs to perform RDMA READ or WRITE operations,
+	 * or if data integrity is enabled, account for registration and
+	 * invalidation work requests.
 	 */
 	if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
 	    rdma_rw_can_use_mr(dev, attr->port_num))
-		factor += 2; /* inv + reg */
+		factor += 2; /* reg + inv */
 
 	attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
 
 	/*
-	 * But maybe we were just too high in the sky and the device doesn't
-	 * even support all we need, and we'll have to live with what we get..
+	 * The device might not support all we need, and we'll have to
+	 * live with what we get.
	 */
 	attr->cap.max_send_wr =
 		min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
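
For context, here is a minimal sketch of how an upper layer protocol might
use the new helper to size its Send CQ before creating a Queue Pair. The
function ulp_create_qp(), its parameters, and the use of
ib_alloc_cq_any()/rdma_create_qp() are illustrative assumptions, not part
of this patch.

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

/* Hypothetical ULP helper (not from this patch): size the Send CQ to
 * cover the WRs that rdma_rw_init_qp() will reserve, then create the QP.
 */
static int ulp_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
			 unsigned int send_wrs, unsigned int rdma_ctxs)
{
	struct ib_device *dev = id->device;
	struct ib_qp_init_attr qp_attr = { };
	struct ib_cq *send_cq;
	unsigned int sq_depth;

	/* One entry per regular Send WR, plus up to three entries per
	 * rdma_rw context on devices that register memory.
	 */
	sq_depth = send_wrs +
		   rdma_rw_max_send_wr(dev, id->port_num, rdma_ctxs, 0);
	sq_depth = min_t(u32, sq_depth, dev->attrs.max_qp_wr);

	send_cq = ib_alloc_cq_any(dev, NULL, sq_depth, IB_POLL_WORKQUEUE);
	if (IS_ERR(send_cq))
		return PTR_ERR(send_cq);

	qp_attr.cap.max_send_wr = send_wrs;
	qp_attr.cap.max_rdma_ctxs = rdma_ctxs;
	qp_attr.cap.max_send_sge = 1;
	qp_attr.cap.max_recv_wr = send_wrs;	/* recv sizing elided */
	qp_attr.cap.max_recv_sge = 1;
	qp_attr.send_cq = send_cq;
	qp_attr.recv_cq = send_cq;		/* shared CQ, for brevity */
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;

	/* rdma_create_qp() invokes rdma_rw_init_qp(), which inflates
	 * max_send_wr by the same per-context factor the helper used,
	 * so the Send CQ above cannot be overrun.
	 */
	return rdma_create_qp(id, pd, &qp_attr);
}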

include/rdma/rw.h

Lines changed: 2 additions & 0 deletions
@@ -86,6 +86,8 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
 
 unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
 			       unsigned int maxpages);
+unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num,
+				 unsigned int max_rdma_ctxs, u32 create_flags);
 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
 void rdma_rw_cleanup_mrs(struct ib_qp *qp);

net/sunrpc/xprtrdma/svc_rdma_transport.c

Lines changed: 6 additions & 2 deletions
@@ -462,15 +462,19 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		newxprt->sc_max_bc_requests = 2;
 	}
 
-	/* Arbitrary estimate of the needed number of rdma_rw contexts.
+	/* Estimate the needed number of rdma_rw contexts. The maximum
+	 * Read and Write chunks have one segment each. Each request
+	 * can involve one Read chunk and either a Write chunk or Reply
+	 * chunk; thus a factor of three.
 	 */
 	maxpayload = min(xprt->xpt_server->sv_max_payload,
 			 RPCSVC_MAXPAYLOAD_RDMA);
 	ctxts = newxprt->sc_max_requests * 3 *
 		rdma_rw_mr_factor(dev, newxprt->sc_port_num,
 				  maxpayload >> PAGE_SHIFT);
 
-	newxprt->sc_sq_depth = rq_depth + ctxts;
+	newxprt->sc_sq_depth = rq_depth +
+		rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0);
 	if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
 		newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
 	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
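
Putting the pieces together, here is a short userspace trace of the new
server-side arithmetic. The values for sc_max_requests, rq_depth, and the
rdma_rw_mr_factor() result are hypothetical; only the formulas come from
the code above.

#include <stdio.h>

int main(void)
{
	unsigned int sc_max_requests = 64;	/* hypothetical */
	unsigned int mr_factor = 1;	/* rdma_rw_mr_factor() result */
	unsigned int rq_depth = 192;	/* hypothetical */
	unsigned int max_qp_wr = 16384;	/* dev->attrs.max_qp_wr */

	/* Factor of three: one Read chunk plus a Write or Reply chunk. */
	unsigned int ctxts = sc_max_requests * 3 * mr_factor;

	/* rdma_rw_max_send_wr(): three SQ entries per context on a
	 * device that uses memory registration.
	 */
	unsigned int sq_depth = rq_depth + 3 * ctxts;

	if (sq_depth > max_qp_wr)	/* clamp, as svc_rdma_accept() does */
		sq_depth = max_qp_wr;

	printf("ctxts = %u, sc_sq_depth = %u\n", ctxts, sq_depth);
	/* ctxts = 192, sc_sq_depth = 768 (the old code computed 384) */
	return 0;
}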
