Commit 5ee62b4

chucklever authored and rleon committed
svcrdma: use bvec-based RDMA read/write API
Convert svcrdma to the bvec-based RDMA API introduced earlier in this series. The bvec-based RDMA API eliminates the intermediate scatterlist conversion step, allowing direct DMA mapping from bio_vec arrays. This simplifies the svc_rdma_rw_ctxt structure by removing the chained SG table management.

The structure retains an inline array approach similar to the previous scatterlist implementation: an inline bvec array sized to max_send_sge handles most I/O operations without additional allocation. Larger requests fall back to dynamic allocation. This preserves the allocation-free fast path for typical NFS operations while supporting arbitrarily large transfers.

The bvec API handles all device types internally, including iWARP devices, which require memory registration. No explicit fallback path is needed.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20260128005400.25147-6-cel@kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
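In condensed form, the pattern this commit adopts looks like the sketch below. This is distilled from the diff that follows, not a verbatim excerpt; error handling and context caching are omitted:

    /* rw_first_bvec[] is a flexible array member sized at allocation
     * time to the device's max_send_sge; rw_bvec points either at it
     * or at a heap-allocated fallback for larger requests.
     */
    struct svc_rdma_rw_ctxt {
            struct llist_node       rw_node;
            struct list_head        rw_list;
            struct rdma_rw_ctx      rw_ctx;
            unsigned int            rw_nents;
            unsigned int            rw_first_bvec_nents;
            struct bio_vec          *rw_bvec;
            struct bio_vec          rw_first_bvec[];
    };

    /* At I/O time, the inline array serves typical NFS operations
     * without allocating; only larger transfers hit the heap.
     */
    if (nr_bvec <= ctxt->rw_first_bvec_nents)
            ctxt->rw_bvec = ctxt->rw_first_bvec;
    else
            ctxt->rw_bvec = kmalloc_array_node(nr_bvec, sizeof(*ctxt->rw_bvec),
                                               GFP_KERNEL, ibdev_to_node(dev));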
1 parent afcae7d commit 5ee62b4

1 file changed: 86 additions & 69 deletions

net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -5,6 +5,8 @@
  * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
  */
 
+#include <linux/bvec.h>
+#include <linux/overflow.h>
 #include <rdma/rw.h>
 
 #include <linux/sunrpc/xdr.h>
@@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
 /* Each R/W context contains state for one chain of RDMA Read or
  * Write Work Requests.
  *
- * Each WR chain handles a single contiguous server-side buffer,
- * because scatterlist entries after the first have to start on
- * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ * Each WR chain handles a single contiguous server-side buffer.
+ * - each xdr_buf iovec is a single contiguous buffer
+ * - the xdr_buf pages array is a single contiguous buffer because the
+ *   second through the last element always start on a page boundary
  *
  * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
  * from a client may contain a unique R_key, so each WR chain moves
  * up to one segment at a time.
  *
- * The scatterlist makes this data structure over 4KB in size. To
- * make it less likely to fail, and to handle the allocation for
- * smaller I/O requests without disabling bottom-halves, these
- * contexts are created on demand, but cached and reused until the
- * controlling svcxprt_rdma is destroyed.
+ * The inline bvec array is sized to handle most I/O requests without
+ * additional allocation. Larger requests fall back to dynamic allocation.
+ * These contexts are created on demand, but cached and reused until
+ * the controlling svcxprt_rdma is destroyed.
  */
 struct svc_rdma_rw_ctxt {
 	struct llist_node	rw_node;
 	struct list_head	rw_list;
 	struct rdma_rw_ctx	rw_ctx;
 	unsigned int		rw_nents;
-	unsigned int		rw_first_sgl_nents;
-	struct sg_table		rw_sg_table;
-	struct scatterlist	rw_first_sgl[];
+	unsigned int		rw_first_bvec_nents;
+	struct bio_vec		*rw_bvec;
+	struct bio_vec		rw_first_bvec[];
 };
 
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+				 struct svc_rdma_rw_ctxt *ctxt);
+
 static inline struct svc_rdma_rw_ctxt *
 svc_rdma_next_ctxt(struct list_head *list)
 {
@@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list)
 }
 
 static struct svc_rdma_rw_ctxt *
-svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec)
 {
 	struct ib_device *dev = rdma->sc_cm_id->device;
-	unsigned int first_sgl_nents = dev->attrs.max_send_sge;
+	unsigned int first_bvec_nents = dev->attrs.max_send_sge;
 	struct svc_rdma_rw_ctxt *ctxt;
 	struct llist_node *node;
 
@@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
 	if (node) {
 		ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
 	} else {
-		ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
+		ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec,
+						first_bvec_nents),
 				    GFP_KERNEL, ibdev_to_node(dev));
 		if (!ctxt)
 			goto out_noctx;
 
 		INIT_LIST_HEAD(&ctxt->rw_list);
-		ctxt->rw_first_sgl_nents = first_sgl_nents;
+		ctxt->rw_first_bvec_nents = first_bvec_nents;
 	}
 
-	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
-	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
-				   ctxt->rw_sg_table.sgl,
-				   first_sgl_nents))
-		goto out_free;
+	if (nr_bvec <= ctxt->rw_first_bvec_nents) {
+		ctxt->rw_bvec = ctxt->rw_first_bvec;
+	} else {
+		ctxt->rw_bvec = kmalloc_array_node(nr_bvec,
+						   sizeof(*ctxt->rw_bvec),
+						   GFP_KERNEL,
+						   ibdev_to_node(dev));
+		if (!ctxt->rw_bvec)
+			goto out_free;
+	}
 	return ctxt;
 
 out_free:
-	kfree(ctxt);
+	/* Return cached contexts to cache; free freshly allocated ones */
+	if (node)
+		svc_rdma_put_rw_ctxt(rdma, ctxt);
+	else
+		kfree(ctxt);
 out_noctx:
-	trace_svcrdma_rwctx_empty(rdma, sges);
+	trace_svcrdma_rwctx_empty(rdma, nr_bvec);
 	return NULL;
 }
 
 static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
 				   struct llist_head *list)
 {
-	sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
+	if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+		kfree(ctxt->rw_bvec);
 	llist_add(&ctxt->rw_node, list);
 }
 
@@ -123,21 +139,26 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
  * @ctxt: R/W context to prepare
  * @offset: RDMA offset
  * @handle: RDMA tag/handle
+ * @length: total number of bytes in the bvec array
  * @direction: I/O direction
  *
  * Returns on success, the number of WQEs that will be needed
  * on the workqueue, or a negative errno.
  */
 static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
 				struct svc_rdma_rw_ctxt *ctxt,
-				u64 offset, u32 handle,
+				u64 offset, u32 handle, unsigned int length,
 				enum dma_data_direction direction)
 {
+	struct bvec_iter iter = {
+		.bi_size = length,
+	};
 	int ret;
 
-	ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
-			       ctxt->rw_sg_table.sgl, ctxt->rw_nents,
-			       0, offset, handle, direction);
+	ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+				    rdma->sc_port_num,
+				    ctxt->rw_bvec, ctxt->rw_nents,
+				    iter, offset, handle, direction);
 	if (unlikely(ret < 0)) {
 		trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
 					     ctxt->rw_nents, ret);
@@ -175,18 +196,18 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
 {
 	struct llist_node *first, *last;
 	struct svc_rdma_rw_ctxt *ctxt;
-	LLIST_HEAD(free);
 
 	trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
 
 	first = last = NULL;
 	while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
 		list_del(&ctxt->rw_list);
 
-		rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
-				    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
-				    ctxt->rw_nents, dir);
-		__svc_rdma_put_rw_ctxt(ctxt, &free);
+		rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+					 rdma->sc_port_num,
+					 ctxt->rw_bvec, ctxt->rw_nents, dir);
+		if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+			kfree(ctxt->rw_bvec);
 
 		ctxt->rw_node.next = first;
 		first = &ctxt->rw_node;
@@ -414,51 +435,46 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
 	return -ENOTCONN;
 }
 
-/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+/* Build a bvec that covers one kvec in an xdr_buf.
  */
-static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
-			       unsigned int len,
-			       struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info,
+				 unsigned int len,
+				 struct svc_rdma_rw_ctxt *ctxt)
 {
-	struct scatterlist *sg = ctxt->rw_sg_table.sgl;
-
-	sg_set_buf(&sg[0], info->wi_base, len);
+	bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len);
 	info->wi_base += len;
 
 	ctxt->rw_nents = 1;
 }
 
-/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+/* Build a bvec array that covers part of an xdr_buf's pagelist.
  */
-static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
-				    unsigned int remaining,
-				    struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info,
+				      unsigned int remaining,
+				      struct svc_rdma_rw_ctxt *ctxt)
 {
-	unsigned int sge_no, sge_bytes, page_off, page_no;
+	unsigned int bvec_idx, bvec_len, page_off, page_no;
 	const struct xdr_buf *xdr = info->wi_xdr;
-	struct scatterlist *sg;
 	struct page **page;
 
 	page_off = info->wi_next_off + xdr->page_base;
 	page_no = page_off >> PAGE_SHIFT;
 	page_off = offset_in_page(page_off);
 	page = xdr->pages + page_no;
 	info->wi_next_off += remaining;
-	sg = ctxt->rw_sg_table.sgl;
-	sge_no = 0;
+	bvec_idx = 0;
 	do {
-		sge_bytes = min_t(unsigned int, remaining,
-				  PAGE_SIZE - page_off);
-		sg_set_page(sg, *page, sge_bytes, page_off);
-
-		remaining -= sge_bytes;
-		sg = sg_next(sg);
+		bvec_len = min_t(unsigned int, remaining,
+				 PAGE_SIZE - page_off);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len,
+			      page_off);
+		remaining -= bvec_len;
 		page_off = 0;
-		sge_no++;
+		bvec_idx++;
 		page++;
 	} while (remaining);
 
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = bvec_idx;
 }
 
 /* Construct RDMA Write WRs to send a portion of an xdr_buf containing
@@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
 		constructor(info, write_len, ctxt);
 		offset = seg->rs_offset + info->wi_seg_off;
 		ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
-					   DMA_TO_DEVICE);
+					   write_len, DMA_TO_DEVICE);
 		if (ret < 0)
 			return -EIO;
 		percpu_counter_inc(&svcrdma_stat_write);
@@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
 			      const struct kvec *iov)
 {
 	info->wi_base = iov->iov_base;
-	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec,
 				     iov->iov_len);
 }
 
@@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
 {
 	info->wi_xdr = xdr;
 	info->wi_next_off = offset - xdr->head[0].iov_len;
-	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec,
 				     length);
 }
 
@@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
 {
 	struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
 	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
-	unsigned int sge_no, seg_len, len;
+	unsigned int bvec_idx, nr_bvec, seg_len, len, total;
 	struct svc_rdma_rw_ctxt *ctxt;
-	struct scatterlist *sg;
 	int ret;
 
 	len = segment->rs_length;
-	sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
-	ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
+	if (check_add_overflow(head->rc_pageoff, len, &total))
+		return -EINVAL;
+	nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
+	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
 	if (!ctxt)
 		return -ENOMEM;
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = nr_bvec;
 
-	sg = ctxt->rw_sg_table.sgl;
-	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+	for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) {
 		seg_len = min_t(unsigned int, len,
 				PAGE_SIZE - head->rc_pageoff);
 
 		if (!head->rc_pageoff)
 			head->rc_page_count++;
 
-		sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
-			    seg_len, head->rc_pageoff);
-		sg = sg_next(sg);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx],
+			      rqstp->rq_pages[head->rc_curpage],
+			      seg_len, head->rc_pageoff);
 
 		head->rc_pageoff += seg_len;
 		if (head->rc_pageoff == PAGE_SIZE) {
@@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
 	}
 
 	ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
-				   segment->rs_handle, DMA_FROM_DEVICE);
+				   segment->rs_handle, segment->rs_length,
+				   DMA_FROM_DEVICE);
 	if (ret < 0)
 		return -EIO;
 	percpu_counter_inc(&svcrdma_stat_read);
