  * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
  */
 
+#include <linux/bvec.h>
+#include <linux/overflow.h>
 #include <rdma/rw.h>
 
 #include <linux/sunrpc/xdr.h>
@@ -20,30 +22,33 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
 /* Each R/W context contains state for one chain of RDMA Read or
  * Write Work Requests.
  *
- * Each WR chain handles a single contiguous server-side buffer,
- * because scatterlist entries after the first have to start on
- * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ * Each WR chain handles a single contiguous server-side buffer:
+ * - each xdr_buf iovec is a single contiguous buffer
+ * - the xdr_buf pages array is a single contiguous buffer because
+ *   the second through last elements always start on a page boundary
  *
  * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
  * from a client may contain a unique R_key, so each WR chain moves
  * up to one segment at a time.
  *
- * The scatterlist makes this data structure over 4KB in size. To
- * make it less likely to fail, and to handle the allocation for
- * smaller I/O requests without disabling bottom-halves, these
- * contexts are created on demand, but cached and reused until the
- * controlling svcxprt_rdma is destroyed.
+ * The inline bvec array is sized to handle most I/O requests without
+ * additional allocation. Larger requests fall back to dynamic
+ * allocation. These contexts are created on demand, but cached and
+ * reused until the controlling svcxprt_rdma is destroyed.
  */
 struct svc_rdma_rw_ctxt {
 	struct llist_node	rw_node;
 	struct list_head	rw_list;
 	struct rdma_rw_ctx	rw_ctx;
 	unsigned int		rw_nents;
-	unsigned int		rw_first_sgl_nents;
-	struct sg_table		rw_sg_table;
-	struct scatterlist	rw_first_sgl[];
+	unsigned int		rw_first_bvec_nents;
+	struct bio_vec		*rw_bvec;
+	struct bio_vec		rw_first_bvec[];
 };
 
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+				 struct svc_rdma_rw_ctxt *ctxt);
+
 static inline struct svc_rdma_rw_ctxt *
 svc_rdma_next_ctxt(struct list_head *list)
 {
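The rewritten comment above describes the new sizing strategy: a flexible-array member holds up to max_send_sge bio_vec entries inline, and rw_bvec points either at that inline array or at a separate allocation for larger requests. A minimal userspace-flavored sketch of the same pattern (hypothetical names, not code from this patch):

    #include <stdlib.h>

    struct buf_ctx {
            size_t inline_cap;     /* capacity of the inline array */
            int *vec;              /* points at inline_vec[] or a heap array */
            int inline_vec[];      /* flexible array member, sized at alloc time */
    };

    /* Allocate a context whose inline array holds up to @cap entries. */
    static struct buf_ctx *buf_ctx_alloc(size_t cap)
    {
            struct buf_ctx *ctx = malloc(sizeof(*ctx) + cap * sizeof(int));

            if (ctx)
                    ctx->inline_cap = cap;
            return ctx;
    }

    /* Use the inline array when it is big enough; otherwise fall back
     * to a dedicated heap allocation, as the else arm of
     * svc_rdma_get_rw_ctxt() below does. */
    static int buf_ctx_reserve(struct buf_ctx *ctx, size_t need)
    {
            if (need <= ctx->inline_cap) {
                    ctx->vec = ctx->inline_vec;
                    return 0;
            }
            ctx->vec = malloc(need * sizeof(*ctx->vec));
            return ctx->vec ? 0 : -1;
    }

    /* On release, only a fallback array needs freeing. */
    static void buf_ctx_release(struct buf_ctx *ctx)
    {
            if (ctx->vec != ctx->inline_vec)
                    free(ctx->vec);
    }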
@@ -52,10 +57,10 @@ svc_rdma_next_ctxt(struct list_head *list)
 }
 
 static struct svc_rdma_rw_ctxt *
-svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec)
 {
 	struct ib_device *dev = rdma->sc_cm_id->device;
-	unsigned int first_sgl_nents = dev->attrs.max_send_sge;
+	unsigned int first_bvec_nents = dev->attrs.max_send_sge;
 	struct svc_rdma_rw_ctxt *ctxt;
 	struct llist_node *node;
 
@@ -65,33 +70,44 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
 	if (node) {
 		ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node);
 	} else {
-		ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents),
+		ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec,
+						first_bvec_nents),
 				    GFP_KERNEL, ibdev_to_node(dev));
 		if (!ctxt)
 			goto out_noctx;
 
 		INIT_LIST_HEAD(&ctxt->rw_list);
-		ctxt->rw_first_sgl_nents = first_sgl_nents;
+		ctxt->rw_first_bvec_nents = first_bvec_nents;
 	}
 
-	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
-	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
-				   ctxt->rw_sg_table.sgl,
-				   first_sgl_nents))
-		goto out_free;
+	if (nr_bvec <= ctxt->rw_first_bvec_nents) {
+		ctxt->rw_bvec = ctxt->rw_first_bvec;
+	} else {
+		ctxt->rw_bvec = kmalloc_array_node(nr_bvec,
+						   sizeof(*ctxt->rw_bvec),
+						   GFP_KERNEL,
+						   ibdev_to_node(dev));
+		if (!ctxt->rw_bvec)
+			goto out_free;
+	}
 	return ctxt;
 
 out_free:
-	kfree(ctxt);
+	/* Put a context taken from the cache back on it; free a
+	 * freshly allocated one. */
+	if (node)
+		svc_rdma_put_rw_ctxt(rdma, ctxt);
+	else
+		kfree(ctxt);
 out_noctx:
-	trace_svcrdma_rwctx_empty(rdma, sges);
+	trace_svcrdma_rwctx_empty(rdma, nr_bvec);
 	return NULL;
 }
 
 static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt,
 				   struct llist_head *list)
 {
-	sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents);
+	if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+		kfree(ctxt->rw_bvec);
 	llist_add(&ctxt->rw_node, list);
 }
 
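struct_size() (from <linux/overflow.h>, added to the includes above) computes sizeof(*ctxt) plus first_bvec_nents trailing array elements, saturating at SIZE_MAX on overflow so that kmalloc_node() fails cleanly rather than under-allocating; kmalloc_array_node() gives the fallback array the same protection. A standalone sketch of the check it encapsulates (hypothetical helper, not the kernel's implementation):

    #include <stddef.h>
    #include <stdint.h>

    /* Roughly what struct_size(ctxt, rw_first_bvec, n) guards against:
     * a naive sizeof(*ctxt) + n * elem_size can wrap and produce a
     * too-small allocation. Saturating to SIZE_MAX makes the
     * subsequent allocation fail instead. */
    static size_t struct_size_sketch(size_t header, size_t elem_size, size_t n)
    {
            size_t array_size;

            if (n && elem_size > SIZE_MAX / n)
                    return SIZE_MAX;
            array_size = n * elem_size;
            if (array_size > SIZE_MAX - header)
                    return SIZE_MAX;
            return header + array_size;
    }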
@@ -123,21 +139,26 @@ void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
  * @ctxt: R/W context to prepare
  * @offset: RDMA offset
  * @handle: RDMA tag/handle
+ * @length: total number of bytes in the bvec array
  * @direction: I/O direction
  *
  * Returns on success, the number of WQEs that will be needed
  * on the workqueue, or a negative errno.
  */
 static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
 				struct svc_rdma_rw_ctxt *ctxt,
-				u64 offset, u32 handle,
+				u64 offset, u32 handle, unsigned int length,
 				enum dma_data_direction direction)
 {
+	struct bvec_iter iter = {
+		.bi_size = length,
+	};
 	int ret;
 
-	ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
-			       ctxt->rw_sg_table.sgl, ctxt->rw_nents,
-			       0, offset, handle, direction);
+	ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+				    rdma->sc_port_num,
+				    ctxt->rw_bvec, ctxt->rw_nents,
+				    iter, offset, handle, direction);
 	if (unlikely(ret < 0)) {
 		trace_svcrdma_dma_map_rw_err(rdma, offset, handle,
 					     ctxt->rw_nents, ret);
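Initializing only .bi_size works because the other bvec_iter fields (bi_idx, bi_bvec_done) default to zero, so the iterator starts at element 0, offset 0, and covers exactly length bytes of the array. A small kernel-context sketch (hypothetical function, not part of this patch) of how such an iterator walks a bio_vec array:

    #include <linux/bvec.h>

    /* Walk @bvec from the start and count the bytes the iterator
     * visits; for_each_bvec() stops once start.bi_size bytes have
     * been consumed, even if the array describes more. */
    static size_t count_bvec_bytes(const struct bio_vec *bvec,
                                   unsigned int length)
    {
            struct bvec_iter start = { .bi_size = length, };
            struct bvec_iter iter;
            struct bio_vec bv;
            size_t total = 0;

            for_each_bvec(bv, bvec, iter, start)
                    total += bv.bv_len;
            return total;
    }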
@@ -175,18 +196,18 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
 {
 	struct llist_node *first, *last;
 	struct svc_rdma_rw_ctxt *ctxt;
-	LLIST_HEAD(free);
 
 	trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount);
 
 	first = last = NULL;
 	while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
 		list_del(&ctxt->rw_list);
 
-		rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
-				    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
-				    ctxt->rw_nents, dir);
-		__svc_rdma_put_rw_ctxt(ctxt, &free);
+		rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp,
+					 rdma->sc_port_num,
+					 ctxt->rw_bvec, ctxt->rw_nents, dir);
+		if (ctxt->rw_bvec != ctxt->rw_first_bvec)
+			kfree(ctxt->rw_bvec);
 
 		ctxt->rw_node.next = first;
 		first = &ctxt->rw_node;
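The loop threads the released contexts onto a private chain by hand (first/last); presumably the elided remainder of svc_rdma_cc_release() splices that whole chain back onto the transport's context cache with a single llist_add_batch() call, one atomic exchange instead of one llist_add() per context. A sketch of that stock-API splice (not code from this patch):

    #include <linux/llist.h>

    /* Splice a locally built chain [first .. last] onto @head with
     * one atomic operation. */
    static void release_chain(struct llist_node *first,
                              struct llist_node *last,
                              struct llist_head *head)
    {
            if (first)
                    llist_add_batch(first, last, head);
    }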
@@ -414,51 +435,46 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
 	return -ENOTCONN;
 }
 
-/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+/* Build a bvec that covers one kvec in an xdr_buf.
  */
-static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
-			       unsigned int len,
-			       struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info,
+				 unsigned int len,
+				 struct svc_rdma_rw_ctxt *ctxt)
 {
-	struct scatterlist *sg = ctxt->rw_sg_table.sgl;
-
-	sg_set_buf(&sg[0], info->wi_base, len);
+	bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len);
 	info->wi_base += len;
 
 	ctxt->rw_nents = 1;
 }
 
-/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+/* Build a bvec array that covers part of an xdr_buf's pagelist.
  */
-static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
-				    unsigned int remaining,
-				    struct svc_rdma_rw_ctxt *ctxt)
+static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info,
+				      unsigned int remaining,
+				      struct svc_rdma_rw_ctxt *ctxt)
 {
-	unsigned int sge_no, sge_bytes, page_off, page_no;
+	unsigned int bvec_idx, bvec_len, page_off, page_no;
 	const struct xdr_buf *xdr = info->wi_xdr;
-	struct scatterlist *sg;
 	struct page **page;
 
 	page_off = info->wi_next_off + xdr->page_base;
 	page_no = page_off >> PAGE_SHIFT;
 	page_off = offset_in_page(page_off);
 	page = xdr->pages + page_no;
 	info->wi_next_off += remaining;
-	sg = ctxt->rw_sg_table.sgl;
-	sge_no = 0;
+	bvec_idx = 0;
 	do {
-		sge_bytes = min_t(unsigned int, remaining,
-				  PAGE_SIZE - page_off);
-		sg_set_page(sg, *page, sge_bytes, page_off);
-
-		remaining -= sge_bytes;
-		sg = sg_next(sg);
+		bvec_len = min_t(unsigned int, remaining,
+				 PAGE_SIZE - page_off);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len,
+			      page_off);
+		remaining -= bvec_len;
 		page_off = 0;
-		sge_no++;
+		bvec_idx++;
 		page++;
 	} while (remaining);
 
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = bvec_idx;
 }
 
 /* Construct RDMA Write WRs to send a portion of an xdr_buf containing
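Both helpers used above come from <linux/bvec.h>: bvec_set_page() fills in the page/length/offset triple, and bvec_set_virt() is a convenience wrapper that resolves a kernel virtual address to its backing page. Simplified sketches of what they do (see linux/bvec.h for the real definitions):

    /* Fill one bio_vec entry: which page, how many bytes, starting where. */
    static inline void bvec_set_page_sketch(struct bio_vec *bv,
                                            struct page *page,
                                            unsigned int len,
                                            unsigned int offset)
    {
            bv->bv_page = page;
            bv->bv_len = len;
            bv->bv_offset = offset;
    }

    /* Describe a kernel-virtual buffer by converting the address to
     * its page and in-page offset. */
    static inline void bvec_set_virt_sketch(struct bio_vec *bv, void *vaddr,
                                            unsigned int len)
    {
            bvec_set_page_sketch(bv, virt_to_page(vaddr), len,
                                 offset_in_page(vaddr));
    }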
@@ -496,7 +512,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
 		constructor(info, write_len, ctxt);
 		offset = seg->rs_offset + info->wi_seg_off;
 		ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle,
-					   DMA_TO_DEVICE);
+					   write_len, DMA_TO_DEVICE);
 		if (ret < 0)
 			return -EIO;
 		percpu_counter_inc(&svcrdma_stat_write);
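svc_rdma_build_writes() is generic over where the payload bytes live: the caller supplies a constructor that populates ctxt->rw_bvec and ctxt->rw_nents for one segment, and the shared loop then maps and posts it using the write_len just produced. A userspace-flavored reduction of that dispatch (hypothetical names, mirroring the two call sites in the next hunks):

    /* Each data source supplies only the descriptor-population step;
     * the shared driver handles sizing, mapping, and posting. */
    struct seg_desc {
            const void *base;
            size_t len;
    };

    typedef void (*seg_ctor)(struct seg_desc *desc,
                             const void *src, size_t len);

    static void ctor_from_vec(struct seg_desc *desc,
                              const void *src, size_t len)
    {
            desc->base = src;       /* one contiguous, kvec-style buffer */
            desc->len = len;
    }

    static int build_writes_sketch(const void *src, size_t len,
                                   seg_ctor constructor)
    {
            struct seg_desc desc;

            constructor(&desc, src, len);   /* fill the descriptor */
            /* ... map desc and post the Write WRs here ... */
            return 0;
    }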
@@ -535,7 +551,7 @@ static int svc_rdma_iov_write(struct svc_rdma_write_info *info,
 			      const struct kvec *iov)
 {
 	info->wi_base = iov->iov_base;
-	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec,
 				     iov->iov_len);
 }
 
@@ -559,7 +575,7 @@ static int svc_rdma_pages_write(struct svc_rdma_write_info *info,
 {
 	info->wi_xdr = xdr;
 	info->wi_next_off = offset - xdr->head[0].iov_len;
-	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec,
 				     length);
 }
 
@@ -734,29 +750,29 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
 {
 	struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
 	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
-	unsigned int sge_no, seg_len, len;
+	unsigned int bvec_idx, nr_bvec, seg_len, len, total;
 	struct svc_rdma_rw_ctxt *ctxt;
-	struct scatterlist *sg;
 	int ret;
 
 	len = segment->rs_length;
-	sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT;
-	ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no);
+	if (check_add_overflow(head->rc_pageoff, len, &total))
+		return -EINVAL;
+	nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
+	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
 	if (!ctxt)
 		return -ENOMEM;
-	ctxt->rw_nents = sge_no;
+	ctxt->rw_nents = nr_bvec;
 
-	sg = ctxt->rw_sg_table.sgl;
-	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+	for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) {
 		seg_len = min_t(unsigned int, len,
 				PAGE_SIZE - head->rc_pageoff);
 
 		if (!head->rc_pageoff)
 			head->rc_page_count++;
 
-		sg_set_page(sg, rqstp->rq_pages[head->rc_curpage],
-			    seg_len, head->rc_pageoff);
-		sg = sg_next(sg);
+		bvec_set_page(&ctxt->rw_bvec[bvec_idx],
+			      rqstp->rq_pages[head->rc_curpage],
+			      seg_len, head->rc_pageoff);
 
 		head->rc_pageoff += seg_len;
 		if (head->rc_pageoff == PAGE_SIZE) {
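check_add_overflow() (from <linux/overflow.h>, added to the includes at the top of this patch) evaluates to true when the sum wraps, closing a hole where the old PAGE_ALIGN(head->rc_pageoff + len) could wrap for an oversized client-supplied rs_length and under-size the bvec array. A standalone sketch of the checked round-up (hypothetical helper):

    #include <linux/overflow.h>

    /* Compute how many pages cover @len bytes starting at @offset,
     * failing instead of wrapping when offset + len overflows. */
    static int pages_needed(unsigned int offset, unsigned int len,
                            unsigned int *out)
    {
            unsigned int total;

            if (check_add_overflow(offset, len, &total))
                    return -EINVAL;
            *out = PAGE_ALIGN(total) >> PAGE_SHIFT;
            return 0;
    }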
@@ -770,7 +786,8 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
 	}
 
 	ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
-				   segment->rs_handle, DMA_FROM_DEVICE);
+				   segment->rs_handle, segment->rs_length,
+				   DMA_FROM_DEVICE);
 	if (ret < 0)
 		return -EIO;
 	percpu_counter_inc(&svcrdma_stat_read);