@@ -122,6 +122,36 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
122122 return count ;
123123}
124124
125+ static int rdma_rw_init_reg_wr (struct rdma_rw_reg_ctx * reg ,
126+ struct rdma_rw_reg_ctx * prev , struct ib_qp * qp , u32 port_num ,
127+ u64 remote_addr , u32 rkey , enum dma_data_direction dir )
128+ {
129+ if (prev ) {
130+ if (reg -> mr -> need_inval )
131+ prev -> wr .wr .next = & reg -> inv_wr ;
132+ else
133+ prev -> wr .wr .next = & reg -> reg_wr .wr ;
134+ }
135+
136+ reg -> reg_wr .wr .next = & reg -> wr .wr ;
137+
138+ reg -> wr .wr .sg_list = & reg -> sge ;
139+ reg -> wr .wr .num_sge = 1 ;
140+ reg -> wr .remote_addr = remote_addr ;
141+ reg -> wr .rkey = rkey ;
142+
143+ if (dir == DMA_TO_DEVICE ) {
144+ reg -> wr .wr .opcode = IB_WR_RDMA_WRITE ;
145+ } else if (!rdma_cap_read_inv (qp -> device , port_num )) {
146+ reg -> wr .wr .opcode = IB_WR_RDMA_READ ;
147+ } else {
148+ reg -> wr .wr .opcode = IB_WR_RDMA_READ_WITH_INV ;
149+ reg -> wr .wr .ex .invalidate_rkey = reg -> mr -> lkey ;
150+ }
151+
152+ return 1 ;
153+ }
154+
125155static int rdma_rw_init_mr_wrs (struct rdma_rw_ctx * ctx , struct ib_qp * qp ,
126156 u32 port_num , struct scatterlist * sg , u32 sg_cnt , u32 offset ,
127157 u64 remote_addr , u32 rkey , enum dma_data_direction dir )
@@ -147,30 +177,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
147177 if (ret < 0 )
148178 goto out_free ;
149179 count += ret ;
150-
151- if (prev ) {
152- if (reg -> mr -> need_inval )
153- prev -> wr .wr .next = & reg -> inv_wr ;
154- else
155- prev -> wr .wr .next = & reg -> reg_wr .wr ;
156- }
157-
158- reg -> reg_wr .wr .next = & reg -> wr .wr ;
159-
160- reg -> wr .wr .sg_list = & reg -> sge ;
161- reg -> wr .wr .num_sge = 1 ;
162- reg -> wr .remote_addr = remote_addr ;
163- reg -> wr .rkey = rkey ;
164- if (dir == DMA_TO_DEVICE ) {
165- reg -> wr .wr .opcode = IB_WR_RDMA_WRITE ;
166- } else if (!rdma_cap_read_inv (qp -> device , port_num )) {
167- reg -> wr .wr .opcode = IB_WR_RDMA_READ ;
168- } else {
169- reg -> wr .wr .opcode = IB_WR_RDMA_READ_WITH_INV ;
170- reg -> wr .wr .ex .invalidate_rkey = reg -> mr -> lkey ;
171- }
172- count ++ ;
173-
180+ count += rdma_rw_init_reg_wr (reg , prev , qp , port_num ,
181+ remote_addr , rkey , dir );
174182 remote_addr += reg -> sge .length ;
175183 sg_cnt -= nents ;
176184 for (j = 0 ; j < nents ; j ++ )
@@ -193,6 +201,92 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
193201 return ret ;
194202}
195203
204+ static int rdma_rw_init_mr_wrs_bvec (struct rdma_rw_ctx * ctx , struct ib_qp * qp ,
205+ u32 port_num , const struct bio_vec * bvecs , u32 nr_bvec ,
206+ struct bvec_iter * iter , u64 remote_addr , u32 rkey ,
207+ enum dma_data_direction dir )
208+ {
209+ struct ib_device * dev = qp -> pd -> device ;
210+ struct rdma_rw_reg_ctx * prev = NULL ;
211+ u32 pages_per_mr = rdma_rw_fr_page_list_len (dev , qp -> integrity_en );
212+ struct scatterlist * sg ;
213+ int i , ret , count = 0 ;
214+ u32 nents = 0 ;
215+
216+ ctx -> reg = kcalloc (DIV_ROUND_UP (nr_bvec , pages_per_mr ),
217+ sizeof (* ctx -> reg ), GFP_KERNEL );
218+ if (!ctx -> reg )
219+ return - ENOMEM ;
220+
221+ /*
222+ * Build scatterlist from bvecs using the iterator. This follows
223+ * the pattern from __blk_rq_map_sg.
224+ */
225+ ctx -> reg [0 ].sgt .sgl = kmalloc_array (nr_bvec ,
226+ sizeof (* ctx -> reg [0 ].sgt .sgl ),
227+ GFP_KERNEL );
228+ if (!ctx -> reg [0 ].sgt .sgl ) {
229+ ret = - ENOMEM ;
230+ goto out_free_reg ;
231+ }
232+ sg_init_table (ctx -> reg [0 ].sgt .sgl , nr_bvec );
233+
234+ for (sg = ctx -> reg [0 ].sgt .sgl ; iter -> bi_size ; sg = sg_next (sg )) {
235+ struct bio_vec bv = mp_bvec_iter_bvec (bvecs , * iter );
236+
237+ if (nents >= nr_bvec ) {
238+ ret = - EINVAL ;
239+ goto out_free_sgl ;
240+ }
241+ sg_set_page (sg , bv .bv_page , bv .bv_len , bv .bv_offset );
242+ bvec_iter_advance (bvecs , iter , bv .bv_len );
243+ nents ++ ;
244+ }
245+ sg_mark_end (sg_last (ctx -> reg [0 ].sgt .sgl , nents ));
246+ ctx -> reg [0 ].sgt .orig_nents = nents ;
247+
248+ /* DMA map the scatterlist */
249+ ret = ib_dma_map_sgtable_attrs (dev , & ctx -> reg [0 ].sgt , dir , 0 );
250+ if (ret )
251+ goto out_free_sgl ;
252+
253+ ctx -> nr_ops = DIV_ROUND_UP (ctx -> reg [0 ].sgt .nents , pages_per_mr );
254+
255+ sg = ctx -> reg [0 ].sgt .sgl ;
256+ nents = ctx -> reg [0 ].sgt .nents ;
257+ for (i = 0 ; i < ctx -> nr_ops ; i ++ ) {
258+ struct rdma_rw_reg_ctx * reg = & ctx -> reg [i ];
259+ u32 sge_cnt = min (nents , pages_per_mr );
260+
261+ ret = rdma_rw_init_one_mr (qp , port_num , reg , sg , sge_cnt , 0 );
262+ if (ret < 0 )
263+ goto out_free_mrs ;
264+ count += ret ;
265+ count += rdma_rw_init_reg_wr (reg , prev , qp , port_num ,
266+ remote_addr , rkey , dir );
267+ remote_addr += reg -> sge .length ;
268+ nents -= sge_cnt ;
269+ sg += sge_cnt ;
270+ prev = reg ;
271+ }
272+
273+ if (prev )
274+ prev -> wr .wr .next = NULL ;
275+
276+ ctx -> type = RDMA_RW_MR ;
277+ return count ;
278+
279+ out_free_mrs :
280+ while (-- i >= 0 )
281+ ib_mr_pool_put (qp , & qp -> rdma_mrs , ctx -> reg [i ].mr );
282+ ib_dma_unmap_sgtable_attrs (dev , & ctx -> reg [0 ].sgt , dir , 0 );
283+ out_free_sgl :
284+ kfree (ctx -> reg [0 ].sgt .sgl );
285+ out_free_reg :
286+ kfree (ctx -> reg );
287+ return ret ;
288+ }
289+
196290static int rdma_rw_init_map_wrs (struct rdma_rw_ctx * ctx , struct ib_qp * qp ,
197291 struct scatterlist * sg , u32 sg_cnt , u32 offset ,
198292 u64 remote_addr , u32 rkey , enum dma_data_direction dir )
@@ -547,48 +641,62 @@ EXPORT_SYMBOL(rdma_rw_ctx_init);
547641 * @rkey: remote key to operate on
548642 * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
549643 *
550- * Accepts bio_vec arrays directly, avoiding scatterlist conversion for
551- * callers that already have data in bio_vec form. Prefer this over
552- * rdma_rw_ctx_init() when the source data is a bio_vec array.
553- *
554- * This function does not support devices requiring memory registration.
555- * iWARP devices and configurations with force_mr=1 should use
556- * rdma_rw_ctx_init() with a scatterlist instead.
644+ * Accepts a bio_vec array directly so that callers need not build a scatterlist.
645+ * The MR path (iWARP RDMA READs, force_mr mode, or when registration is preferred) builds one internally.
557646 *
558647 * Returns the number of WQEs that will be needed on the workqueue if
559648 * successful, or a negative error code:
560649 *
561650 * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero
562- * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1)
563651 * * -ENOMEM - DMA mapping or memory allocation failed
564652 */
565653int rdma_rw_ctx_init_bvec (struct rdma_rw_ctx * ctx , struct ib_qp * qp ,
566654 u32 port_num , const struct bio_vec * bvecs , u32 nr_bvec ,
567655 struct bvec_iter iter , u64 remote_addr , u32 rkey ,
568656 enum dma_data_direction dir )
569657{
658+ struct ib_device * dev = qp -> pd -> device ;
570659 int ret ;
571660
572661 if (nr_bvec == 0 || iter .bi_size == 0 )
573662 return - EINVAL ;
574663
575- /* MR path not supported for bvec - reject iWARP and force_mr */
576- if (rdma_rw_io_needs_mr (qp -> device , port_num , dir , nr_bvec ))
577- return - EOPNOTSUPP ;
664+ /*
665+ * iWARP requires MR registration for all RDMA READs. The force_mr
666+ * debug option also mandates MR usage.
667+ */
668+ if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp (dev , port_num ))
669+ return rdma_rw_init_mr_wrs_bvec (ctx , qp , port_num , bvecs ,
670+ nr_bvec , & iter , remote_addr ,
671+ rkey , dir );
672+ if (unlikely (rdma_rw_force_mr ))
673+ return rdma_rw_init_mr_wrs_bvec (ctx , qp , port_num , bvecs ,
674+ nr_bvec , & iter , remote_addr ,
675+ rkey , dir );
578676
579677 if (nr_bvec == 1 )
580678 return rdma_rw_init_single_wr_bvec (ctx , qp , bvecs , & iter ,
581679 remote_addr , rkey , dir );
582680
583681 /*
584682 * Try IOVA-based mapping first for multi-bvec transfers.
585- * This reduces IOTLB sync overhead by batching all mappings.
683+ * IOVA coalesces bvecs into a single DMA-contiguous region,
684+ * reducing the number of WRs needed and avoiding MR overhead.
586685 */
587686 ret = rdma_rw_init_iova_wrs_bvec (ctx , qp , bvecs , & iter , remote_addr ,
588687 rkey , dir );
589688 if (ret != - EOPNOTSUPP )
590689 return ret ;
591690
691+ /*
692+ * IOVA mapping not available. Check if MR registration provides
693+ * better performance than multiple SGE entries.
694+ */
695+ if (rdma_rw_io_needs_mr (dev , port_num , dir , nr_bvec ))
696+ return rdma_rw_init_mr_wrs_bvec (ctx , qp , port_num , bvecs ,
697+ nr_bvec , & iter , remote_addr ,
698+ rkey , dir );
699+
592700 return rdma_rw_init_map_wrs_bvec (ctx , qp , bvecs , nr_bvec , & iter ,
593701 remote_addr , rkey , dir );
594702}
@@ -833,6 +941,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
833941
834942 switch (ctx -> type ) {
835943 case RDMA_RW_MR :
944+ /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
945+ WARN_ON_ONCE (ctx -> reg [0 ].sgt .sgl );
836946 for (i = 0 ; i < ctx -> nr_ops ; i ++ )
837947 ib_mr_pool_put (qp , & qp -> rdma_mrs , ctx -> reg [i ].mr );
838948 kfree (ctx -> reg );
@@ -880,6 +990,13 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
880990 u32 i ;
881991
882992 switch (ctx -> type ) {
993+ case RDMA_RW_MR :
994+ for (i = 0 ; i < ctx -> nr_ops ; i ++ )
995+ ib_mr_pool_put (qp , & qp -> rdma_mrs , ctx -> reg [i ].mr );
996+ ib_dma_unmap_sgtable_attrs (dev , & ctx -> reg [0 ].sgt , dir , 0 );
997+ kfree (ctx -> reg [0 ].sgt .sgl );
998+ kfree (ctx -> reg );
999+ break ;
8831000 case RDMA_RW_IOVA :
8841001 dma_iova_destroy (dev -> dma_device , & ctx -> iova .state ,
8851002 ctx -> iova .mapped_len , dir , 0 );
0 commit comments