Skip to content

Commit bea28ac

Browse files
chucklever authored and leon committed
RDMA/core: add MR support for bvec-based RDMA operations
The bvec-based RDMA API currently returns -EOPNOTSUPP when Memory Region registration is required. This prevents iWARP devices from using the bvec path, since iWARP requires MR registration for RDMA READ operations. The force_mr debug parameter is also unusable with bvec input. Add rdma_rw_init_mr_wrs_bvec() to handle MR registration for bvec arrays. The approach creates a synthetic scatterlist populated with DMA addresses from the bvecs, then reuses the existing ib_map_mr_sg() infrastructure. This avoids driver changes while keeping the implementation small. The synthetic scatterlist is stored in the rdma_rw_ctx for cleanup. On destroy, the MRs are returned to the pool and the bvec DMA mappings are released using the stored addresses. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Link: https://patch.msgid.link/20260128005400.25147-4-cel@kernel.org Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent 853e892 commit bea28ac

2 files changed

Lines changed: 154 additions & 36 deletions

File tree

  • drivers/infiniband/core
  • include/rdma

drivers/infiniband/core/rw.c

Lines changed: 153 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,36 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
122122
return count;
123123
}
124124

125+
static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg,
126+
struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num,
127+
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
128+
{
129+
if (prev) {
130+
if (reg->mr->need_inval)
131+
prev->wr.wr.next = &reg->inv_wr;
132+
else
133+
prev->wr.wr.next = &reg->reg_wr.wr;
134+
}
135+
136+
reg->reg_wr.wr.next = &reg->wr.wr;
137+
138+
reg->wr.wr.sg_list = &reg->sge;
139+
reg->wr.wr.num_sge = 1;
140+
reg->wr.remote_addr = remote_addr;
141+
reg->wr.rkey = rkey;
142+
143+
if (dir == DMA_TO_DEVICE) {
144+
reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
145+
} else if (!rdma_cap_read_inv(qp->device, port_num)) {
146+
reg->wr.wr.opcode = IB_WR_RDMA_READ;
147+
} else {
148+
reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
149+
reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
150+
}
151+
152+
return 1;
153+
}
154+
125155
static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
126156
u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
127157
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@@ -147,30 +177,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
147177
if (ret < 0)
148178
goto out_free;
149179
count += ret;
150-
151-
if (prev) {
152-
if (reg->mr->need_inval)
153-
prev->wr.wr.next = &reg->inv_wr;
154-
else
155-
prev->wr.wr.next = &reg->reg_wr.wr;
156-
}
157-
158-
reg->reg_wr.wr.next = &reg->wr.wr;
159-
160-
reg->wr.wr.sg_list = &reg->sge;
161-
reg->wr.wr.num_sge = 1;
162-
reg->wr.remote_addr = remote_addr;
163-
reg->wr.rkey = rkey;
164-
if (dir == DMA_TO_DEVICE) {
165-
reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
166-
} else if (!rdma_cap_read_inv(qp->device, port_num)) {
167-
reg->wr.wr.opcode = IB_WR_RDMA_READ;
168-
} else {
169-
reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
170-
reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
171-
}
172-
count++;
173-
180+
count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
181+
remote_addr, rkey, dir);
174182
remote_addr += reg->sge.length;
175183
sg_cnt -= nents;
176184
for (j = 0; j < nents; j++)
@@ -193,6 +201,92 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
193201
return ret;
194202
}
195203

204+
/*
 * Set up a chain of MR registrations and RDMA WRs for a bio_vec array.
 *
 * A synthetic scatterlist is built from the bvecs and DMA-mapped, then
 * carved into per-MR chunks of at most @pages_per_mr entries so the
 * existing ib_map_mr_sg() infrastructure can be reused unchanged. The
 * scatterlist is kept in ctx->reg[0].sgt so that
 * rdma_rw_ctx_destroy_bvec() can unmap and free it.
 *
 * Returns the total number of WQEs needed on success, or a negative
 * errno; on failure all MRs, DMA mappings, and allocations made here
 * are released.
 */
static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct rdma_rw_reg_ctx *prev = NULL;
	u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en);
	struct scatterlist *sg;
	int i, ret, count = 0;
	u32 nents = 0;

	ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr),
			   sizeof(*ctx->reg), GFP_KERNEL);
	if (!ctx->reg)
		return -ENOMEM;

	/*
	 * Build scatterlist from bvecs using the iterator. This follows
	 * the pattern from __blk_rq_map_sg.
	 */
	ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec,
					    sizeof(*ctx->reg[0].sgt.sgl),
					    GFP_KERNEL);
	if (!ctx->reg[0].sgt.sgl) {
		ret = -ENOMEM;
		goto out_free_reg;
	}
	sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec);

	for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter);

		/* Guard against an iter that spans more than nr_bvec entries. */
		if (nents >= nr_bvec) {
			ret = -EINVAL;
			goto out_free_sgl;
		}
		sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset);
		bvec_iter_advance(bvecs, iter, bv.bv_len);
		nents++;
	}
	sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents));
	ctx->reg[0].sgt.orig_nents = nents;

	/* DMA map the scatterlist */
	ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
	if (ret)
		goto out_free_sgl;

	/* nents may have shrunk if the IOMMU coalesced segments. */
	ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr);

	sg = ctx->reg[0].sgt.sgl;
	nents = ctx->reg[0].sgt.nents;
	for (i = 0; i < ctx->nr_ops; i++) {
		struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
		u32 sge_cnt = min(nents, pages_per_mr);

		ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0);
		if (ret < 0)
			goto out_free_mrs;
		count += ret;
		count += rdma_rw_init_reg_wr(reg, prev, qp, port_num,
					     remote_addr, rkey, dir);
		remote_addr += reg->sge.length;
		nents -= sge_cnt;
		sg += sge_cnt;
		prev = reg;
	}

	if (prev)
		prev->wr.wr.next = NULL;

	ctx->type = RDMA_RW_MR;
	return count;

out_free_mrs:
	/* Return only the MRs taken so far (indices 0 .. i-1). */
	while (--i >= 0)
		ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
	ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
out_free_sgl:
	kfree(ctx->reg[0].sgt.sgl);
out_free_reg:
	kfree(ctx->reg);
	return ret;
}
289+
196290
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
197291
struct scatterlist *sg, u32 sg_cnt, u32 offset,
198292
u64 remote_addr, u32 rkey, enum dma_data_direction dir)
@@ -547,48 +641,62 @@ EXPORT_SYMBOL(rdma_rw_ctx_init);
547641
* @rkey: remote key to operate on
548642
* @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
549643
*
550-
* Accepts bio_vec arrays directly, avoiding scatterlist conversion for
551-
* callers that already have data in bio_vec form. Prefer this over
552-
* rdma_rw_ctx_init() when the source data is a bio_vec array.
553-
*
554-
* This function does not support devices requiring memory registration.
555-
* iWARP devices and configurations with force_mr=1 should use
556-
* rdma_rw_ctx_init() with a scatterlist instead.
644+
* Maps the bio_vec array directly, avoiding intermediate scatterlist
645+
* conversion. Supports MR registration for iWARP devices and force_mr mode.
557646
*
558647
* Returns the number of WQEs that will be needed on the workqueue if
559648
* successful, or a negative error code:
560649
*
561650
* * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero
562-
* * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1)
563651
* * -ENOMEM - DMA mapping or memory allocation failed
564652
*/
565653
int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec,
		struct bvec_iter iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	int ret;

	if (nr_bvec == 0 || iter.bi_size == 0)
		return -EINVAL;

	/*
	 * iWARP requires MR registration for all RDMA READs. The force_mr
	 * debug option also mandates MR usage. Both cases dispatch to the
	 * same bvec MR path.
	 */
	if ((dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num)) ||
	    unlikely(rdma_rw_force_mr))
		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
						nr_bvec, &iter, remote_addr,
						rkey, dir);

	if (nr_bvec == 1)
		return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
						   remote_addr, rkey, dir);

	/*
	 * Try IOVA-based mapping first for multi-bvec transfers.
	 * IOVA coalesces bvecs into a single DMA-contiguous region,
	 * reducing the number of WRs needed and avoiding MR overhead.
	 */
	ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
					 rkey, dir);
	if (ret != -EOPNOTSUPP)
		return ret;

	/*
	 * IOVA mapping not available. Check if MR registration provides
	 * better performance than multiple SGE entries.
	 */
	if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec))
		return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs,
						nr_bvec, &iter, remote_addr,
						rkey, dir);

	return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
					 remote_addr, rkey, dir);
}
@@ -833,6 +941,8 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
833941

834942
switch (ctx->type) {
835943
case RDMA_RW_MR:
944+
/* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */
945+
WARN_ON_ONCE(ctx->reg[0].sgt.sgl);
836946
for (i = 0; i < ctx->nr_ops; i++)
837947
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
838948
kfree(ctx->reg);
@@ -880,6 +990,13 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
880990
u32 i;
881991

882992
switch (ctx->type) {
993+
case RDMA_RW_MR:
994+
for (i = 0; i < ctx->nr_ops; i++)
995+
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
996+
ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0);
997+
kfree(ctx->reg[0].sgt.sgl);
998+
kfree(ctx->reg);
999+
break;
8831000
case RDMA_RW_IOVA:
8841001
dma_iova_destroy(dev->dma_device, &ctx->iova.state,
8851002
ctx->iova.mapped_len, dir, 0);

include/rdma/rw.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct rdma_rw_ctx {
4747
struct ib_reg_wr reg_wr;
4848
struct ib_send_wr inv_wr;
4949
struct ib_mr *mr;
50+
struct sg_table sgt;
5051
} *reg;
5152
};
5253
};

0 commit comments

Comments
 (0)