Skip to content

Commit 853e892

Browse files
chucklever authored and rleon committed
RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
The bvec RDMA API maps each bvec individually via dma_map_phys(), requiring an IOTLB sync for each mapping. For large I/O operations with many bvecs, this overhead becomes significant. The two-step IOVA API (dma_iova_try_alloc / dma_iova_link / dma_iova_sync) allocates a contiguous IOVA range upfront, links all physical pages without IOTLB syncs, then performs a single sync at the end. This reduces IOTLB flushes from O(n) to O(1). It also requires only a single output dma_addr_t compared to extra per-input element storage in struct scatterlist. Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Link: https://patch.msgid.link/20260128005400.25147-3-cel@kernel.org Signed-off-by: Leon Romanovsky <leon@kernel.org>
1 parent 5e54155 commit 853e892

2 files changed

Lines changed: 114 additions & 0 deletions

File tree

  • drivers/infiniband/core
  • include/rdma

drivers/infiniband/core/rw.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ enum {
1414
RDMA_RW_MULTI_WR,
1515
RDMA_RW_MR,
1616
RDMA_RW_SIG_MR,
17+
RDMA_RW_IOVA,
1718
};
1819

1920
static bool rdma_rw_force_mr;
@@ -383,6 +384,87 @@ static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
383384
return -ENOMEM;
384385
}
385386

387+
/*
 * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range.
 * This reduces IOTLB sync overhead by doing one sync at the end instead of
 * one per bvec, and produces a contiguous DMA address range that can be
 * described by a single SGE.
 *
 * On success the context is left in RDMA_RW_IOVA state; teardown goes
 * through the RDMA_RW_IOVA case in rdma_rw_ctx_destroy_bvec(), which
 * calls dma_iova_destroy() with ctx->iova.mapped_len.
 *
 * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA
 * mapping is not available, or another negative error code on failure.
 */
static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx,
		struct ib_qp *qp, const struct bio_vec *bvec,
		struct bvec_iter *iter, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *dev = qp->pd->device;
	struct device *dma_dev = dev->dma_device;
	/* Snapshot the total size now: the link loop below consumes *iter. */
	size_t total_len = iter->bi_size;
	struct bio_vec first_bv;
	size_t mapped_len = 0;
	int ret;

	/* Virtual DMA devices cannot support IOVA allocators */
	if (ib_uses_virt_dma(dev))
		return -EOPNOTSUPP;

	/*
	 * Try to allocate contiguous IOVA space.
	 * NOTE(review): only the first bvec's physical address is passed in;
	 * presumably it serves as an alignment/offset hint for the allocator
	 * rather than implying physical contiguity — confirm against the
	 * dma_iova_try_alloc() contract.
	 */
	first_bv = mp_bvec_iter_bvec(bvec, *iter);
	if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state,
				bvec_phys(&first_bv), total_len))
		return -EOPNOTSUPP;

	/*
	 * Link all bvecs into the IOVA space, without an IOTLB sync per
	 * element. mapped_len doubles as the running IOVA offset of the
	 * next bvec. The caller's iterator ends up fully advanced.
	 */
	while (iter->bi_size) {
		struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter);

		ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv),
				mapped_len, bv.bv_len, dir, 0);
		if (ret)
			goto out_destroy;

		mapped_len += bv.bv_len;
		bvec_iter_advance(bvec, iter, bv.bv_len);
	}

	/* Sync the IOTLB once for all linked pages */
	ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len);
	if (ret)
		goto out_destroy;

	/* Recorded for the dma_iova_destroy() call at context teardown. */
	ctx->iova.mapped_len = mapped_len;

	/* Single SGE covers the entire contiguous IOVA range */
	ctx->iova.sge.addr = ctx->iova.state.addr;
	ctx->iova.sge.length = mapped_len;
	ctx->iova.sge.lkey = qp->pd->local_dma_lkey;

	/* Single WR for the whole transfer */
	memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr));
	if (dir == DMA_TO_DEVICE)
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE;
	else
		ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ;
	ctx->iova.wr.wr.num_sge = 1;
	ctx->iova.wr.wr.sg_list = &ctx->iova.sge;
	ctx->iova.wr.remote_addr = remote_addr;
	ctx->iova.wr.rkey = rkey;

	ctx->type = RDMA_RW_IOVA;
	ctx->nr_ops = 1;
	return 1;

out_destroy:
	/*
	 * dma_iova_destroy() expects the actual mapped length, not the
	 * total allocation size. It unlinks only the successfully linked
	 * range and frees the entire IOVA allocation.
	 */
	dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0);
	return ret;
}
467+
386468
/**
387469
* rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
388470
* @ctx: context to initialize
@@ -485,6 +567,8 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
485567
struct bvec_iter iter, u64 remote_addr, u32 rkey,
486568
enum dma_data_direction dir)
487569
{
570+
int ret;
571+
488572
if (nr_bvec == 0 || iter.bi_size == 0)
489573
return -EINVAL;
490574

@@ -495,6 +579,16 @@ int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
495579
if (nr_bvec == 1)
496580
return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter,
497581
remote_addr, rkey, dir);
582+
583+
/*
584+
* Try IOVA-based mapping first for multi-bvec transfers.
585+
* This reduces IOTLB sync overhead by batching all mappings.
586+
*/
587+
ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr,
588+
rkey, dir);
589+
if (ret != -EOPNOTSUPP)
590+
return ret;
591+
498592
return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter,
499593
remote_addr, rkey, dir);
500594
}
@@ -671,6 +765,10 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
671765
first_wr = &ctx->reg[0].reg_wr.wr;
672766
last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
673767
break;
768+
case RDMA_RW_IOVA:
769+
first_wr = &ctx->iova.wr.wr;
770+
last_wr = &ctx->iova.wr.wr;
771+
break;
674772
case RDMA_RW_MULTI_WR:
675773
first_wr = &ctx->map.wrs[0].wr;
676774
last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
@@ -745,6 +843,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
745843
break;
746844
case RDMA_RW_SINGLE_WR:
747845
break;
846+
case RDMA_RW_IOVA:
847+
/* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */
848+
WARN_ON_ONCE(1);
849+
return;
748850
default:
749851
BUG();
750852
break;
@@ -778,6 +880,10 @@ void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
778880
u32 i;
779881

780882
switch (ctx->type) {
883+
case RDMA_RW_IOVA:
884+
dma_iova_destroy(dev->dma_device, &ctx->iova.state,
885+
ctx->iova.mapped_len, dir, 0);
886+
break;
781887
case RDMA_RW_MULTI_WR:
782888
for (i = 0; i < nr_bvec; i++)
783889
ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr,

include/rdma/rw.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@ struct rdma_rw_ctx {
3232
struct ib_rdma_wr *wrs;
3333
} map;
3434

35+
/* for IOVA-based mapping of bvecs into contiguous DMA range: */
36+
struct {
37+
struct dma_iova_state state;
38+
struct ib_sge sge;
39+
struct ib_rdma_wr wr;
40+
size_t mapped_len;
41+
} iova;
42+
3543
/* for registering multiple WRs: */
3644
struct rdma_rw_reg_ctx {
3745
struct ib_sge sge;

0 commit comments

Comments
 (0)