Skip to content

Commit f43fdeb

Browse files
committed
Merge branch 'loop-aio-nowait' into for-6.19/block
Merge async IO IOCB_NOWAIT support from Ming: "This patchset improves loop aio performance by using IOCB_NOWAIT to avoid queuing aio commands to workqueue context, and meanwhile refactors lo_rw_aio() a bit. In my test VM, loop disk performance becomes very close to that of the backing block device (nvme / mq virtio-scsi). And Mikulas verified that this approach can improve 12-job sequential read/write IO by ~5X, and basically solves the reported problem together with the loop MQ change. https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/ Zhaoyang Huang also mentioned it may fix their performance issue on an Android use case. The loop MQ change will be posted as a standalone patch, because it needs a UAPI change." Link: https://lore.kernel.org/linux-block/20251015110735.1361261-1-ming.lei@redhat.com/ Signed-off-by: Jens Axboe <axboe@kernel.dk> * loop-aio-nowait: loop: add hint for handling aio via IOCB_NOWAIT loop: try to handle loop aio command via NOWAIT IO first loop: move command blkcg/memcg initialization into loop_queue_work loop: add lo_submit_rw_aio() loop: add helper lo_rw_aio_prep() loop: add helper lo_cmd_nr_bvec()
2 parents 2c6d792 + 837ed30 commit f43fdeb

1 file changed

Lines changed: 194 additions & 39 deletions

File tree

drivers/block/loop.c

Lines changed: 194 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ struct loop_device {
6868
struct rb_root worker_tree;
6969
struct timer_list timer;
7070
bool sysfs_inited;
71+
unsigned lo_nr_blocking_writes;
7172

7273
struct request_queue *lo_queue;
7374
struct blk_mq_tag_set tag_set;
@@ -90,6 +91,8 @@ struct loop_cmd {
9091
#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
9192
#define LOOP_DEFAULT_HW_Q_DEPTH 128
9293

94+
static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd);
95+
9396
static DEFINE_IDR(loop_index_idr);
9497
static DEFINE_MUTEX(loop_ctl_mutex);
9598
static DEFINE_MUTEX(loop_validate_mutex);
@@ -321,6 +324,15 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
321324

322325
if (!atomic_dec_and_test(&cmd->ref))
323326
return;
327+
328+
/* -EAGAIN could be returned from bdev's ->ki_complete */
329+
if (cmd->ret == -EAGAIN) {
330+
struct loop_device *lo = rq->q->queuedata;
331+
332+
loop_queue_work(lo, cmd);
333+
return;
334+
}
335+
324336
kfree(cmd->bvec);
325337
cmd->bvec = NULL;
326338
if (req_op(rq) == REQ_OP_WRITE)
@@ -337,24 +349,28 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
337349
lo_rw_aio_do_completion(cmd);
338350
}
339351

340-
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
341-
loff_t pos, int rw)
352+
static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
342353
{
343-
struct iov_iter iter;
344-
struct req_iterator rq_iter;
345-
struct bio_vec *bvec;
346354
struct request *rq = blk_mq_rq_from_pdu(cmd);
347-
struct bio *bio = rq->bio;
348-
struct file *file = lo->lo_backing_file;
355+
struct req_iterator rq_iter;
349356
struct bio_vec tmp;
350-
unsigned int offset;
351357
int nr_bvec = 0;
352-
int ret;
353358

354359
rq_for_each_bvec(tmp, rq, rq_iter)
355360
nr_bvec++;
356361

362+
return nr_bvec;
363+
}
364+
365+
static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
366+
unsigned nr_bvec, loff_t pos)
367+
{
368+
struct request *rq = blk_mq_rq_from_pdu(cmd);
369+
357370
if (rq->bio != rq->biotail) {
371+
struct req_iterator rq_iter;
372+
struct bio_vec *bvec;
373+
struct bio_vec tmp;
358374

359375
bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
360376
GFP_NOIO);
@@ -372,24 +388,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
372388
*bvec = tmp;
373389
bvec++;
374390
}
375-
bvec = cmd->bvec;
376-
offset = 0;
377391
} else {
378-
/*
379-
* Same here, this bio may be started from the middle of the
380-
* 'bvec' because of bio splitting, so offset from the bvec
381-
* must be passed to iov iterator
382-
*/
383-
offset = bio->bi_iter.bi_bvec_done;
384-
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
392+
cmd->bvec = NULL;
385393
}
386-
atomic_set(&cmd->ref, 2);
387-
388-
iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
389-
iter.iov_offset = offset;
390394

391395
cmd->iocb.ki_pos = pos;
392-
cmd->iocb.ki_filp = file;
396+
cmd->iocb.ki_filp = lo->lo_backing_file;
393397
cmd->iocb.ki_ioprio = req_get_ioprio(rq);
394398
if (cmd->use_aio) {
395399
cmd->iocb.ki_complete = lo_rw_aio_complete;
@@ -398,6 +402,35 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
398402
cmd->iocb.ki_complete = NULL;
399403
cmd->iocb.ki_flags = 0;
400404
}
405+
return 0;
406+
}
407+
408+
static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
409+
int nr_bvec, int rw)
410+
{
411+
struct request *rq = blk_mq_rq_from_pdu(cmd);
412+
struct file *file = lo->lo_backing_file;
413+
struct iov_iter iter;
414+
int ret;
415+
416+
if (cmd->bvec) {
417+
iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
418+
iter.iov_offset = 0;
419+
} else {
420+
struct bio *bio = rq->bio;
421+
struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec,
422+
bio->bi_iter);
423+
424+
/*
425+
* Same here, this bio may be started from the middle of the
426+
* 'bvec' because of bio splitting, so offset from the bvec
427+
* must be passed to iov iterator
428+
*/
429+
iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
430+
iter.iov_offset = bio->bi_iter.bi_bvec_done;
431+
}
432+
atomic_set(&cmd->ref, 2);
433+
401434

402435
if (rw == ITER_SOURCE) {
403436
kiocb_start_write(&cmd->iocb);
@@ -406,12 +439,84 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
406439
ret = file->f_op->read_iter(&cmd->iocb, &iter);
407440

408441
lo_rw_aio_do_completion(cmd);
442+
return ret;
443+
}
444+
445+
static bool lo_backfile_support_nowait(const struct loop_device *lo)
446+
{
447+
return lo->lo_backing_file->f_mode & FMODE_NOWAIT;
448+
}
409449

450+
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
451+
loff_t pos, int rw)
452+
{
453+
int nr_bvec = lo_cmd_nr_bvec(cmd);
454+
int ret;
455+
456+
/* prepared already if we have tried nowait */
457+
if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) {
458+
ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
459+
if (unlikely(ret))
460+
goto fail;
461+
}
462+
463+
cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
464+
ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
465+
fail:
410466
if (ret != -EIOCBQUEUED)
411467
lo_rw_aio_complete(&cmd->iocb, ret);
412468
return -EIOCBQUEUED;
413469
}
414470

471+
static inline bool lo_aio_try_nowait(struct loop_device *lo,
472+
struct loop_cmd *cmd)
473+
{
474+
struct file *file = lo->lo_backing_file;
475+
struct inode *inode = file->f_mapping->host;
476+
struct request *rq = blk_mq_rq_from_pdu(cmd);
477+
478+
/* NOWAIT works fine for backing block device */
479+
if (S_ISBLK(inode->i_mode))
480+
return true;
481+
482+
/*
483+
* NOWAIT is supposed to be fine for READ without contending with
484+
* blocking WRITE
485+
*/
486+
if (req_op(rq) == REQ_OP_READ)
487+
return true;
488+
489+
/*
490+
* If there is any queued non-NOWAIT async WRITE , don't try new
491+
* NOWAIT WRITE for avoiding contention
492+
*
493+
* Here we focus on handling stable FS block mapping via NOWAIT
494+
*/
495+
return READ_ONCE(lo->lo_nr_blocking_writes) == 0;
496+
}
497+
498+
static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd,
499+
int rw)
500+
{
501+
struct request *rq = blk_mq_rq_from_pdu(cmd);
502+
loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
503+
int nr_bvec = lo_cmd_nr_bvec(cmd);
504+
int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
505+
506+
if (unlikely(ret))
507+
goto fail;
508+
509+
if (!lo_aio_try_nowait(lo, cmd))
510+
return -EAGAIN;
511+
512+
cmd->iocb.ki_flags |= IOCB_NOWAIT;
513+
ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
514+
fail:
515+
if (ret != -EIOCBQUEUED && ret != -EAGAIN)
516+
lo_rw_aio_complete(&cmd->iocb, ret);
517+
return ret;
518+
}
519+
415520
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
416521
{
417522
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
@@ -706,12 +811,19 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
706811
return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
707812
}
708813

814+
static ssize_t loop_attr_nr_blocking_writes_show(struct loop_device *lo,
815+
char *buf)
816+
{
817+
return sysfs_emit(buf, "%u\n", lo->lo_nr_blocking_writes);
818+
}
819+
709820
LOOP_ATTR_RO(backing_file);
710821
LOOP_ATTR_RO(offset);
711822
LOOP_ATTR_RO(sizelimit);
712823
LOOP_ATTR_RO(autoclear);
713824
LOOP_ATTR_RO(partscan);
714825
LOOP_ATTR_RO(dio);
826+
LOOP_ATTR_RO(nr_blocking_writes);
715827

716828
static struct attribute *loop_attrs[] = {
717829
&loop_attr_backing_file.attr,
@@ -720,6 +832,7 @@ static struct attribute *loop_attrs[] = {
720832
&loop_attr_autoclear.attr,
721833
&loop_attr_partscan.attr,
722834
&loop_attr_dio.attr,
835+
&loop_attr_nr_blocking_writes.attr,
723836
NULL,
724837
};
725838

@@ -795,13 +908,48 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
795908
}
796909
#endif
797910

911+
static inline void loop_inc_blocking_writes(struct loop_device *lo,
912+
struct loop_cmd *cmd)
913+
{
914+
lockdep_assert_held(&lo->lo_work_lock);
915+
916+
if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
917+
lo->lo_nr_blocking_writes += 1;
918+
}
919+
920+
static inline void loop_dec_blocking_writes(struct loop_device *lo,
921+
struct loop_cmd *cmd)
922+
{
923+
lockdep_assert_held(&lo->lo_work_lock);
924+
925+
if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
926+
lo->lo_nr_blocking_writes -= 1;
927+
}
928+
798929
static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
799930
{
931+
struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd);
800932
struct rb_node **node, *parent = NULL;
801933
struct loop_worker *cur_worker, *worker = NULL;
802934
struct work_struct *work;
803935
struct list_head *cmd_list;
804936

937+
/* always use the first bio's css */
938+
cmd->blkcg_css = NULL;
939+
cmd->memcg_css = NULL;
940+
#ifdef CONFIG_BLK_CGROUP
941+
if (rq->bio) {
942+
cmd->blkcg_css = bio_blkcg_css(rq->bio);
943+
#ifdef CONFIG_MEMCG
944+
if (cmd->blkcg_css) {
945+
cmd->memcg_css =
946+
cgroup_get_e_css(cmd->blkcg_css->cgroup,
947+
&memory_cgrp_subsys);
948+
}
949+
#endif
950+
}
951+
#endif
952+
805953
spin_lock_irq(&lo->lo_work_lock);
806954

807955
if (queue_on_root_worker(cmd->blkcg_css))
@@ -860,6 +1008,8 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
8601008
work = &lo->rootcg_work;
8611009
cmd_list = &lo->rootcg_cmd_list;
8621010
}
1011+
if (cmd->use_aio)
1012+
loop_inc_blocking_writes(lo, cmd);
8631013
list_add_tail(&cmd->list_entry, cmd_list);
8641014
queue_work(lo->workqueue, work);
8651015
spin_unlock_irq(&lo->lo_work_lock);
@@ -1856,6 +2006,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
18562006
struct request *rq = bd->rq;
18572007
struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
18582008
struct loop_device *lo = rq->q->queuedata;
2009+
int rw = 0;
18592010

18602011
blk_mq_start_request(rq);
18612012

@@ -1868,26 +2019,27 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
18682019
case REQ_OP_WRITE_ZEROES:
18692020
cmd->use_aio = false;
18702021
break;
1871-
default:
2022+
case REQ_OP_READ:
2023+
rw = ITER_DEST;
2024+
cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
2025+
break;
2026+
case REQ_OP_WRITE:
2027+
rw = ITER_SOURCE;
18722028
cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
18732029
break;
2030+
default:
2031+
return BLK_STS_IOERR;
18742032
}
18752033

1876-
/* always use the first bio's css */
1877-
cmd->blkcg_css = NULL;
1878-
cmd->memcg_css = NULL;
1879-
#ifdef CONFIG_BLK_CGROUP
1880-
if (rq->bio) {
1881-
cmd->blkcg_css = bio_blkcg_css(rq->bio);
1882-
#ifdef CONFIG_MEMCG
1883-
if (cmd->blkcg_css) {
1884-
cmd->memcg_css =
1885-
cgroup_get_e_css(cmd->blkcg_css->cgroup,
1886-
&memory_cgrp_subsys);
1887-
}
1888-
#endif
2034+
/* try NOWAIT if the backing file supports the mode */
2035+
if (cmd->use_aio && lo_backfile_support_nowait(lo)) {
2036+
int res = lo_rw_aio_nowait(lo, cmd, rw);
2037+
2038+
if (res != -EAGAIN && res != -EOPNOTSUPP)
2039+
return BLK_STS_OK;
2040+
/* fallback to workqueue for handling aio */
18892041
}
1890-
#endif
2042+
18912043
loop_queue_work(lo, cmd);
18922044

18932045
return BLK_STS_OK;
@@ -1959,6 +2111,8 @@ static void loop_process_work(struct loop_worker *worker,
19592111
cond_resched();
19602112

19612113
spin_lock_irq(&lo->lo_work_lock);
2114+
if (cmd->use_aio)
2115+
loop_dec_blocking_writes(lo, cmd);
19622116
}
19632117

19642118
/*
@@ -2037,7 +2191,8 @@ static int loop_add(int i)
20372191
lo->tag_set.queue_depth = hw_queue_depth;
20382192
lo->tag_set.numa_node = NUMA_NO_NODE;
20392193
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
2040-
lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
2194+
lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT |
2195+
BLK_MQ_F_BLOCKING;
20412196
lo->tag_set.driver_data = lo;
20422197

20432198
err = blk_mq_alloc_tag_set(&lo->tag_set);

0 commit comments

Comments
 (0)