@@ -68,6 +68,7 @@ struct loop_device {
6868 struct rb_root worker_tree ;
6969 struct timer_list timer ;
7070 bool sysfs_inited ;
71+	unsigned		lo_nr_blocking_writes;
7172
7273 struct request_queue * lo_queue ;
7374 struct blk_mq_tag_set tag_set ;
@@ -90,6 +91,8 @@ struct loop_cmd {
9091#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
9192#define LOOP_DEFAULT_HW_Q_DEPTH 128
9293
94+ static void loop_queue_work (struct loop_device * lo , struct loop_cmd * cmd );
95+
9396static DEFINE_IDR (loop_index_idr );
9497static DEFINE_MUTEX (loop_ctl_mutex );
9598static DEFINE_MUTEX (loop_validate_mutex );
@@ -321,6 +324,15 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
321324
322325 if (!atomic_dec_and_test (& cmd -> ref ))
323326 return ;
327+
328+ /* -EAGAIN could be returned from bdev's ->ki_complete */
329+	if (cmd->ret == -EAGAIN) {
330+		struct loop_device *lo = rq->q->queuedata;
331+
332+		loop_queue_work(lo, cmd);
333+		return;
334+	}
335+
324336 kfree (cmd -> bvec );
325337 cmd -> bvec = NULL ;
326338 if (req_op (rq ) == REQ_OP_WRITE )
@@ -337,24 +349,28 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
337349 lo_rw_aio_do_completion (cmd );
338350}
339351
340-static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
341-		loff_t pos, int rw)
352+static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
342 353 {
343-	struct iov_iter iter;
344-	struct req_iterator rq_iter;
345-	struct bio_vec *bvec;
346 354 	struct request *rq = blk_mq_rq_from_pdu(cmd);
347-	struct bio *bio = rq->bio;
348-	struct file *file = lo->lo_backing_file;
355+	struct req_iterator rq_iter;
349 356 	struct bio_vec tmp;
350-	unsigned int offset;
351 357 	int nr_bvec = 0;
352-	int ret;
353 358
354 359 	rq_for_each_bvec(tmp, rq, rq_iter)
355 360 		nr_bvec++;
356 361
362+	return nr_bvec;
363+}
364+
365+static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
366+		unsigned nr_bvec, loff_t pos)
367+{
368+	struct request *rq = blk_mq_rq_from_pdu(cmd);
369+
357 370 	if (rq->bio != rq->biotail) {
371+		struct req_iterator rq_iter;
372+		struct bio_vec *bvec;
373+		struct bio_vec tmp;
358374
359375 bvec = kmalloc_array (nr_bvec , sizeof (struct bio_vec ),
360376 GFP_NOIO );
@@ -372,24 +388,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
372388 * bvec = tmp ;
373389 bvec ++ ;
374390 }
375- bvec = cmd -> bvec ;
376- offset = 0 ;
377391 } else {
378- /*
379- * Same here, this bio may be started from the middle of the
380- * 'bvec' because of bio splitting, so offset from the bvec
381- * must be passed to iov iterator
382- */
383- offset = bio -> bi_iter .bi_bvec_done ;
384- bvec = __bvec_iter_bvec (bio -> bi_io_vec , bio -> bi_iter );
392+ cmd -> bvec = NULL ;
385393 }
386- atomic_set (& cmd -> ref , 2 );
387-
388- iov_iter_bvec (& iter , rw , bvec , nr_bvec , blk_rq_bytes (rq ));
389- iter .iov_offset = offset ;
390394
391395 cmd -> iocb .ki_pos = pos ;
392- cmd -> iocb .ki_filp = file ;
396+ cmd -> iocb .ki_filp = lo -> lo_backing_file ;
393397 cmd -> iocb .ki_ioprio = req_get_ioprio (rq );
394398 if (cmd -> use_aio ) {
395399 cmd -> iocb .ki_complete = lo_rw_aio_complete ;
@@ -398,6 +402,35 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
398402 cmd -> iocb .ki_complete = NULL ;
399403 cmd -> iocb .ki_flags = 0 ;
400404 }
405+ return 0 ;
406+ }
407+
408+static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
409+		int nr_bvec, int rw)
410+{
411+	struct request *rq = blk_mq_rq_from_pdu(cmd);
412+	struct file *file = lo->lo_backing_file;
413+	struct iov_iter iter;
414+	int ret;
415+
416+	if (cmd->bvec) {
417+		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
418+		iter.iov_offset = 0;
419+	} else {
420+		struct bio *bio = rq->bio;
421+		struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec,
422+				bio->bi_iter);
423+
424+		/*
425+		 * Same here, this bio may be started from the middle of the
426+		 * 'bvec' because of bio splitting, so offset from the bvec
427+		 * must be passed to iov iterator
428+		 */
429+		iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
430+		iter.iov_offset = bio->bi_iter.bi_bvec_done;
431+	}
432+	atomic_set(&cmd->ref, 2);
433+
401434
402435 if (rw == ITER_SOURCE ) {
403436 kiocb_start_write (& cmd -> iocb );
@@ -406,12 +439,84 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
406439 ret = file -> f_op -> read_iter (& cmd -> iocb , & iter );
407440
408441 lo_rw_aio_do_completion (cmd );
442+ return ret ;
443+ }
444+
445+static bool lo_backfile_support_nowait(const struct loop_device *lo)
446+{
447+	return lo->lo_backing_file->f_mode & FMODE_NOWAIT;
448+}
409449
450+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
451+		loff_t pos, int rw)
452+{
453+	int nr_bvec = lo_cmd_nr_bvec(cmd);
454+	int ret;
455+
456+	/* prepared already if we have tried nowait */
457+	if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) {
458+		ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
459+		if (unlikely(ret))
460+			goto fail;
461+	}
462+
463+	cmd->iocb.ki_flags &= ~IOCB_NOWAIT;
464+	ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
465+fail:
410 466 	if (ret != -EIOCBQUEUED)
411 467 		lo_rw_aio_complete(&cmd->iocb, ret);
412 468 	return -EIOCBQUEUED;
413 469 }
414470
471+static inline bool lo_aio_try_nowait(struct loop_device *lo,
472+		struct loop_cmd *cmd)
473+{
474+	struct file *file = lo->lo_backing_file;
475+	struct inode *inode = file->f_mapping->host;
476+	struct request *rq = blk_mq_rq_from_pdu(cmd);
477+
478+	/* NOWAIT works fine for backing block device */
479+	if (S_ISBLK(inode->i_mode))
480+		return true;
481+
482+	/*
483+	 * NOWAIT is supposed to be fine for READ without contending with
484+	 * blocking WRITE
485+	 */
486+	if (req_op(rq) == REQ_OP_READ)
487+		return true;
488+
489+	/*
490+	 * If there is any queued non-NOWAIT async WRITE, don't try new
491+	 * NOWAIT WRITE for avoiding contention
492+	 *
493+	 * Here we focus on handling stable FS block mapping via NOWAIT
494+	 */
495+	return READ_ONCE(lo->lo_nr_blocking_writes) == 0;
496+}
497+
498+static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd,
499+		int rw)
500+{
501+	struct request *rq = blk_mq_rq_from_pdu(cmd);
502+	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
503+	int nr_bvec = lo_cmd_nr_bvec(cmd);
504+	int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
505+
506+	if (unlikely(ret))
507+		goto fail;
508+
509+	if (!lo_aio_try_nowait(lo, cmd))
510+		return -EAGAIN;
511+
512+	cmd->iocb.ki_flags |= IOCB_NOWAIT;
513+	ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw);
514+fail:
515+	if (ret != -EIOCBQUEUED && ret != -EAGAIN)
516+		lo_rw_aio_complete(&cmd->iocb, ret);
517+	return ret;
518+}
519+
415520static int do_req_filebacked (struct loop_device * lo , struct request * rq )
416521{
417522 struct loop_cmd * cmd = blk_mq_rq_to_pdu (rq );
@@ -706,12 +811,19 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
706811 return sysfs_emit (buf , "%s\n" , dio ? "1" : "0" );
707812}
708813
814+ static ssize_t loop_attr_nr_blocking_writes_show (struct loop_device * lo ,
815+ char * buf )
816+ {
817+ return sysfs_emit (buf , "%u\n" , lo -> lo_nr_blocking_writes );
818+ }
819+
709820LOOP_ATTR_RO (backing_file );
710821LOOP_ATTR_RO (offset );
711822LOOP_ATTR_RO (sizelimit );
712823LOOP_ATTR_RO (autoclear );
713824LOOP_ATTR_RO (partscan );
714825LOOP_ATTR_RO (dio );
826+ LOOP_ATTR_RO (nr_blocking_writes );
715827
716828static struct attribute * loop_attrs [] = {
717829 & loop_attr_backing_file .attr ,
@@ -720,6 +832,7 @@ static struct attribute *loop_attrs[] = {
720832 & loop_attr_autoclear .attr ,
721833 & loop_attr_partscan .attr ,
722834 & loop_attr_dio .attr ,
835+ & loop_attr_nr_blocking_writes .attr ,
723836 NULL ,
724837};
725838
@@ -795,13 +908,48 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
795908}
796909#endif
797910
911+static inline void loop_inc_blocking_writes(struct loop_device *lo,
912+		struct loop_cmd *cmd)
913+{
914+	lockdep_assert_held(&lo->lo_work_lock);
915+
916+	if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
917+		lo->lo_nr_blocking_writes += 1;
918+}
919+
920+static inline void loop_dec_blocking_writes(struct loop_device *lo,
921+		struct loop_cmd *cmd)
922+{
923+	lockdep_assert_held(&lo->lo_work_lock);
924+
925+	if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE)
926+		lo->lo_nr_blocking_writes -= 1;
927+}
928+
798929static void loop_queue_work (struct loop_device * lo , struct loop_cmd * cmd )
799930{
931+ struct request __maybe_unused * rq = blk_mq_rq_from_pdu (cmd );
800932 struct rb_node * * node , * parent = NULL ;
801933 struct loop_worker * cur_worker , * worker = NULL ;
802934 struct work_struct * work ;
803935 struct list_head * cmd_list ;
804936
937+	/* always use the first bio's css */
938+	cmd->blkcg_css = NULL;
939+	cmd->memcg_css = NULL;
940+#ifdef CONFIG_BLK_CGROUP
941+	if (rq->bio) {
942+		cmd->blkcg_css = bio_blkcg_css(rq->bio);
943+#ifdef CONFIG_MEMCG
944+		if (cmd->blkcg_css) {
945+			cmd->memcg_css =
946+				cgroup_get_e_css(cmd->blkcg_css->cgroup,
947+						&memory_cgrp_subsys);
948+		}
949+#endif
950+	}
951+#endif
952+
805953 spin_lock_irq (& lo -> lo_work_lock );
806954
807955 if (queue_on_root_worker (cmd -> blkcg_css ))
@@ -860,6 +1008,8 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
8601008 work = & lo -> rootcg_work ;
8611009 cmd_list = & lo -> rootcg_cmd_list ;
8621010 }
1011+ if (cmd -> use_aio )
1012+ loop_inc_blocking_writes (lo , cmd );
8631013 list_add_tail (& cmd -> list_entry , cmd_list );
8641014 queue_work (lo -> workqueue , work );
8651015 spin_unlock_irq (& lo -> lo_work_lock );
@@ -1856,6 +2006,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
18562006 struct request * rq = bd -> rq ;
18572007 struct loop_cmd * cmd = blk_mq_rq_to_pdu (rq );
18582008 struct loop_device * lo = rq -> q -> queuedata ;
2009+ int rw = 0 ;
18592010
18602011 blk_mq_start_request (rq );
18612012
@@ -1868,26 +2019,27 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
18682019 case REQ_OP_WRITE_ZEROES :
18692020 cmd -> use_aio = false;
18702021 break ;
1871- default :
2022+	case REQ_OP_READ:
2023+		rw = ITER_DEST;
2024+		cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
2025+		break;
2026+	case REQ_OP_WRITE:
2027+		rw = ITER_SOURCE;
1872 2028 		cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
1873 2029 		break;
2030+	default:
2031+		return BLK_STS_IOERR;
18742032 }
18752033
1876- /* always use the first bio's css */
1877- cmd -> blkcg_css = NULL ;
1878- cmd -> memcg_css = NULL ;
1879- #ifdef CONFIG_BLK_CGROUP
1880- if (rq -> bio ) {
1881- cmd -> blkcg_css = bio_blkcg_css (rq -> bio );
1882- #ifdef CONFIG_MEMCG
1883- if (cmd -> blkcg_css ) {
1884- cmd -> memcg_css =
1885- cgroup_get_e_css (cmd -> blkcg_css -> cgroup ,
1886- & memory_cgrp_subsys );
1887- }
1888- #endif
2034+	/* try NOWAIT if the backing file supports the mode */
2035+	if (cmd->use_aio && lo_backfile_support_nowait(lo)) {
2036+		int res = lo_rw_aio_nowait(lo, cmd, rw);
2037+
2038+		if (res != -EAGAIN && res != -EOPNOTSUPP)
2039+			return BLK_STS_OK;
2040+		/* fallback to workqueue for handling aio */
1889 2041 	}
1890- #endif
2042+
18912043 loop_queue_work (lo , cmd );
18922044
18932045 return BLK_STS_OK ;
@@ -1959,6 +2111,8 @@ static void loop_process_work(struct loop_worker *worker,
19592111 cond_resched ();
19602112
19612113 spin_lock_irq (& lo -> lo_work_lock );
2114+ if (cmd -> use_aio )
2115+ loop_dec_blocking_writes (lo , cmd );
19622116 }
19632117
19642118 /*
@@ -2037,7 +2191,8 @@ static int loop_add(int i)
20372191 lo -> tag_set .queue_depth = hw_queue_depth ;
20382192 lo -> tag_set .numa_node = NUMA_NO_NODE ;
20392193 lo -> tag_set .cmd_size = sizeof (struct loop_cmd );
2040-	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT;
2194+	lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT |
2195+			BLK_MQ_F_BLOCKING;
20412196 lo -> tag_set .driver_data = lo ;
20422197
20432198 err = blk_mq_alloc_tag_set (& lo -> tag_set );
0 commit comments