2020 * Private flags for iomap_dio, must not overlap with the public ones in
2121 * iomap.h:
2222 */
23- #define IOMAP_DIO_WRITE_FUA (1 << 28)
24- #define IOMAP_DIO_NEED_SYNC (1 << 29)
25- #define IOMAP_DIO_WRITE (1 << 30)
26- #define IOMAP_DIO_DIRTY (1 << 31)
23+ #define IOMAP_DIO_CALLER_COMP (1U << 26)
24+ #define IOMAP_DIO_INLINE_COMP (1U << 27)
25+ #define IOMAP_DIO_WRITE_THROUGH (1U << 28)
26+ #define IOMAP_DIO_NEED_SYNC (1U << 29)
27+ #define IOMAP_DIO_WRITE (1U << 30)
28+ #define IOMAP_DIO_DIRTY (1U << 31)
2729
2830struct iomap_dio {
2931 struct kiocb * iocb ;
@@ -41,7 +43,6 @@ struct iomap_dio {
4143 struct {
4244 struct iov_iter * iter ;
4345 struct task_struct * waiter ;
44- struct bio * poll_bio ;
4546 } submit ;
4647
4748 /* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
6364static void iomap_dio_submit_bio (const struct iomap_iter * iter ,
6465 struct iomap_dio * dio , struct bio * bio , loff_t pos )
6566{
67+ struct kiocb * iocb = dio -> iocb ;
68+
6669 atomic_inc (& dio -> ref );
6770
6871 /* Sync dio can't be polled reliably */
69- if ((dio -> iocb -> ki_flags & IOCB_HIPRI ) && !is_sync_kiocb (dio -> iocb )) {
70- bio_set_polled (bio , dio -> iocb );
71- dio -> submit . poll_bio = bio ;
72+ if ((iocb -> ki_flags & IOCB_HIPRI ) && !is_sync_kiocb (iocb )) {
73+ bio_set_polled (bio , iocb );
74+ WRITE_ONCE ( iocb -> private , bio ) ;
7275 }
7376
7477 if (dio -> dops && dio -> dops -> submit_io )
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
130133}
131134EXPORT_SYMBOL_GPL (iomap_dio_complete );
132135
136+ static ssize_t iomap_dio_deferred_complete (void * data )
137+ {
138+ return iomap_dio_complete (data );
139+ }
140+
133141static void iomap_dio_complete_work (struct work_struct * work )
134142{
135143 struct iomap_dio * dio = container_of (work , struct iomap_dio , aio .work );
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
152160{
153161 struct iomap_dio * dio = bio -> bi_private ;
154162 bool should_dirty = (dio -> flags & IOMAP_DIO_DIRTY );
163+ struct kiocb * iocb = dio -> iocb ;
155164
156165 if (bio -> bi_status )
157166 iomap_dio_set_error (dio , blk_status_to_errno (bio -> bi_status ));
167+ if (!atomic_dec_and_test (& dio -> ref ))
168+ goto release_bio ;
158169
159- if (atomic_dec_and_test (& dio -> ref )) {
160- if (dio -> wait_for_completion ) {
161- struct task_struct * waiter = dio -> submit .waiter ;
162- WRITE_ONCE (dio -> submit .waiter , NULL );
163- blk_wake_io_task (waiter );
164- } else if (dio -> flags & IOMAP_DIO_WRITE ) {
165- struct inode * inode = file_inode (dio -> iocb -> ki_filp );
166-
167- WRITE_ONCE (dio -> iocb -> private , NULL );
168- INIT_WORK (& dio -> aio .work , iomap_dio_complete_work );
169- queue_work (inode -> i_sb -> s_dio_done_wq , & dio -> aio .work );
170- } else {
171- WRITE_ONCE (dio -> iocb -> private , NULL );
172- iomap_dio_complete_work (& dio -> aio .work );
173- }
170+ /*
171+ * Synchronous dio: the task itself will handle any completion work
172+ * that is needed after IO. All we need to do is wake the task.
173+ */
174+ if (dio -> wait_for_completion ) {
175+ struct task_struct * waiter = dio -> submit .waiter ;
176+
177+ WRITE_ONCE (dio -> submit .waiter , NULL );
178+ blk_wake_io_task (waiter );
179+ goto release_bio ;
180+ }
181+
182+ /*
183+ * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
184+ */
185+ if (dio -> flags & IOMAP_DIO_INLINE_COMP ) {
186+ WRITE_ONCE (iocb -> private , NULL );
187+ iomap_dio_complete_work (& dio -> aio .work );
188+ goto release_bio ;
189+ }
190+
191+ /*
192+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
193+ * our completion that way to avoid an async punt to a workqueue.
194+ */
195+ if (dio -> flags & IOMAP_DIO_CALLER_COMP ) {
196+ /* only polled IO cares about ->private being cleared */
197+ iocb -> private = dio ;
198+ iocb -> dio_complete = iomap_dio_deferred_complete ;
199+
200+ /*
201+ * Invoke ->ki_complete() directly. We've assigned our
202+ * dio_complete callback handler, and since the issuer set
203+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
204+ * notice ->dio_complete being set and will defer calling that
205+ * handler until it can be done from a safe task context.
206+ *
207+ * Note that the 'res' being passed in here is not important
208+ * for this case. The actual completion value of the request
209+ * will be gotten from dio_complete when that is run by the
210+ * issuer.
211+ */
212+ iocb -> ki_complete (iocb , 0 );
213+ goto release_bio ;
174214 }
175215
216+ /*
217+ * Async DIO completion that requires filesystem level completion work
218+ * gets punted to a work queue to complete as the operation may require
219+ * more IO to be issued to finalise filesystem metadata changes or
220+ * guarantee data integrity.
221+ */
222+ INIT_WORK (& dio -> aio .work , iomap_dio_complete_work );
223+ queue_work (file_inode (iocb -> ki_filp )-> i_sb -> s_dio_done_wq ,
224+ & dio -> aio .work );
225+ release_bio :
176226 if (should_dirty ) {
177227 bio_check_pages_dirty (bio );
178228 } else {
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
203253/*
204254 * Figure out the bio's operation flags from the dio request, the
205255 * mapping, and whether or not we want FUA. Note that we can end up
206- * clearing the WRITE_FUA flag in the dio request.
256+ * clearing the WRITE_THROUGH flag in the dio request.
207257 */
208258static inline blk_opf_t iomap_dio_bio_opflags (struct iomap_dio * dio ,
209259 const struct iomap * iomap , bool use_fua )
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
217267 if (use_fua )
218268 opflags |= REQ_FUA ;
219269 else
220- dio -> flags &= ~IOMAP_DIO_WRITE_FUA ;
270+ dio -> flags &= ~IOMAP_DIO_WRITE_THROUGH ;
221271
222272 return opflags ;
223273}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
257307 * Use a FUA write if we need datasync semantics, this is a pure
258308 * data IO that doesn't require any metadata updates (including
259309 * after IO completion such as unwritten extent conversion) and
260- * the underlying device supports FUA. This allows us to avoid
261- * cache flushes on IO completion.
310+ * the underlying device either supports FUA or doesn't have
311+ * a volatile write cache. This allows us to avoid cache flushes
312+ * on IO completion. If we can't use writethrough and need to
313+ * sync, disable in-task completions as dio completion will
314+ * need to call generic_write_sync() which will do a blocking
315+ * fsync / cache flush call.
262316 */
263317 if (!(iomap -> flags & (IOMAP_F_SHARED |IOMAP_F_DIRTY )) &&
264- (dio -> flags & IOMAP_DIO_WRITE_FUA ) && bdev_fua (iomap -> bdev ))
318+ (dio -> flags & IOMAP_DIO_WRITE_THROUGH ) &&
319+ (bdev_fua (iomap -> bdev ) || !bdev_write_cache (iomap -> bdev )))
265320 use_fua = true;
321+ else if (dio -> flags & IOMAP_DIO_NEED_SYNC )
322+ dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
266323 }
267324
268325 /*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
277334 goto out ;
278335
279336 /*
280- * We can only poll for single bio I/Os.
337+ * We can only do deferred completion for pure overwrites that
338+ * don't require additional IO at completion. This rules out
339+ * writes that need zeroing or extent conversion, extend
340+ * the file size, or issue journal IO or cache flushes
341+ * during completion processing.
281342 */
282343 if (need_zeroout ||
344+ ((dio -> flags & IOMAP_DIO_NEED_SYNC ) && !use_fua ) ||
283345 ((dio -> flags & IOMAP_DIO_WRITE ) && pos >= i_size_read (inode )))
346+ dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
347+
348+ /*
349+ * The rules for polled IO completions follow the same guidelines as
350+ * the ones we set for inline and deferred completions. If none of those
351+ * are available for this IO, clear the polled flag.
352+ */
353+ if (!(dio -> flags & (IOMAP_DIO_INLINE_COMP |IOMAP_DIO_CALLER_COMP )))
284354 dio -> iocb -> ki_flags &= ~IOCB_HIPRI ;
285355
286356 if (need_zeroout ) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
505575
506576 dio -> submit .iter = iter ;
507577 dio -> submit .waiter = current ;
508- dio -> submit .poll_bio = NULL ;
509578
510579 if (iocb -> ki_flags & IOCB_NOWAIT )
511580 iomi .flags |= IOMAP_NOWAIT ;
512581
513582 if (iov_iter_rw (iter ) == READ ) {
583+ /* reads can always complete inline */
584+ dio -> flags |= IOMAP_DIO_INLINE_COMP ;
585+
514586 if (iomi .pos >= dio -> i_size )
515587 goto out_free_dio ;
516588
@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
524596 iomi .flags |= IOMAP_WRITE ;
525597 dio -> flags |= IOMAP_DIO_WRITE ;
526598
599+ /*
600+ * Flag as supporting deferred completions, if the issuer
601+ * groks it. This can avoid a workqueue punt for writes.
602+ * We may later clear this flag if we need to do other IO
603+ * as part of this IO completion.
604+ */
605+ if (iocb -> ki_flags & IOCB_DIO_CALLER_COMP )
606+ dio -> flags |= IOMAP_DIO_CALLER_COMP ;
607+
527608 if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY ) {
528609 ret = - EAGAIN ;
529610 if (iomi .pos >= dio -> i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
537618 dio -> flags |= IOMAP_DIO_NEED_SYNC ;
538619
539620 /*
540- * For datasync only writes, we optimistically try
541- * using FUA for this IO. Any non-FUA write that
542- * occurs will clear this flag, hence we know before
543- * completion whether a cache flush is necessary.
621+ * For datasync only writes, we optimistically try using
622+ * WRITE_THROUGH for this IO. This flag requires either
623+ * FUA writes through the device's write cache, or a
624+ * normal write to a device without a volatile write
625+ * cache. For the former, any non-FUA write that occurs
626+ * will clear this flag, hence we know before completion
627+ * whether a cache flush is necessary.
544628 */
545629 if (!(iocb -> ki_flags & IOCB_SYNC ))
546- dio -> flags |= IOMAP_DIO_WRITE_FUA ;
630+ dio -> flags |= IOMAP_DIO_WRITE_THROUGH ;
547631 }
548632
549633 /*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
605689 iomap_dio_set_error (dio , ret );
606690
607691 /*
608- * If all the writes we issued were FUA, we don't need to flush the
609- * cache on IO completion. Clear the sync flag for this case.
692+ * If all the writes we issued were already written through to the
693+ * media, we don't need to flush the cache on IO completion. Clear the
694+ * sync flag for this case.
610695 */
611- if (dio -> flags & IOMAP_DIO_WRITE_FUA )
696+ if (dio -> flags & IOMAP_DIO_WRITE_THROUGH )
612697 dio -> flags &= ~IOMAP_DIO_NEED_SYNC ;
613698
614- WRITE_ONCE (iocb -> private , dio -> submit .poll_bio );
615-
616699 /*
617700 * We are about to drop our additional submission reference, which
618701 * might be the last reference to the dio. There are three different
0 commit comments