 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
-#define IOMAP_DIO_NO_INVALIDATE	(1U << 25)
-#define IOMAP_DIO_CALLER_COMP		(1U << 26)
-#define IOMAP_DIO_INLINE_COMP		(1U << 27)
+#define IOMAP_DIO_NO_INVALIDATE	(1U << 26)
+#define IOMAP_DIO_COMP_WORK		(1U << 27)
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
 #define IOMAP_DIO_NEED_SYNC		(1U << 29)
 #define IOMAP_DIO_WRITE		(1U << 30)
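
The private/public split above is enforced only by convention. As an illustration (not part of the patch), a compile-time guard along the following lines would catch an accidental overlap, assuming the public flags are the three IOMAP_DIO_* values defined in include/linux/iomap.h:

	/* Sketch, not in the patch: fail the build if a private flag ever
	 * collides with a public IOMAP_DIO_* flag from iomap.h.
	 */
	#define IOMAP_DIO_PUBLIC_FLAGS \
		(IOMAP_DIO_FORCE_WAIT | IOMAP_DIO_OVERWRITE_ONLY | \
		 IOMAP_DIO_PARTIAL)
	#define IOMAP_DIO_PRIVATE_FLAGS \
		(IOMAP_DIO_NO_INVALIDATE | IOMAP_DIO_COMP_WORK | \
		 IOMAP_DIO_WRITE_THROUGH | IOMAP_DIO_NEED_SYNC | IOMAP_DIO_WRITE)
	static_assert((IOMAP_DIO_PUBLIC_FLAGS & IOMAP_DIO_PRIVATE_FLAGS) == 0);
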
@@ -140,11 +139,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
 
-static ssize_t iomap_dio_deferred_complete(void *data)
-{
-	return iomap_dio_complete(data);
-}
-
 static void iomap_dio_complete_work(struct work_struct *work)
 {
 	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -179,33 +173,33 @@ static void iomap_dio_done(struct iomap_dio *dio)
 
 		WRITE_ONCE(dio->submit.waiter, NULL);
 		blk_wake_io_task(waiter);
-	} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
-		WRITE_ONCE(iocb->private, NULL);
-		iomap_dio_complete_work(&dio->aio.work);
-	} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
-		/*
-		 * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
-		 * schedule our completion that way to avoid an async punt to a
-		 * workqueue.
-		 */
-		/* only polled IO cares about private cleared */
-		iocb->private = dio;
-		iocb->dio_complete = iomap_dio_deferred_complete;
+		return;
+	}
 
-		/*
-		 * Invoke ->ki_complete() directly. We've assigned our
-		 * dio_complete callback handler, and since the issuer set
-		 * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
-		 * notice ->dio_complete being set and will defer calling that
-		 * handler until it can be done from a safe task context.
-		 *
-		 * Note that the 'res' being passed in here is not important
-		 * for this case. The actual completion value of the request
-		 * will be gotten from dio_complete when that is run by the
-		 * issuer.
-		 */
-		iocb->ki_complete(iocb, 0);
-	} else {
+	/*
+	 * Always run error completions in user context. These are not
+	 * performance critical and some code relies on taking sleeping locks
+	 * for error handling.
+	 */
+	if (dio->error)
+		dio->flags |= IOMAP_DIO_COMP_WORK;
+
+	/*
+	 * Never invalidate pages from this context, to avoid deadlocks with
+	 * buffered I/O completions when called from the ioend workqueue,
+	 * and to avoid sleeping when called directly from ->bi_end_io.
+	 * Tough luck if you hit the tiny race with someone dirtying the range
+	 * right between this check and the actual completion.
+	 */
+	if ((dio->flags & IOMAP_DIO_WRITE) &&
+	    !(dio->flags & IOMAP_DIO_COMP_WORK)) {
+		if (dio->iocb->ki_filp->f_mapping->nrpages)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+		else
+			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+	}
+
+	if (dio->flags & IOMAP_DIO_COMP_WORK) {
 		struct inode *inode = file_inode(iocb->ki_filp);
 
 		/*
@@ -216,7 +210,11 @@ static void iomap_dio_done(struct iomap_dio *dio)
 		 */
 		INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
 		queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+		return;
 	}
+
+	WRITE_ONCE(iocb->private, NULL);
+	iomap_dio_complete_work(&dio->aio.work);
 }
 
 void iomap_dio_bio_end_io(struct bio *bio)
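
Taken together, the reworked iomap_dio_done() above funnels every completion through one decision chain: wake a synchronous waiter, force errors to the workqueue, punt writes that still have cached pages, and complete everything else inline. A standalone sketch of that routing, with simplified types standing in for the kernel structures (the flag names mirror the patch; everything else is illustrative):

	#include <stdbool.h>

	enum { SK_COMP_WORK = 1 << 0, SK_NO_INVALIDATE = 1 << 1 };

	struct sk_dio {
		bool is_write;
		int error;	/* like dio->error */
		long nrpages;	/* like f_mapping->nrpages */
		int flags;
	};

	/* Returns true if completion must be punted to the workqueue. */
	static bool sk_route_completion(struct sk_dio *dio)
	{
		/* Error completions may take sleeping locks: user context only. */
		if (dio->error)
			dio->flags |= SK_COMP_WORK;

		/*
		 * A write may only complete inline when the mapping holds no
		 * pages, because invalidation can sleep or deadlock here.
		 */
		if (dio->is_write && !(dio->flags & SK_COMP_WORK)) {
			if (dio->nrpages)
				dio->flags |= SK_COMP_WORK;
			else
				dio->flags |= SK_NO_INVALIDATE;
		}
		return dio->flags & SK_COMP_WORK;
	}
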
@@ -252,16 +250,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 		/*
 		 * Try to avoid another context switch for the completion given
 		 * that we are already called from the ioend completion
-		 * workqueue, but never invalidate pages from this thread to
-		 * avoid deadlocks with buffered I/O completions. Tough luck if
-		 * you hit the tiny race with someone dirtying the range now
-		 * between this check and the actual completion.
+		 * workqueue.
 		 */
-		if (!dio->iocb->ki_filp->f_mapping->nrpages) {
-			dio->flags |= IOMAP_DIO_INLINE_COMP;
-			dio->flags |= IOMAP_DIO_NO_INVALIDATE;
-		}
-		dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+		dio->flags &= ~IOMAP_DIO_COMP_WORK;
 		iomap_dio_done(dio);
 	}
 
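The hunk above relies on the routing in iomap_dio_done(): the ioend path already runs on a workqueue, so it strips the punt flag before handing off, and the common code re-adds it only when errors or cached pages require it. A minimal sketch of that caller-side adjustment (simplified flag, illustrative name):

	enum { SK2_COMP_WORK = 1 << 0 };

	/* Already in process context on the ioend workqueue: prefer inline
	 * completion; the common routing may still restore the punt flag.
	 */
	static int sk2_flags_from_ioend(int flags)
	{
		return flags & ~SK2_COMP_WORK;
	}
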
@@ -306,23 +297,6 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 	return 0;
 }
 
-/*
- * Use a FUA write if we need datasync semantics and this is a pure data I/O
- * that doesn't require any metadata updates (including after I/O completion
- * such as unwritten extent conversion) and the underlying device either
- * doesn't have a volatile write cache or supports FUA.
- * This allows us to avoid cache flushes on I/O completion.
- */
-static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
-		struct iomap_dio *dio)
-{
-	if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
-		return false;
-	if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
-		return false;
-	return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
-}
-
 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	const struct iomap *iomap = &iter->iomap;
@@ -351,7 +325,24 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 		return -EINVAL;
 
 	if (dio->flags & IOMAP_DIO_WRITE) {
-		bio_opf |= REQ_OP_WRITE;
+		bool need_completion_work = true;
+
+		switch (iomap->type) {
+		case IOMAP_MAPPED:
+			/*
+			 * Directly mapped I/O does not inherently need to do
+			 * work at I/O completion time. But there are various
+			 * cases below where this will get set again.
+			 */
+			need_completion_work = false;
+			break;
+		case IOMAP_UNWRITTEN:
+			dio->flags |= IOMAP_DIO_UNWRITTEN;
+			need_zeroout = true;
+			break;
+		default:
+			break;
+		}
 
 		if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
 			/*
@@ -364,35 +355,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 			bio_opf |= REQ_ATOMIC;
 		}
 
-		if (iomap->type == IOMAP_UNWRITTEN) {
-			dio->flags |= IOMAP_DIO_UNWRITTEN;
-			need_zeroout = true;
-		}
-
-		if (iomap->flags & IOMAP_F_SHARED)
+		if (iomap->flags & IOMAP_F_SHARED) {
+			/*
+			 * Unsharing needs to update metadata at I/O
+			 * completion time.
+			 */
+			need_completion_work = true;
 			dio->flags |= IOMAP_DIO_COW;
+		}
 
-		if (iomap->flags & IOMAP_F_NEW)
+		if (iomap->flags & IOMAP_F_NEW) {
+			/*
+			 * Newly allocated blocks might need recording in
+			 * metadata at I/O completion time.
+			 */
+			need_completion_work = true;
 			need_zeroout = true;
-		else if (iomap->type == IOMAP_MAPPED &&
-			 iomap_dio_can_use_fua(iomap, dio))
-			bio_opf |= REQ_FUA;
+		}
 
-		if (!(bio_opf & REQ_FUA))
-			dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+		/*
+		 * Use a FUA write if we need datasync semantics and this is a
+		 * pure overwrite that doesn't require any metadata updates.
+		 *
+		 * This allows us to avoid cache flushes on I/O completion.
+		 */
+		if (dio->flags & IOMAP_DIO_WRITE_THROUGH) {
+			if (!need_completion_work &&
+			    !(iomap->flags & IOMAP_F_DIRTY) &&
+			    (!bdev_write_cache(iomap->bdev) ||
+			     bdev_fua(iomap->bdev)))
+				bio_opf |= REQ_FUA;
+			else
+				dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+		}
 
 		/*
-		 * We can only do deferred completion for pure overwrites that
+		 * We can only do inline completion for pure overwrites that
 		 * don't require additional I/O at completion time.
 		 *
-		 * This rules out writes that need zeroing or extent conversion,
-		 * extend the file size, or issue metadata I/O or cache flushes
-		 * during completion processing.
+		 * This rules out writes that need zeroing or metadata updates
+		 * to convert unwritten or shared extents.
+		 *
+		 * Writes that extend i_size are also not supported, but this
+		 * is handled in __iomap_dio_rw().
 		 */
-		if (need_zeroout || (pos >= i_size_read(inode)) ||
-		    ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
-		     !(bio_opf & REQ_FUA)))
-			dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+		if (need_completion_work)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+
+		bio_opf |= REQ_OP_WRITE;
 	} else {
 		bio_opf |= REQ_OP_READ;
 	}
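
The two hunks above fold the removed iomap_dio_can_use_fua() helper into the write path: need_completion_work starts from the mapping type and is re-set by unsharing, new allocations, and (later, in __iomap_dio_rw()) size extension and sync requirements. A standalone sketch of the combined decision, with plain booleans standing in for the iomap flags and bdev helpers (illustrative names, not the kernel API):

	#include <stdbool.h>

	enum sk_map_type { SK_MAPPED, SK_UNWRITTEN, SK_OTHER };

	struct sk_write {
		enum sk_map_type type;
		bool shared;		/* IOMAP_F_SHARED: unshare at completion */
		bool newly_allocated;	/* IOMAP_F_NEW: record new blocks */
		bool inode_dirty;	/* IOMAP_F_DIRTY: metadata not durable */
		bool write_through;	/* datasync write requested */
		bool bdev_write_cache;
		bool bdev_fua;
	};

	static bool sk_needs_completion_work(const struct sk_write *w)
	{
		/*
		 * Only a plain overwrite of mapped blocks skips completion
		 * work; unwritten conversion, unsharing and new allocations
		 * all need metadata updates afterwards.
		 */
		return w->type != SK_MAPPED || w->shared || w->newly_allocated;
	}

	static bool sk_can_use_fua(const struct sk_write *w)
	{
		if (!w->write_through)
			return false;
		/* Completion-time metadata work defeats the point of FUA. */
		if (sk_needs_completion_work(w) || w->inode_dirty)
			return false;
		/* No volatile cache, or the device honours FUA directly. */
		return !w->bdev_write_cache || w->bdev_fua;
	}
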
@@ -413,7 +423,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
	 * ones we set for inline and deferred completions. If none of those
	 * are available for this IO, clear the polled flag.
	 */
-	if (!(dio->flags & (IOMAP_DIO_INLINE_COMP | IOMAP_DIO_CALLER_COMP)))
+	if (dio->flags & IOMAP_DIO_COMP_WORK)
 		dio->iocb->ki_flags &= ~IOCB_HIPRI;
 
 	if (need_zeroout) {
@@ -653,9 +663,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
 
 	if (iov_iter_rw(iter) == READ) {
-		/* reads can always complete inline */
-		dio->flags |= IOMAP_DIO_INLINE_COMP;
-
 		if (iomi.pos >= dio->i_size)
 			goto out_free_dio;
 
@@ -669,15 +676,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		iomi.flags |= IOMAP_WRITE;
 		dio->flags |= IOMAP_DIO_WRITE;
 
-		/*
-		 * Flag as supporting deferred completions, if the issuer
-		 * groks it. This can avoid a workqueue punt for writes.
-		 * We may later clear this flag if we need to do other IO
-		 * as part of this IO completion.
-		 */
-		if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
-			dio->flags |= IOMAP_DIO_CALLER_COMP;
-
 		if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
 			ret = -EAGAIN;
 			if (iomi.pos >= dio->i_size ||
@@ -706,6 +704,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			dio->flags |= IOMAP_DIO_WRITE_THROUGH;
 		}
 
+		/*
+		 * i_size updates must happen from process context.
+		 */
+		if (iomi.pos + iomi.len > dio->i_size)
+			dio->flags |= IOMAP_DIO_COMP_WORK;
+
 		/*
 		 * Try to invalidate cache pages for the range we are writing.
 		 * If this invalidation fails, let the caller fall back to
@@ -778,9 +782,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	 * If all the writes we issued were already written through to the
 	 * media, we don't need to flush the cache on IO completion. Clear the
 	 * sync flag for this case.
+	 *
+	 * Otherwise force a workqueue completion if any sync work is needed,
+	 * as it has to be performed from process context.
 	 */
 	if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
 		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+	else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+		dio->flags |= IOMAP_DIO_COMP_WORK;
 
 	/*
 	 * We are about to drop our additional submission reference, which
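
A standalone sketch of this post-submission durability policy (simplified flags; in the kernel the remaining sync work is performed during iomap_dio_complete(), which may sleep, hence the forced workqueue punt):

	enum {
		SK3_WRITE_THROUGH = 1 << 0,
		SK3_NEED_SYNC = 1 << 1,
		SK3_COMP_WORK = 1 << 2,
	};

	static int sk3_sync_policy(int flags)
	{
		if (flags & SK3_WRITE_THROUGH)
			flags &= ~SK3_NEED_SYNC;	/* all writes were FUA */
		else if (flags & SK3_NEED_SYNC)
			flags |= SK3_COMP_WORK;		/* flush needs task context */
		return flags;
	}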