@@ -312,80 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
312312}
313313
314314/*
315- * Figure out the bio's operation flags from the dio request, the
316- * mapping, and whether or not we want FUA. Note that we can end up
317- * clearing the WRITE_THROUGH flag in the dio request.
315+ * Use a FUA write if we need datasync semantics and this is a pure data I/O
316+ * that doesn't require any metadata updates (including after I/O completion
317+ * such as unwritten extent conversion) and the underlying device either
318+ * doesn't have a volatile write cache or supports FUA.
319+ * This allows us to avoid cache flushes on I/O completion.
318320 */
319- static inline blk_opf_t iomap_dio_bio_opflags ( struct iomap_dio * dio ,
320- const struct iomap * iomap , bool use_fua , bool atomic_hw )
321+ static inline bool iomap_dio_can_use_fua ( const struct iomap * iomap ,
322+ struct iomap_dio * dio )
321323{
322- blk_opf_t opflags = REQ_SYNC | REQ_IDLE ;
323-
324- if (!(dio -> flags & IOMAP_DIO_WRITE ))
325- return REQ_OP_READ ;
326-
327- opflags |= REQ_OP_WRITE ;
328- if (use_fua )
329- opflags |= REQ_FUA ;
330- else
331- dio -> flags &= ~IOMAP_DIO_WRITE_THROUGH ;
332- if (atomic_hw )
333- opflags |= REQ_ATOMIC ;
334-
335- return opflags ;
324+ if (iomap -> flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY ))
325+ return false;
326+ if (!(dio -> flags & IOMAP_DIO_WRITE_THROUGH ))
327+ return false;
328+ return !bdev_write_cache (iomap -> bdev ) || bdev_fua (iomap -> bdev );
336329}
337330
338331static int iomap_dio_bio_iter (struct iomap_iter * iter , struct iomap_dio * dio )
339332{
340333 const struct iomap * iomap = & iter -> iomap ;
341334 struct inode * inode = iter -> inode ;
342335 unsigned int fs_block_size = i_blocksize (inode ), pad ;
343- bool atomic_hw = iter -> flags & IOMAP_ATOMIC_HW ;
344336 const loff_t length = iomap_length (iter );
345337 loff_t pos = iter -> pos ;
346- blk_opf_t bio_opf ;
338+ blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE ;
347339 struct bio * bio ;
348340 bool need_zeroout = false;
349- bool use_fua = false;
350341 int nr_pages , ret = 0 ;
351342 u64 copied = 0 ;
352343 size_t orig_count ;
353344
354- if (atomic_hw && length != iter -> len )
355- return - EINVAL ;
356-
357345 if ((pos | length ) & (bdev_logical_block_size (iomap -> bdev ) - 1 ) ||
358346 !bdev_iter_is_aligned (iomap -> bdev , dio -> submit .iter ))
359347 return - EINVAL ;
360348
361- if (iomap -> type == IOMAP_UNWRITTEN ) {
362- dio -> flags |= IOMAP_DIO_UNWRITTEN ;
363- need_zeroout = true;
364- }
349+ if (dio -> flags & IOMAP_DIO_WRITE ) {
350+ bio_opf |= REQ_OP_WRITE ;
351+
352+ if (iomap -> flags & IOMAP_F_ATOMIC_BIO ) {
353+ /*
354+ * Ensure that the mapping covers the full write
355+ * length, otherwise it won't be submitted as a single
356+ * bio, which is required to use hardware atomics.
357+ */
358+ if (length != iter -> len )
359+ return - EINVAL ;
360+ bio_opf |= REQ_ATOMIC ;
361+ }
365362
366- if (iomap -> flags & IOMAP_F_SHARED )
367- dio -> flags |= IOMAP_DIO_COW ;
363+ if (iomap -> type == IOMAP_UNWRITTEN ) {
364+ dio -> flags |= IOMAP_DIO_UNWRITTEN ;
365+ need_zeroout = true;
366+ }
367+
368+ if (iomap -> flags & IOMAP_F_SHARED )
369+ dio -> flags |= IOMAP_DIO_COW ;
370+
371+ if (iomap -> flags & IOMAP_F_NEW ) {
372+ need_zeroout = true;
373+ } else if (iomap -> type == IOMAP_MAPPED ) {
374+ if (iomap_dio_can_use_fua (iomap , dio ))
375+ bio_opf |= REQ_FUA ;
376+ else
377+ dio -> flags &= ~IOMAP_DIO_WRITE_THROUGH ;
378+ }
368379
369- if (iomap -> flags & IOMAP_F_NEW ) {
370- need_zeroout = true;
371- } else if (iomap -> type == IOMAP_MAPPED ) {
372380 /*
373- * Use a FUA write if we need datasync semantics, this is a pure
374- * data IO that doesn't require any metadata updates (including
375- * after IO completion such as unwritten extent conversion) and
376- * the underlying device either supports FUA or doesn't have
377- * a volatile write cache. This allows us to avoid cache flushes
378- * on IO completion. If we can't use writethrough and need to
379- * sync, disable in-task completions as dio completion will
380- * need to call generic_write_sync() which will do a blocking
381- * fsync / cache flush call.
381+ * We can only do deferred completion for pure overwrites that
382+ * don't require additional I/O at completion time.
383+ *
384+ * This rules out writes that need zeroing or extent conversion,
385+ * extend the file size, or issue metadata I/O or cache flushes
386+ * during completion processing.
382387 */
383- if (!(iomap -> flags & (IOMAP_F_SHARED |IOMAP_F_DIRTY )) &&
384- (dio -> flags & IOMAP_DIO_WRITE_THROUGH ) &&
385- (bdev_fua (iomap -> bdev ) || !bdev_write_cache (iomap -> bdev )))
386- use_fua = true;
387- else if (dio -> flags & IOMAP_DIO_NEED_SYNC )
388+ if (need_zeroout || (pos >= i_size_read (inode )) ||
389+ ((dio -> flags & IOMAP_DIO_NEED_SYNC ) &&
390+ !(bio_opf & REQ_FUA )))
388391 dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
392+ } else {
393+ bio_opf |= REQ_OP_READ ;
389394 }
390395
391396 /*
@@ -399,18 +404,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
399404 if (!iov_iter_count (dio -> submit .iter ))
400405 goto out ;
401406
402- /*
403- * We can only do deferred completion for pure overwrites that
404- * don't require additional IO at completion. This rules out
405- * writes that need zeroing or extent conversion, extend
406- * the file size, or issue journal IO or cache flushes
407- * during completion processing.
408- */
409- if (need_zeroout ||
410- ((dio -> flags & IOMAP_DIO_NEED_SYNC ) && !use_fua ) ||
411- ((dio -> flags & IOMAP_DIO_WRITE ) && pos >= i_size_read (inode )))
412- dio -> flags &= ~IOMAP_DIO_CALLER_COMP ;
413-
414407 /*
415408 * The rules for polled IO completions follow the guidelines as the
416409 * ones we set for inline and deferred completions. If none of those
@@ -428,8 +421,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
428421 goto out ;
429422 }
430423
431- bio_opf = iomap_dio_bio_opflags (dio , iomap , use_fua , atomic_hw );
432-
433424 nr_pages = bio_iov_vecs_to_alloc (dio -> submit .iter , BIO_MAX_VECS );
434425 do {
435426 size_t n ;
@@ -461,9 +452,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
461452 }
462453
463454 n = bio -> bi_iter .bi_size ;
464- if (WARN_ON_ONCE (atomic_hw && n != length )) {
455+ if (WARN_ON_ONCE (( bio_opf & REQ_ATOMIC ) && n != length )) {
465456 /*
466- * This bio should have covered the complete length,
457+ * An atomic write bio must cover the complete length,
467458 * which it doesn't, so error. We may need to zero out
468459 * the tail (complete FS block), similar to when
469460 * bio_iov_iter_get_pages() returns an error, above.
@@ -686,10 +677,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
686677 iomi .flags |= IOMAP_OVERWRITE_ONLY ;
687678 }
688679
689- if (dio_flags & IOMAP_DIO_ATOMIC_SW )
690- iomi .flags |= IOMAP_ATOMIC_SW ;
691- else if (iocb -> ki_flags & IOCB_ATOMIC )
692- iomi .flags |= IOMAP_ATOMIC_HW ;
680+ if (iocb -> ki_flags & IOCB_ATOMIC )
681+ iomi .flags |= IOMAP_ATOMIC ;
693682
694683 /* for data sync or sync, we need sync completion processing */
695684 if (iocb_is_dsync (iocb )) {
0 commit comments