Skip to content

Commit 310ee09

Browse files
Brian Foster authored and tytso committed
ext4: allow concurrent unaligned dio overwrites
We've had reports of significant performance regression of sub-block (unaligned) direct writes due to the added exclusivity restrictions in ext4. The purpose of the exclusivity requirement for unaligned direct writes is to avoid data corruption caused by unserialized partial block zeroing in the iomap dio layer across overlapping writes. XFS has similar requirements for the same underlying reasons, yet doesn't suffer the extreme performance regression that ext4 does. The reason for this is that XFS utilizes IOMAP_DIO_OVERWRITE_ONLY mode, which allows for optimistic submission of concurrent unaligned I/O and kicks back writes that require partial block zeroing such that they can be submitted in a safe, exclusive context. Since ext4 already performs most of these checks pre-submission, it can support something similar without necessarily relying on the iomap flag and associated retry mechanism. Update the dio write submission path to allow concurrent submission of unaligned direct writes that are purely overwrite and so will not require block zeroing. To improve readability of the various related checks, move the unaligned I/O handling down into ext4_dio_write_checks(), where the dio draining and force wait logic can immediately follow the locking requirement checks. Finally, the IOMAP_DIO_OVERWRITE_ONLY flag is set to enable a warning check as a precaution should the ext4 overwrite logic ever become inconsistent with the zeroing expectations of iomap dio. 
The performance improvement of sub-block direct write I/O is shown in the following fio test on a 64xcpu guest vm:

Test: fio --name=test --ioengine=libaio --direct=1 --group_reporting --overwrite=1 --thread --size=10G --filename=/mnt/fio --readwrite=write --ramp_time=10s --runtime=60s --numjobs=8 --blocksize=2k --iodepth=256 --allow_file_create=0

v6.2: write: IOPS=4328, BW=8724KiB/s
v6.2 (patched): write: IOPS=801k, BW=1565MiB/s

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20230314130759.642710-1-bfoster@redhat.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
1 parent 4c0cfeb commit 310ee09

1 file changed

Lines changed: 46 additions & 40 deletions

File tree

fs/ext4/file.c

Lines changed: 46 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -444,13 +444,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
444444
*/
445445
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
446446
bool *ilock_shared, bool *extend,
447-
bool *unwritten)
447+
bool *unwritten, int *dio_flags)
448448
{
449449
struct file *file = iocb->ki_filp;
450450
struct inode *inode = file_inode(file);
451451
loff_t offset;
452452
size_t count;
453453
ssize_t ret;
454+
bool overwrite, unaligned_io;
454455

455456
restart:
456457
ret = ext4_generic_write_checks(iocb, from);
@@ -459,16 +460,20 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
459460

460461
offset = iocb->ki_pos;
461462
count = ret;
462-
if (ext4_extending_io(inode, offset, count))
463-
*extend = true;
463+
464+
unaligned_io = ext4_unaligned_io(inode, from, offset);
465+
*extend = ext4_extending_io(inode, offset, count);
466+
overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
467+
464468
/*
465-
* Determine whether the IO operation will overwrite allocated
466-
* and initialized blocks.
467-
* We need exclusive i_rwsem for changing security info
468-
* in file_modified().
469+
* Determine whether we need to upgrade to an exclusive lock. This is
470+
* required to change security info in file_modified(), for extending
471+
* I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
472+
* extents (as partial block zeroing may be required).
469473
*/
470-
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
471-
!ext4_overwrite_io(inode, offset, count, unwritten))) {
474+
if (*ilock_shared &&
475+
((!IS_NOSEC(inode) || *extend || !overwrite ||
476+
(unaligned_io && *unwritten)))) {
472477
if (iocb->ki_flags & IOCB_NOWAIT) {
473478
ret = -EAGAIN;
474479
goto out;
@@ -479,6 +484,32 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
479484
goto restart;
480485
}
481486

487+
/*
488+
* Now that locking is settled, determine dio flags and exclusivity
489+
* requirements. Unaligned writes are allowed under shared lock so long
490+
* as they are pure overwrites. Set the iomap overwrite only flag as an
491+
* added precaution in this case. Even though this is unnecessary, we
492+
* can detect and warn on unexpected -EAGAIN if an unsafe unaligned
493+
* write is ever submitted.
494+
*
495+
* Otherwise, concurrent unaligned writes risk data corruption due to
496+
* partial block zeroing in the dio layer, and so the I/O must occur
497+
* exclusively. The inode lock is already held exclusive if the write is
498+
* non-overwrite or extending, so drain all outstanding dio and set the
499+
* force wait dio flag.
500+
*/
501+
if (*ilock_shared && unaligned_io) {
502+
*dio_flags = IOMAP_DIO_OVERWRITE_ONLY;
503+
} else if (!*ilock_shared && (unaligned_io || *extend)) {
504+
if (iocb->ki_flags & IOCB_NOWAIT) {
505+
ret = -EAGAIN;
506+
goto out;
507+
}
508+
if (unaligned_io && (!overwrite || *unwritten))
509+
inode_dio_wait(inode);
510+
*dio_flags = IOMAP_DIO_FORCE_WAIT;
511+
}
512+
482513
ret = file_modified(file);
483514
if (ret < 0)
484515
goto out;
@@ -500,17 +531,10 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
500531
loff_t offset = iocb->ki_pos;
501532
size_t count = iov_iter_count(from);
502533
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
503-
bool extend = false, unaligned_io = false, unwritten = false;
534+
bool extend = false, unwritten = false;
504535
bool ilock_shared = true;
536+
int dio_flags = 0;
505537

506-
/*
507-
* We initially start with shared inode lock unless it is
508-
* unaligned IO which needs exclusive lock anyways.
509-
*/
510-
if (ext4_unaligned_io(inode, from, offset)) {
511-
unaligned_io = true;
512-
ilock_shared = false;
513-
}
514538
/*
515539
* Quick check here without any i_rwsem lock to see if it is extending
516540
* IO. A more reliable check is done in ext4_dio_write_checks() with
@@ -543,16 +567,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
543567
return ext4_buffered_write_iter(iocb, from);
544568
}
545569

546-
ret = ext4_dio_write_checks(iocb, from,
547-
&ilock_shared, &extend, &unwritten);
570+
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
571+
&unwritten, &dio_flags);
548572
if (ret <= 0)
549573
return ret;
550574

551-
/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
552-
if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
553-
ret = -EAGAIN;
554-
goto out;
555-
}
556575
/*
557576
* Make sure inline data cannot be created anymore since we are going
558577
* to allocate blocks for DIO. We know the inode does not have any
@@ -563,19 +582,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
563582
offset = iocb->ki_pos;
564583
count = ret;
565584

566-
/*
567-
* Unaligned direct IO must be serialized among each other as zeroing
568-
* of partial blocks of two competing unaligned IOs can result in data
569-
* corruption.
570-
*
571-
* So we make sure we don't allow any unaligned IO in flight.
572-
* For IOs where we need not wait (like unaligned non-AIO DIO),
573-
* below inode_dio_wait() may anyway become a no-op, since we start
574-
* with exclusive lock.
575-
*/
576-
if (unaligned_io)
577-
inode_dio_wait(inode);
578-
579585
if (extend) {
580586
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
581587
if (IS_ERR(handle)) {
@@ -595,8 +601,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
595601
if (ilock_shared && !unwritten)
596602
iomap_ops = &ext4_iomap_overwrite_ops;
597603
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
598-
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
599-
NULL, 0);
604+
dio_flags, NULL, 0);
605+
WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT));
600606
if (ret == -ENOTBLK)
601607
ret = 0;
602608

0 commit comments

Comments
 (0)