
Commit eca9dc2

Christoph Hellwig authored and Christian Brauner committed
iomap: support write completions from interrupt context
Completions for pure overwrites don't need to be deferred to a workqueue, as there is no work to be done, or at least no work that needs a user context. Set IOMAP_DIO_INLINE_COMP by default for writes, like we already do for reads, and then clear it for all the cases that actually do need a user context for completion, i.e. to update the inode size or record updates to the logical to physical mapping.

I've audited all users of the ->end_io callback, and they only require user context for I/O that involves unwritten extents, COW, size extensions, or error handling, and all of those are still run from workqueue context. This restores the behavior of the old pre-iomap direct I/O code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://patch.msgid.link/20251113170633.1453259-5-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
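To make the decision described above easier to follow, here is a minimal userspace model of the flag logic, not the kernel code itself: the IOMAP_DIO_* names mirror the patch, but the numeric flag values, the struct layout, the dio_classify() helper, and the unwritten_or_cow field are illustrative stand-ins.

/*
 * Minimal model of the inline-completion decision from the commit message:
 * start optimistic, then clear IOMAP_DIO_INLINE_COMP for anything that needs
 * a user (process) context at completion time.
 */
#include <stdbool.h>
#include <stdio.h>

#define IOMAP_DIO_WRITE         (1 << 0)   /* this is a write */
#define IOMAP_DIO_INLINE_COMP   (1 << 1)   /* may complete without a workqueue */
#define IOMAP_DIO_NEED_SYNC     (1 << 2)   /* fsync/fdatasync work at completion */

struct dio_model {
        unsigned int    flags;
        long long       pos, len, i_size;
        bool            unwritten_or_cow;  /* extents need conversion at completion */
};

static void dio_classify(struct dio_model *dio)
{
        dio->flags |= IOMAP_DIO_INLINE_COMP;

        if (!(dio->flags & IOMAP_DIO_WRITE))
                return;                         /* reads can always complete inline */

        if (dio->unwritten_or_cow)              /* unwritten/COW conversion needs process context */
                dio->flags &= ~IOMAP_DIO_INLINE_COMP;

        if (dio->pos + dio->len > dio->i_size)  /* i_size update needs process context */
                dio->flags &= ~IOMAP_DIO_INLINE_COMP;

        if (dio->flags & IOMAP_DIO_NEED_SYNC)   /* sync work needs process context */
                dio->flags &= ~IOMAP_DIO_INLINE_COMP;
}

int main(void)
{
        struct dio_model overwrite = {
                .flags = IOMAP_DIO_WRITE, .pos = 0, .len = 4096, .i_size = 1 << 20,
        };
        struct dio_model extending = {
                .flags = IOMAP_DIO_WRITE, .pos = 1 << 20, .len = 4096, .i_size = 1 << 20,
        };

        dio_classify(&overwrite);
        dio_classify(&extending);
        printf("pure overwrite: %s\n", overwrite.flags & IOMAP_DIO_INLINE_COMP ?
               "inline completion" : "deferred to workqueue");
        printf("size-extending: %s\n", extending.flags & IOMAP_DIO_INLINE_COMP ?
               "inline completion" : "deferred to workqueue");
        return 0;
}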
1 parent 29086a3 commit eca9dc2

1 file changed: fs/iomap/direct-io.c (48 additions, 11 deletions)
@@ -184,6 +184,21 @@ static void iomap_dio_done(struct iomap_dio *dio)
         if (dio->error)
                 dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
+        /*
+         * Never invalidate pages from this context to avoid deadlocks with
+         * buffered I/O completions when called from the ioend workqueue,
+         * or avoid sleeping when called directly from ->bi_end_io.
+         * Tough luck if you hit the tiny race with someone dirtying the range
+         * right between this check and the actual completion.
+         */
+        if ((dio->flags & IOMAP_DIO_WRITE) &&
+            (dio->flags & IOMAP_DIO_INLINE_COMP)) {
+                if (dio->iocb->ki_filp->f_mapping->nrpages)
+                        dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+                else
+                        dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+        }
+
         if (dio->flags & IOMAP_DIO_INLINE_COMP) {
                 WRITE_ONCE(iocb->private, NULL);
                 iomap_dio_complete_work(&dio->aio.work);
@@ -234,15 +249,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
                 /*
                  * Try to avoid another context switch for the completion given
                  * that we are already called from the ioend completion
-                 * workqueue, but never invalidate pages from this thread to
-                 * avoid deadlocks with buffered I/O completions. Tough luck if
-                 * you hit the tiny race with someone dirtying the range now
-                 * between this check and the actual completion.
+                 * workqueue.
                  */
-                if (!dio->iocb->ki_filp->f_mapping->nrpages) {
-                        dio->flags |= IOMAP_DIO_INLINE_COMP;
-                        dio->flags |= IOMAP_DIO_NO_INVALIDATE;
-                }
+                dio->flags |= IOMAP_DIO_INLINE_COMP;
                 iomap_dio_done(dio);
         }
 
@@ -378,6 +387,20 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
                         else
                                 dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
                 }
+
+                /*
+                 * We can only do inline completion for pure overwrites that
+                 * don't require additional I/O at completion time.
+                 *
+                 * This rules out writes that need zeroing or metadata updates to
+                 * convert unwritten or shared extents.
+                 *
+                 * Writes that extend i_size are also not supported, but this is
+                 * handled in __iomap_dio_rw().
+                 */
+                if (need_completion_work)
+                        dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
                 bio_opf |= REQ_OP_WRITE;
         } else {
                 bio_opf |= REQ_OP_READ;
@@ -638,10 +661,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
                 dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
 
-        if (iov_iter_rw(iter) == READ) {
-                /* reads can always complete inline */
-                dio->flags |= IOMAP_DIO_INLINE_COMP;
+        /*
+         * Try to complete inline if we can. For reads this is always possible,
+         * but for writes we'll end up clearing this more often than not.
+         */
+        dio->flags |= IOMAP_DIO_INLINE_COMP;
 
+        if (iov_iter_rw(iter) == READ) {
                 if (iomi.pos >= dio->i_size)
                         goto out_free_dio;
 
@@ -683,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                         dio->flags |= IOMAP_DIO_WRITE_THROUGH;
                 }
 
+                /*
+                 * i_size updates must happen from process context.
+                 */
+                if (iomi.pos + iomi.len > dio->i_size)
+                        dio->flags &= ~IOMAP_DIO_INLINE_COMP;
+
                 /*
                  * Try to invalidate cache pages for the range we are writing.
                  * If this invalidation fails, let the caller fall back to
@@ -755,9 +787,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
          * If all the writes we issued were already written through to the
          * media, we don't need to flush the cache on IO completion. Clear the
          * sync flag for this case.
+         *
+         * Otherwise clear the inline completion flag if any sync work is
+         * needed, as that needs to be performed from process context.
          */
         if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
                 dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+        else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+                dio->flags &= ~IOMAP_DIO_INLINE_COMP;
 
         /*
          * We are about to drop our additional submission reference, which
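As a companion to the first sketch, the snippet below models the completion-time check that the patch adds to iomap_dio_done(): even when a write qualified for inline completion at submission time, it is pushed back to the workqueue if the file has cached pages, since invalidating the page cache may sleep and can deadlock against buffered I/O completions. Again, the types, flag values, and helper names are stand-ins, not the kernel implementation.

/*
 * Simplified model of the page-cache check at completion time: writes with
 * cached pages fall back to the workqueue so invalidation can happen there;
 * otherwise invalidation is skipped and the completion runs inline.
 */
#include <stdio.h>

#define IOMAP_DIO_WRITE           (1 << 0)
#define IOMAP_DIO_INLINE_COMP     (1 << 1)
#define IOMAP_DIO_NO_INVALIDATE   (1 << 2)

struct dio_done_model {
        unsigned int    flags;
        unsigned long   mapping_nrpages;        /* stand-in for f_mapping->nrpages */
};

static const char *dio_completion_path(struct dio_done_model *dio)
{
        if ((dio->flags & IOMAP_DIO_WRITE) &&
            (dio->flags & IOMAP_DIO_INLINE_COMP)) {
                if (dio->mapping_nrpages)
                        dio->flags &= ~IOMAP_DIO_INLINE_COMP;   /* let the workqueue invalidate */
                else
                        dio->flags |= IOMAP_DIO_NO_INVALIDATE;  /* nothing cached, skip it */
        }
        return dio->flags & IOMAP_DIO_INLINE_COMP ?
                "complete inline" : "defer to workqueue";
}

int main(void)
{
        struct dio_done_model clean = {
                .flags = IOMAP_DIO_WRITE | IOMAP_DIO_INLINE_COMP,
        };
        struct dio_done_model cached = {
                .flags = IOMAP_DIO_WRITE | IOMAP_DIO_INLINE_COMP,
                .mapping_nrpages = 8,
        };

        printf("no cached pages:   %s\n", dio_completion_path(&clean));
        printf("with cached pages: %s\n", dio_completion_path(&cached));
        return 0;
}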
