Skip to content

Commit 560507c

Browse files
committed
Merge patch series "iomap: zero range folio batch support"
Brian Foster <bfoster@redhat.com> says: This adds folio batch support for iomap. This initially only targets zero range, the use case being zeroing of dirty folios over unwritten mappings. There is potential to support other operations in the future: iomap seek data/hole has similar raciness issues as zero range, the prospect of using this for buffered write has been raised for granular locking purposes, etc. The one major caveat with this zero range implementation is that it doesn't look at iomap_folio_state to determine whether to zero a sub-folio portion of the folio. Instead it just relies on whether the folio was dirty or not. This means that spurious zeroing of unwritten ranges is possible if a folio is dirty but the target range includes a subrange that is not. The reasoning is that this is essentially a complexity tradeoff. The current use cases for iomap_zero_range() are limited mostly to partial block zeroing scenarios. It's relatively harmless to zero an unwritten block (i.e. not a correctness issue), and this is something that filesystems have done in the past without much notice or issue. The advantage is less code and this makes it a little easier to use a filemap lookup function for the batch rather than open coding more logic in iomap. That said, this can probably be enhanced to look at ifs in the future if the use case expands and/or other operations justify it. WRT testing, I've tested with and without a local hack to redirect fallocate zero range calls to iomap_zero_range() in XFS. This helps test beyond the partial block/folio use case, i.e. to cover boundary conditions like full folio batch handling, etc. I recently added patch 7 in the spirit of that, which turns this logic into an XFS errortag. Further comments on that are inline with patch 7.
* patches from https://lore.kernel.org/20251003134642.604736-1-bfoster@redhat.com:
  - xfs: error tag to force zeroing on debug kernels
  - iomap: remove old partial eof zeroing optimization
  - xfs: fill dirty folios on zero range of unwritten mappings
  - xfs: always trim mapping to requested range for zero range
  - iomap: optional zero range dirty folio processing
  - iomap: move pos+len BUG_ON() to after folio lookup
  - filemap: add helper to look up dirty folios in a range

Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 4966b46 + 66d78a1 commit 560507c

8 files changed

Lines changed: 211 additions & 50 deletions

File tree

fs/iomap/buffered-io.c

Lines changed: 85 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
772772
if (!mapping_large_folio_support(iter->inode->i_mapping))
773773
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
774774

775+
if (iter->fbatch) {
776+
struct folio *folio = folio_batch_next(iter->fbatch);
777+
778+
if (!folio)
779+
return NULL;
780+
781+
/*
782+
* The folio mapping generally shouldn't have changed based on
783+
* fs locks, but be consistent with filemap lookup and retry
784+
* the iter if it does.
785+
*/
786+
folio_lock(folio);
787+
if (unlikely(folio->mapping != iter->inode->i_mapping)) {
788+
iter->iomap.flags |= IOMAP_F_STALE;
789+
folio_unlock(folio);
790+
return NULL;
791+
}
792+
793+
folio_get(folio);
794+
return folio;
795+
}
796+
775797
if (write_ops && write_ops->get_folio)
776798
return write_ops->get_folio(iter, pos, len);
777799
return iomap_get_folio(iter, pos, len);
@@ -826,15 +848,14 @@ static int iomap_write_begin(struct iomap_iter *iter,
826848
size_t *poffset, u64 *plen)
827849
{
828850
const struct iomap *srcmap = iomap_iter_srcmap(iter);
829-
loff_t pos = iter->pos;
851+
loff_t pos;
830852
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
831853
struct folio *folio;
832854
int status = 0;
833855

834856
len = min_not_zero(len, *plen);
835-
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
836-
if (srcmap != &iter->iomap)
837-
BUG_ON(pos + len > srcmap->offset + srcmap->length);
857+
*foliop = NULL;
858+
*plen = 0;
838859

839860
if (fatal_signal_pending(current))
840861
return -EINTR;
@@ -843,6 +864,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
843864
if (IS_ERR(folio))
844865
return PTR_ERR(folio);
845866

867+
/*
868+
* No folio means we're done with a batch. We still have range to
869+
* process so return and let the caller iterate and refill the batch.
870+
*/
871+
if (!folio) {
872+
WARN_ON_ONCE(!iter->fbatch);
873+
return 0;
874+
}
875+
846876
/*
847877
* Now we have a locked folio, before we do anything with it we need to
848878
* check that the iomap we have cached is not stale. The inode extent
@@ -863,6 +893,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
863893
}
864894
}
865895

896+
/*
897+
* The folios in a batch may not be contiguous. If we've skipped
898+
* forward, advance the iter to the pos of the current folio. If the
899+
* folio starts beyond the end of the mapping, it may have been trimmed
900+
* since the lookup for whatever reason. Return a NULL folio to
901+
* terminate the op.
902+
*/
903+
if (folio_pos(folio) > iter->pos) {
904+
len = min_t(u64, folio_pos(folio) - iter->pos,
905+
iomap_length(iter));
906+
status = iomap_iter_advance(iter, len);
907+
len = iomap_length(iter);
908+
if (status || !len)
909+
goto out_unlock;
910+
}
911+
866912
pos = iomap_trim_folio_range(iter, folio, poffset, &len);
867913

868914
if (srcmap->type == IOMAP_INLINE)
@@ -1409,6 +1455,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
14091455
if (iter->iomap.flags & IOMAP_F_STALE)
14101456
break;
14111457

1458+
/* a NULL folio means we're done with a folio batch */
1459+
if (!folio) {
1460+
status = iomap_iter_advance_full(iter);
1461+
break;
1462+
}
1463+
14121464
/* warn about zeroing folios beyond eof that won't write back */
14131465
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
14141466

@@ -1433,6 +1485,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
14331485
return status;
14341486
}
14351487

1488+
loff_t
1489+
iomap_fill_dirty_folios(
1490+
struct iomap_iter *iter,
1491+
loff_t offset,
1492+
loff_t length)
1493+
{
1494+
struct address_space *mapping = iter->inode->i_mapping;
1495+
pgoff_t start = offset >> PAGE_SHIFT;
1496+
pgoff_t end = (offset + length - 1) >> PAGE_SHIFT;
1497+
1498+
iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
1499+
if (!iter->fbatch)
1500+
return offset + length;
1501+
folio_batch_init(iter->fbatch);
1502+
1503+
filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
1504+
return (start << PAGE_SHIFT);
1505+
}
1506+
EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
1507+
14361508
int
14371509
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
14381510
const struct iomap_ops *ops,
@@ -1446,46 +1518,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
14461518
.private = private,
14471519
};
14481520
struct address_space *mapping = inode->i_mapping;
1449-
unsigned int blocksize = i_blocksize(inode);
1450-
unsigned int off = pos & (blocksize - 1);
1451-
loff_t plen = min_t(loff_t, len, blocksize - off);
14521521
int ret;
14531522
bool range_dirty;
14541523

1455-
/*
1456-
* Zero range can skip mappings that are zero on disk so long as
1457-
* pagecache is clean. If pagecache was dirty prior to zero range, the
1458-
* mapping converts on writeback completion and so must be zeroed.
1459-
*
1460-
* The simplest way to deal with this across a range is to flush
1461-
* pagecache and process the updated mappings. To avoid excessive
1462-
* flushing on partial eof zeroing, special case it to zero the
1463-
* unaligned start portion if already dirty in pagecache.
1464-
*/
1465-
if (off &&
1466-
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
1467-
iter.len = plen;
1468-
while ((ret = iomap_iter(&iter, ops)) > 0)
1469-
iter.status = iomap_zero_iter(&iter, did_zero,
1470-
write_ops);
1471-
1472-
iter.len = len - (iter.pos - pos);
1473-
if (ret || !iter.len)
1474-
return ret;
1475-
}
1476-
14771524
/*
14781525
* To avoid an unconditional flush, check pagecache state and only flush
14791526
* if dirty and the fs returns a mapping that might convert on
14801527
* writeback.
14811528
*/
1482-
range_dirty = filemap_range_needs_writeback(inode->i_mapping,
1483-
iter.pos, iter.pos + iter.len - 1);
1529+
range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
1530+
iter.pos + iter.len - 1);
14841531
while ((ret = iomap_iter(&iter, ops)) > 0) {
14851532
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
14861533

1487-
if (srcmap->type == IOMAP_HOLE ||
1488-
srcmap->type == IOMAP_UNWRITTEN) {
1534+
if (WARN_ON_ONCE(iter.fbatch &&
1535+
srcmap->type != IOMAP_UNWRITTEN))
1536+
return -EIO;
1537+
1538+
if (!iter.fbatch &&
1539+
(srcmap->type == IOMAP_HOLE ||
1540+
srcmap->type == IOMAP_UNWRITTEN)) {
14891541
s64 status;
14901542

14911543
if (range_dirty) {

fs/iomap/iter.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@
88

99
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
1010
{
11+
if (iter->fbatch) {
12+
folio_batch_release(iter->fbatch);
13+
kfree(iter->fbatch);
14+
iter->fbatch = NULL;
15+
}
16+
1117
iter->status = 0;
1218
memset(&iter->iomap, 0, sizeof(iter->iomap));
1319
memset(&iter->srcmap, 0, sizeof(iter->srcmap));

fs/xfs/libxfs/xfs_errortag.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@
7373
#define XFS_ERRTAG_WRITE_DELAY_MS 43
7474
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
7575
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
76-
#define XFS_ERRTAG_MAX 46
76+
#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
77+
#define XFS_ERRTAG_MAX 47
7778

7879
/*
7980
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
133134
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
134135
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
135136
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
136-
XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
137+
XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
138+
XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
137139
#endif /* XFS_ERRTAG */
138140

139141
#endif /* __XFS_ERRORTAG_H_ */

fs/xfs/xfs_file.c

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "xfs_file.h"
2828
#include "xfs_aops.h"
2929
#include "xfs_zone_alloc.h"
30+
#include "xfs_error.h"
31+
#include "xfs_errortag.h"
3032

3133
#include <linux/dax.h>
3234
#include <linux/falloc.h>
@@ -1254,23 +1256,36 @@ xfs_falloc_zero_range(
12541256
struct xfs_zone_alloc_ctx *ac)
12551257
{
12561258
struct inode *inode = file_inode(file);
1259+
struct xfs_inode *ip = XFS_I(inode);
12571260
unsigned int blksize = i_blocksize(inode);
12581261
loff_t new_size = 0;
12591262
int error;
12601263

1261-
trace_xfs_zero_file_space(XFS_I(inode));
1264+
trace_xfs_zero_file_space(ip);
12621265

12631266
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
12641267
if (error)
12651268
return error;
12661269

1267-
error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
1268-
if (error)
1269-
return error;
1270+
/*
1271+
* Zero range implements a full zeroing mechanism but is only used in
1272+
* limited situations. It is more efficient to allocate unwritten
1273+
* extents than to perform zeroing here, so use an errortag to randomly
1274+
* force zeroing on DEBUG kernels for added test coverage.
1275+
*/
1276+
if (XFS_TEST_ERROR(ip->i_mount,
1277+
XFS_ERRTAG_FORCE_ZERO_RANGE)) {
1278+
error = xfs_zero_range(ip, offset, len, ac, NULL);
1279+
} else {
1280+
error = xfs_free_file_space(ip, offset, len, ac);
1281+
if (error)
1282+
return error;
12701283

1271-
len = round_up(offset + len, blksize) - round_down(offset, blksize);
1272-
offset = round_down(offset, blksize);
1273-
error = xfs_alloc_file_space(XFS_I(inode), offset, len);
1284+
len = round_up(offset + len, blksize) -
1285+
round_down(offset, blksize);
1286+
offset = round_down(offset, blksize);
1287+
error = xfs_alloc_file_space(ip, offset, len);
1288+
}
12741289
if (error)
12751290
return error;
12761291
return xfs_falloc_setsize(file, new_size);

fs/xfs/xfs_iomap.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,6 +1702,8 @@ xfs_buffered_write_iomap_begin(
17021702
struct iomap *iomap,
17031703
struct iomap *srcmap)
17041704
{
1705+
struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
1706+
iomap);
17051707
struct xfs_inode *ip = XFS_I(inode);
17061708
struct xfs_mount *mp = ip->i_mount;
17071709
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1767,21 +1769,41 @@ xfs_buffered_write_iomap_begin(
17671769
}
17681770

17691771
/*
1770-
* For zeroing, trim a delalloc extent that extends beyond the EOF
1771-
* block. If it starts beyond the EOF block, convert it to an
1772+
* For zeroing, trim extents that extend beyond the EOF block. If a
1773+
* delalloc extent starts beyond the EOF block, convert it to an
17721774
* unwritten extent.
17731775
*/
1774-
if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
1775-
isnullstartblock(imap.br_startblock)) {
1776+
if (flags & IOMAP_ZERO) {
17761777
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
1778+
u64 end;
17771779

1778-
if (offset_fsb >= eof_fsb)
1780+
if (isnullstartblock(imap.br_startblock) &&
1781+
offset_fsb >= eof_fsb)
17791782
goto convert_delay;
1780-
if (end_fsb > eof_fsb) {
1783+
if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
17811784
end_fsb = eof_fsb;
1782-
xfs_trim_extent(&imap, offset_fsb,
1783-
end_fsb - offset_fsb);
1785+
1786+
/*
1787+
* Look up dirty folios for unwritten mappings within EOF.
1788+
* Providing this bypasses the flush iomap uses to trigger
1789+
* extent conversion when unwritten mappings have dirty
1790+
* pagecache in need of zeroing.
1791+
*
1792+
* Trim the mapping to the end pos of the lookup, which in turn
1793+
* was trimmed to the end of the batch if it became full before
1794+
* the end of the mapping.
1795+
*/
1796+
if (imap.br_state == XFS_EXT_UNWRITTEN &&
1797+
offset_fsb < eof_fsb) {
1798+
loff_t len = min(count,
1799+
XFS_FSB_TO_B(mp, imap.br_blockcount));
1800+
1801+
end = iomap_fill_dirty_folios(iter, offset, len);
1802+
end_fsb = min_t(xfs_fileoff_t, end_fsb,
1803+
XFS_B_TO_FSB(mp, end));
17841804
}
1805+
1806+
xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
17851807
}
17861808

17871809
/*

include/linux/iomap.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <linux/types.h>
1010
#include <linux/mm_types.h>
1111
#include <linux/blkdev.h>
12+
#include <linux/pagevec.h>
1213

1314
struct address_space;
1415
struct fiemap_extent_info;
@@ -242,6 +243,7 @@ struct iomap_iter {
242243
unsigned flags;
243244
struct iomap iomap;
244245
struct iomap srcmap;
246+
struct folio_batch *fbatch;
245247
void *private;
246248
};
247249

@@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
350352
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
351353
const struct iomap_ops *ops,
352354
const struct iomap_write_ops *write_ops);
355+
loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
356+
loff_t length);
353357
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
354358
bool *did_zero, const struct iomap_ops *ops,
355359
const struct iomap_write_ops *write_ops, void *private);

include/linux/pagemap.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
977977
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
978978
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
979979
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
980+
unsigned filemap_get_folios_dirty(struct address_space *mapping,
981+
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
980982

981983
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
982984
filler_t *filler, struct file *file);

0 commit comments

Comments (0)