Skip to content

Commit 891bea7

Browse files
committed
Merge patch series "allow file systems to increase the minimum writeback chunk size v2"
Christoph Hellwig <hch@lst.de> says: The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this can also lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size, and set it to the zone size for zoned XFS. * patches from https://patch.msgid.link/20251017034611.651385-1-hch@lst.de: xfs: set s_min_writeback_pages for zoned file systems writeback: allow the file system to override MIN_WRITEBACK_PAGES writeback: cleanup writeback_chunk_size Link: https://patch.msgid.link/20251017034611.651385-1-hch@lst.de Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 211c43d + 015a544 commit 891bea7

5 files changed

Lines changed: 42 additions & 19 deletions

File tree

fs/fs-writeback.c

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@
3232
#include <linux/memcontrol.h>
3333
#include "internal.h"
3434

35-
/*
36-
* 4MB minimal write chunk size
37-
*/
38-
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
39-
4035
/*
4136
* Passed into wb_writeback(), essentially a subset of writeback_control
4237
*/
@@ -1889,8 +1884,8 @@ static int writeback_single_inode(struct inode *inode,
18891884
return ret;
18901885
}
18911886

1892-
static long writeback_chunk_size(struct bdi_writeback *wb,
1893-
struct wb_writeback_work *work)
1887+
static long writeback_chunk_size(struct super_block *sb,
1888+
struct bdi_writeback *wb, struct wb_writeback_work *work)
18941889
{
18951890
long pages;
18961891

@@ -1908,16 +1903,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
19081903
* (maybe slowly) sync all tagged pages
19091904
*/
19101905
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
1911-
pages = LONG_MAX;
1912-
else {
1913-
pages = min(wb->avg_write_bandwidth / 2,
1914-
global_wb_domain.dirty_limit / DIRTY_SCOPE);
1915-
pages = min(pages, work->nr_pages);
1916-
pages = round_down(pages + MIN_WRITEBACK_PAGES,
1917-
MIN_WRITEBACK_PAGES);
1918-
}
1906+
return LONG_MAX;
19191907

1920-
return pages;
1908+
pages = min(wb->avg_write_bandwidth / 2,
1909+
global_wb_domain.dirty_limit / DIRTY_SCOPE);
1910+
pages = min(pages, work->nr_pages);
1911+
return round_down(pages + sb->s_min_writeback_pages,
1912+
sb->s_min_writeback_pages);
19211913
}
19221914

19231915
/*
@@ -2019,7 +2011,7 @@ static long writeback_sb_inodes(struct super_block *sb,
20192011
inode->i_state |= I_SYNC;
20202012
wbc_attach_and_unlock_inode(&wbc, inode);
20212013

2022-
write_chunk = writeback_chunk_size(wb, work);
2014+
write_chunk = writeback_chunk_size(inode->i_sb, wb, work);
20232015
wbc.nr_to_write = write_chunk;
20242016
wbc.pages_skipped = 0;
20252017

fs/super.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
389389
goto fail;
390390
if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
391391
goto fail;
392+
s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
392393
return s;
393394

394395
fail:

fs/xfs/xfs_zone_alloc.c

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,7 @@ xfs_mount_zones(
12151215
.mp = mp,
12161216
};
12171217
struct xfs_buftarg *bt = mp->m_rtdev_targp;
1218+
xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
12181219
int error;
12191220

12201221
if (!bt) {
@@ -1245,10 +1246,33 @@ xfs_mount_zones(
12451246
return -ENOMEM;
12461247

12471248
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
1248-
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
1249-
mp->m_max_open_zones);
1249+
mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
12501250
trace_xfs_zones_mount(mp);
12511251

1252+
/*
1253+
* The writeback code switches between inodes regularly to provide
1254+
* fairness. The default lower bound is 4MiB, but for zoned file
1255+
* systems we want to increase that both to reduce seeks, but also more
1256+
* importantly so that workloads that write files in a multiple of the
1257+
* zone size do not get fragmented and require garbage collection when
1258+
* they shouldn't. Increase it to the zone size, capped by the max
1259+
* extent len.
1260+
*
1261+
* Note that because s_min_writeback_pages is a superblock field, this
1262+
* value also gets applied to non-zoned files on the data device if
1263+
* there are any. On a typical zoned setup all data is on the RT device
1264+
* because using the more efficient sequential write required zones
1265+
* is the reason for using the zone allocator, and either the RT device
1266+
* and the (meta)data device are on the same block device, or the
1267+
* (meta)data device is on a fast SSD while the data on the RT device
1268+
* is on a SMR HDD. In any combination of the above cases enforcing
1269+
* the higher min_writeback_pages for non-RT inodes is either a noop
1270+
* or beneficial.
1271+
*/
1272+
mp->m_super->s_min_writeback_pages =
1273+
XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
1274+
PAGE_SHIFT;
1275+
12521276
if (bdev_is_zoned(bt->bt_bdev)) {
12531277
error = blkdev_report_zones(bt->bt_bdev,
12541278
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),

include/linux/fs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,7 @@ struct super_block {
15831583

15841584
spinlock_t s_inode_wblist_lock;
15851585
struct list_head s_inodes_wb; /* writeback inodes */
1586+
long s_min_writeback_pages;
15861587
} __randomize_layout;
15871588

15881589
static inline struct user_namespace *i_user_ns(const struct inode *inode)

include/linux/writeback.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *);
374374
void sb_mark_inode_writeback(struct inode *inode);
375375
void sb_clear_inode_writeback(struct inode *inode);
376376

377+
/*
378+
* 4MB minimal write chunk size
379+
*/
380+
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
381+
377382
#endif /* WRITEBACK_H */

0 commit comments

Comments
 (0)