Skip to content

Commit 263e777

Browse files
committed
Merge tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs writeback updates from Christian Brauner: "This contains work addressing lockups reported by users when a systemd unit reading lots of files from a filesystem mounted with the lazytime mount option exits. With the lazytime mount option enabled we can be switching many dirty inodes on cgroup exit to the parent cgroup. The numbers observed in practice when systemd slice of a large cron job exits can easily reach hundreds of thousands or millions. The logic in inode_do_switch_wbs() which sorts the inode into appropriate place in b_dirty list of the target wb however has linear complexity in the number of dirty inodes thus overall time complexity of switching all the inodes is quadratic leading to workers being pegged for hours consuming 100% of the CPU and switching inodes to the parent wb. Simple reproducer of the issue: FILES=10000 # Filesystem mounted with lazytime mount option MNT=/mnt/ echo "Creating files and switching timestamps" for (( j = 0; j < 50; j ++ )); do mkdir $MNT/dir$j for (( i = 0; i < $FILES; i++ )); do echo "foo" >$MNT/dir$j/file$i done touch -a -t 202501010000 $MNT/dir$j/file* done wait echo "Syncing and flushing" sync echo 3 >/proc/sys/vm/drop_caches echo "Reading all files from a cgroup" mkdir /sys/fs/cgroup/unified/mycg1 || exit echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit for (( j = 0; j < 50; j ++ )); do cat /mnt/dir$j/file* >/dev/null & done wait echo "Switching wbs" # Now rmdir the cgroup after the script exits This can be solved by: - Avoiding contention on the wb->list_lock when switching inodes by running a single work item per wb and managing a queue of items switching to the wb - Allowing rescheduling when switching inodes over to a different cgroup to avoid softlockups - Maintaining the b_dirty list ordering instead of sorting it" * tag 'vfs-6.18-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: writeback: Add tracepoint to track pending inode switches writeback: Avoid excessively long 
inode switching times writeback: Avoid softlockup when switching many inodes writeback: Avoid contention on wb->list_lock when switching inodes
2 parents 18b19ab + 9426414 commit 263e777

5 files changed

Lines changed: 126 additions & 47 deletions

File tree

fs/fs-writeback.c

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
368368
}
369369

370370
struct inode_switch_wbs_context {
371-
struct rcu_work work;
371+
/* List of queued switching contexts for the wb */
372+
struct llist_node list;
372373

373374
/*
374375
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
378379
* array embedded into struct inode_switch_wbs_context. Otherwise
379380
* an inode could be left in a non-consistent state.
380381
*/
381-
struct bdi_writeback *new_wb;
382382
struct inode *inodes[];
383383
};
384384

@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
445445
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
446446
* the specific list @inode was on is ignored and the @inode is put on
447447
* ->b_dirty which is always correct including from ->b_dirty_time.
448-
* The transfer preserves @inode->dirtied_when ordering. If the @inode
449-
* was clean, it means it was on the b_attached list, so move it onto
450-
* the b_attached list of @new_wb.
448+
* If the @inode was clean, it means it was on the b_attached list, so
449+
* move it onto the b_attached list of @new_wb.
451450
*/
452451
if (!list_empty(&inode->i_io_list)) {
453452
inode->i_wb = new_wb;
454453

455454
if (inode->i_state & I_DIRTY_ALL) {
456-
struct inode *pos;
457-
458-
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
459-
if (time_after_eq(inode->dirtied_when,
460-
pos->dirtied_when))
461-
break;
455+
/*
456+
* We need to keep b_dirty list sorted by
457+
* dirtied_time_when. However properly sorting the
458+
* inode in the list gets too expensive when switching
459+
* many inodes. So just attach inode at the end of the
460+
* dirty list and clobber the dirtied_time_when.
461+
*/
462+
inode->dirtied_time_when = jiffies;
462463
inode_io_list_move_locked(inode, new_wb,
463-
pos->i_io_list.prev);
464+
&new_wb->b_dirty);
464465
} else {
465466
inode_cgwb_move_to_attached(inode, new_wb);
466467
}
@@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
486487
return switched;
487488
}
488489

489-
static void inode_switch_wbs_work_fn(struct work_struct *work)
490+
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
491+
struct inode_switch_wbs_context *isw)
490492
{
491-
struct inode_switch_wbs_context *isw =
492-
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
493493
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
494494
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
495-
struct bdi_writeback *new_wb = isw->new_wb;
496495
unsigned long nr_switched = 0;
497496
struct inode **inodep;
498497

@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
502501
*/
503502
down_read(&bdi->wb_switch_rwsem);
504503

504+
inodep = isw->inodes;
505505
/*
506506
* By the time control reaches here, RCU grace period has passed
507507
* since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
512512
* gives us exclusion against all wb related operations on @inode
513513
* including IO list manipulations and stat updates.
514514
*/
515+
relock:
515516
if (old_wb < new_wb) {
516517
spin_lock(&old_wb->list_lock);
517518
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
520521
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
521522
}
522523

523-
for (inodep = isw->inodes; *inodep; inodep++) {
524+
while (*inodep) {
524525
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
525526
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
526527
nr_switched++;
528+
inodep++;
529+
if (*inodep && need_resched()) {
530+
spin_unlock(&new_wb->list_lock);
531+
spin_unlock(&old_wb->list_lock);
532+
cond_resched();
533+
goto relock;
534+
}
527535
}
528536

529537
spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
543551
atomic_dec(&isw_nr_in_flight);
544552
}
545553

554+
void inode_switch_wbs_work_fn(struct work_struct *work)
555+
{
556+
struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
557+
switch_work);
558+
struct inode_switch_wbs_context *isw, *next_isw;
559+
struct llist_node *list;
560+
561+
/*
562+
* Grab out reference to wb so that it cannot get freed under us
563+
* after we process all the isw items.
564+
*/
565+
wb_get(new_wb);
566+
while (1) {
567+
list = llist_del_all(&new_wb->switch_wbs_ctxs);
568+
/* Nothing to do? */
569+
if (!list)
570+
break;
571+
/*
572+
* In addition to synchronizing among switchers, I_WB_SWITCH
573+
* tells the RCU protected stat update paths to grab the i_page
574+
* lock so that stat transfer can synchronize against them.
575+
* Let's continue after I_WB_SWITCH is guaranteed to be
576+
* visible.
577+
*/
578+
synchronize_rcu();
579+
580+
llist_for_each_entry_safe(isw, next_isw, list, list)
581+
process_inode_switch_wbs(new_wb, isw);
582+
}
583+
wb_put(new_wb);
584+
}
585+
546586
static bool inode_prepare_wbs_switch(struct inode *inode,
547587
struct bdi_writeback *new_wb)
548588
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
572612
return true;
573613
}
574614

615+
static void wb_queue_isw(struct bdi_writeback *wb,
616+
struct inode_switch_wbs_context *isw)
617+
{
618+
if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
619+
queue_work(isw_wq, &wb->switch_work);
620+
}
621+
575622
/**
576623
* inode_switch_wbs - change the wb association of an inode
577624
* @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
585632
struct backing_dev_info *bdi = inode_to_bdi(inode);
586633
struct cgroup_subsys_state *memcg_css;
587634
struct inode_switch_wbs_context *isw;
635+
struct bdi_writeback *new_wb = NULL;
588636

589637
/* noop if seems to be already in progress */
590638
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
609657
if (!memcg_css)
610658
goto out_free;
611659

612-
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
660+
new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
613661
css_put(memcg_css);
614-
if (!isw->new_wb)
662+
if (!new_wb)
615663
goto out_free;
616664

617-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
665+
if (!inode_prepare_wbs_switch(inode, new_wb))
618666
goto out_free;
619667

620668
isw->inodes[0] = inode;
621669

622-
/*
623-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
624-
* the RCU protected stat update paths to grab the i_page
625-
* lock so that stat transfer can synchronize against them.
626-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
627-
*/
628-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
629-
queue_rcu_work(isw_wq, &isw->work);
670+
trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
671+
wb_queue_isw(new_wb, isw);
630672
return;
631673

632674
out_free:
633675
atomic_dec(&isw_nr_in_flight);
634-
if (isw->new_wb)
635-
wb_put(isw->new_wb);
676+
if (new_wb)
677+
wb_put(new_wb);
636678
kfree(isw);
637679
}
638680

639-
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
681+
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
682+
struct inode_switch_wbs_context *isw,
640683
struct list_head *list, int *nr)
641684
{
642685
struct inode *inode;
643686

644687
list_for_each_entry(inode, list, i_io_list) {
645-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
688+
if (!inode_prepare_wbs_switch(inode, new_wb))
646689
continue;
647690

648691
isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
666709
{
667710
struct cgroup_subsys_state *memcg_css;
668711
struct inode_switch_wbs_context *isw;
712+
struct bdi_writeback *new_wb;
669713
int nr;
670714
bool restart = false;
671715

@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
678722

679723
for (memcg_css = wb->memcg_css->parent; memcg_css;
680724
memcg_css = memcg_css->parent) {
681-
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
682-
if (isw->new_wb)
725+
new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
726+
if (new_wb)
683727
break;
684728
}
685-
if (unlikely(!isw->new_wb))
686-
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
729+
if (unlikely(!new_wb))
730+
new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
687731

688732
nr = 0;
689733
spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
695739
* bandwidth restrictions, as writeback of inode metadata is not
696740
* accounted for.
697741
*/
698-
restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
742+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
699743
if (!restart)
700-
restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
744+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
745+
&nr);
701746
spin_unlock(&wb->list_lock);
702747

703748
/* no attached inodes? bail out */
704749
if (nr == 0) {
705750
atomic_dec(&isw_nr_in_flight);
706-
wb_put(isw->new_wb);
751+
wb_put(new_wb);
707752
kfree(isw);
708753
return restart;
709754
}
710755

711-
/*
712-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
713-
* the RCU protected stat update paths to grab the i_page
714-
* lock so that stat transfer can synchronize against them.
715-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
716-
*/
717-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
718-
queue_rcu_work(isw_wq, &isw->work);
756+
trace_inode_switch_wbs_queue(wb, new_wb, nr);
757+
wb_queue_isw(new_wb, isw);
719758

720759
return restart;
721760
}

include/linux/backing-dev-defs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ struct bdi_writeback {
152152
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
153153
struct list_head b_attached; /* attached inodes, protected by list_lock */
154154
struct list_head offline_node; /* anchored at offline_cgwbs */
155+
struct work_struct switch_work; /* work used to perform inode switching
156+
* to this wb */
157+
struct llist_head switch_wbs_ctxs; /* queued contexts for
158+
* writeback switching */
155159

156160
union {
157161
struct work_struct release_work;

include/linux/writeback.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
265265
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
266266
}
267267

268+
void inode_switch_wbs_work_fn(struct work_struct *work);
269+
268270
#else /* CONFIG_CGROUP_WRITEBACK */
269271

270272
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)

include/trace/events/writeback.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
213213
)
214214
);
215215

216+
TRACE_EVENT(inode_switch_wbs_queue,
217+
218+
TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
219+
unsigned int count),
220+
221+
TP_ARGS(old_wb, new_wb, count),
222+
223+
TP_STRUCT__entry(
224+
__array(char, name, 32)
225+
__field(ino_t, old_cgroup_ino)
226+
__field(ino_t, new_cgroup_ino)
227+
__field(unsigned int, count)
228+
),
229+
230+
TP_fast_assign(
231+
strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
232+
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
233+
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
234+
__entry->count = count;
235+
),
236+
237+
TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
238+
__entry->name,
239+
(unsigned long)__entry->old_cgroup_ino,
240+
(unsigned long)__entry->new_cgroup_ino,
241+
__entry->count
242+
)
243+
);
244+
216245
TRACE_EVENT(inode_switch_wbs,
217246

218247
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,

mm/backing-dev.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
633633
wb_exit(wb);
634634
bdi_put(bdi);
635635
WARN_ON_ONCE(!list_empty(&wb->b_attached));
636+
WARN_ON_ONCE(work_pending(&wb->switch_work));
636637
call_rcu(&wb->rcu, cgwb_free_rcu);
637638
}
638639

@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
709710
wb->memcg_css = memcg_css;
710711
wb->blkcg_css = blkcg_css;
711712
INIT_LIST_HEAD(&wb->b_attached);
713+
INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
714+
init_llist_head(&wb->switch_wbs_ctxs);
712715
INIT_WORK(&wb->release_work, cgwb_release_workfn);
713716
set_bit(WB_registered, &wb->state);
714717
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
839842
if (!ret) {
840843
bdi->wb.memcg_css = &root_mem_cgroup->css;
841844
bdi->wb.blkcg_css = blkcg_root_css;
845+
INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
846+
init_llist_head(&bdi->wb.switch_wbs_ctxs);
842847
}
843848
return ret;
844849
}

0 commit comments

Comments
 (0)