Skip to content

Commit 9426414

Browse files
committed
Merge patch series "writeback: Avoid lockups when switching inodes"
Jan Kara <jack@suse.cz> says: This patch series addresses lockups reported by users when a systemd unit that was reading lots of files from a filesystem mounted with the lazytime mount option exits. See patch 3 for more details about the reproducer. There are two main issues that make switching many inodes between wbs slow: 1) Multiple workers will be spawned to do the switching but they all contend on the same wb->list_lock making all the parallelism pointless and just wasting time. 2) Sorting of wb->b_dirty list by dirtied_time_when is inherently slow. Patches 1-3 address these problems, patch 4 adds a tracepoint for better observability of inode writeback switching. * patches from https://lore.kernel.org/20250912103522.2935-1-jack@suse.cz: writeback: Add tracepoint to track pending inode switches writeback: Avoid excessively long inode switching times writeback: Avoid softlockup when switching many inodes writeback: Avoid contention on wb->list_lock when switching inodes Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 8f5ae30 + 0cee64c commit 9426414

5 files changed

Lines changed: 126 additions & 47 deletions

File tree

fs/fs-writeback.c

Lines changed: 86 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
368368
}
369369

370370
struct inode_switch_wbs_context {
371-
struct rcu_work work;
371+
/* List of queued switching contexts for the wb */
372+
struct llist_node list;
372373

373374
/*
374375
* Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
378379
* array embedded into struct inode_switch_wbs_context. Otherwise
379380
* an inode could be left in a non-consistent state.
380381
*/
381-
struct bdi_writeback *new_wb;
382382
struct inode *inodes[];
383383
};
384384

@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
445445
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
446446
* the specific list @inode was on is ignored and the @inode is put on
447447
* ->b_dirty which is always correct including from ->b_dirty_time.
448-
* The transfer preserves @inode->dirtied_when ordering. If the @inode
449-
* was clean, it means it was on the b_attached list, so move it onto
450-
* the b_attached list of @new_wb.
448+
* If the @inode was clean, it means it was on the b_attached list, so
449+
* move it onto the b_attached list of @new_wb.
451450
*/
452451
if (!list_empty(&inode->i_io_list)) {
453452
inode->i_wb = new_wb;
454453

455454
if (inode->i_state & I_DIRTY_ALL) {
456-
struct inode *pos;
457-
458-
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
459-
if (time_after_eq(inode->dirtied_when,
460-
pos->dirtied_when))
461-
break;
455+
/*
456+
* We need to keep b_dirty list sorted by
457+
* dirtied_time_when. However properly sorting the
458+
* inode in the list gets too expensive when switching
459+
* many inodes. So just attach inode at the end of the
460+
* dirty list and clobber the dirtied_time_when.
461+
*/
462+
inode->dirtied_time_when = jiffies;
462463
inode_io_list_move_locked(inode, new_wb,
463-
pos->i_io_list.prev);
464+
&new_wb->b_dirty);
464465
} else {
465466
inode_cgwb_move_to_attached(inode, new_wb);
466467
}
@@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
486487
return switched;
487488
}
488489

489-
static void inode_switch_wbs_work_fn(struct work_struct *work)
490+
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
491+
struct inode_switch_wbs_context *isw)
490492
{
491-
struct inode_switch_wbs_context *isw =
492-
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
493493
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
494494
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
495-
struct bdi_writeback *new_wb = isw->new_wb;
496495
unsigned long nr_switched = 0;
497496
struct inode **inodep;
498497

@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
502501
*/
503502
down_read(&bdi->wb_switch_rwsem);
504503

504+
inodep = isw->inodes;
505505
/*
506506
* By the time control reaches here, RCU grace period has passed
507507
* since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
512512
* gives us exclusion against all wb related operations on @inode
513513
* including IO list manipulations and stat updates.
514514
*/
515+
relock:
515516
if (old_wb < new_wb) {
516517
spin_lock(&old_wb->list_lock);
517518
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
520521
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
521522
}
522523

523-
for (inodep = isw->inodes; *inodep; inodep++) {
524+
while (*inodep) {
524525
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
525526
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
526527
nr_switched++;
528+
inodep++;
529+
if (*inodep && need_resched()) {
530+
spin_unlock(&new_wb->list_lock);
531+
spin_unlock(&old_wb->list_lock);
532+
cond_resched();
533+
goto relock;
534+
}
527535
}
528536

529537
spin_unlock(&new_wb->list_lock);
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
543551
atomic_dec(&isw_nr_in_flight);
544552
}
545553

554+
void inode_switch_wbs_work_fn(struct work_struct *work)
555+
{
556+
struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
557+
switch_work);
558+
struct inode_switch_wbs_context *isw, *next_isw;
559+
struct llist_node *list;
560+
561+
/*
562+
* Grab our reference to wb so that it cannot get freed under us
563+
* after we process all the isw items.
564+
*/
565+
wb_get(new_wb);
566+
while (1) {
567+
list = llist_del_all(&new_wb->switch_wbs_ctxs);
568+
/* Nothing to do? */
569+
if (!list)
570+
break;
571+
/*
572+
* In addition to synchronizing among switchers, I_WB_SWITCH
573+
* tells the RCU protected stat update paths to grab the i_page
574+
* lock so that stat transfer can synchronize against them.
575+
* Let's continue after I_WB_SWITCH is guaranteed to be
576+
* visible.
577+
*/
578+
synchronize_rcu();
579+
580+
llist_for_each_entry_safe(isw, next_isw, list, list)
581+
process_inode_switch_wbs(new_wb, isw);
582+
}
583+
wb_put(new_wb);
584+
}
585+
546586
static bool inode_prepare_wbs_switch(struct inode *inode,
547587
struct bdi_writeback *new_wb)
548588
{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
572612
return true;
573613
}
574614

615+
static void wb_queue_isw(struct bdi_writeback *wb,
616+
struct inode_switch_wbs_context *isw)
617+
{
618+
if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
619+
queue_work(isw_wq, &wb->switch_work);
620+
}
621+
575622
/**
576623
* inode_switch_wbs - change the wb association of an inode
577624
* @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
585632
struct backing_dev_info *bdi = inode_to_bdi(inode);
586633
struct cgroup_subsys_state *memcg_css;
587634
struct inode_switch_wbs_context *isw;
635+
struct bdi_writeback *new_wb = NULL;
588636

589637
/* noop if seems to be already in progress */
590638
if (inode->i_state & I_WB_SWITCH)
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
609657
if (!memcg_css)
610658
goto out_free;
611659

612-
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
660+
new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
613661
css_put(memcg_css);
614-
if (!isw->new_wb)
662+
if (!new_wb)
615663
goto out_free;
616664

617-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
665+
if (!inode_prepare_wbs_switch(inode, new_wb))
618666
goto out_free;
619667

620668
isw->inodes[0] = inode;
621669

622-
/*
623-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
624-
* the RCU protected stat update paths to grab the i_page
625-
* lock so that stat transfer can synchronize against them.
626-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
627-
*/
628-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
629-
queue_rcu_work(isw_wq, &isw->work);
670+
trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
671+
wb_queue_isw(new_wb, isw);
630672
return;
631673

632674
out_free:
633675
atomic_dec(&isw_nr_in_flight);
634-
if (isw->new_wb)
635-
wb_put(isw->new_wb);
676+
if (new_wb)
677+
wb_put(new_wb);
636678
kfree(isw);
637679
}
638680

639-
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
681+
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
682+
struct inode_switch_wbs_context *isw,
640683
struct list_head *list, int *nr)
641684
{
642685
struct inode *inode;
643686

644687
list_for_each_entry(inode, list, i_io_list) {
645-
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
688+
if (!inode_prepare_wbs_switch(inode, new_wb))
646689
continue;
647690

648691
isw->inodes[*nr] = inode;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
666709
{
667710
struct cgroup_subsys_state *memcg_css;
668711
struct inode_switch_wbs_context *isw;
712+
struct bdi_writeback *new_wb;
669713
int nr;
670714
bool restart = false;
671715

@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
678722

679723
for (memcg_css = wb->memcg_css->parent; memcg_css;
680724
memcg_css = memcg_css->parent) {
681-
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
682-
if (isw->new_wb)
725+
new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
726+
if (new_wb)
683727
break;
684728
}
685-
if (unlikely(!isw->new_wb))
686-
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
729+
if (unlikely(!new_wb))
730+
new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
687731

688732
nr = 0;
689733
spin_lock(&wb->list_lock);
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
695739
* bandwidth restrictions, as writeback of inode metadata is not
696740
* accounted for.
697741
*/
698-
restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
742+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
699743
if (!restart)
700-
restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
744+
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
745+
&nr);
701746
spin_unlock(&wb->list_lock);
702747

703748
/* no attached inodes? bail out */
704749
if (nr == 0) {
705750
atomic_dec(&isw_nr_in_flight);
706-
wb_put(isw->new_wb);
751+
wb_put(new_wb);
707752
kfree(isw);
708753
return restart;
709754
}
710755

711-
/*
712-
* In addition to synchronizing among switchers, I_WB_SWITCH tells
713-
* the RCU protected stat update paths to grab the i_page
714-
* lock so that stat transfer can synchronize against them.
715-
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
716-
*/
717-
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
718-
queue_rcu_work(isw_wq, &isw->work);
756+
trace_inode_switch_wbs_queue(wb, new_wb, nr);
757+
wb_queue_isw(new_wb, isw);
719758

720759
return restart;
721760
}

include/linux/backing-dev-defs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ struct bdi_writeback {
152152
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
153153
struct list_head b_attached; /* attached inodes, protected by list_lock */
154154
struct list_head offline_node; /* anchored at offline_cgwbs */
155+
struct work_struct switch_work; /* work used to perform inode switching
156+
* to this wb */
157+
struct llist_head switch_wbs_ctxs; /* queued contexts for
158+
* writeback switching */
155159

156160
union {
157161
struct work_struct release_work;

include/linux/writeback.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
265265
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
266266
}
267267

268+
void inode_switch_wbs_work_fn(struct work_struct *work);
269+
268270
#else /* CONFIG_CGROUP_WRITEBACK */
269271

270272
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)

include/trace/events/writeback.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
213213
)
214214
);
215215

216+
TRACE_EVENT(inode_switch_wbs_queue,
217+
218+
TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
219+
unsigned int count),
220+
221+
TP_ARGS(old_wb, new_wb, count),
222+
223+
TP_STRUCT__entry(
224+
__array(char, name, 32)
225+
__field(ino_t, old_cgroup_ino)
226+
__field(ino_t, new_cgroup_ino)
227+
__field(unsigned int, count)
228+
),
229+
230+
TP_fast_assign(
231+
strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
232+
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
233+
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
234+
__entry->count = count;
235+
),
236+
237+
TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
238+
__entry->name,
239+
(unsigned long)__entry->old_cgroup_ino,
240+
(unsigned long)__entry->new_cgroup_ino,
241+
__entry->count
242+
)
243+
);
244+
216245
TRACE_EVENT(inode_switch_wbs,
217246

218247
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,

mm/backing-dev.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
633633
wb_exit(wb);
634634
bdi_put(bdi);
635635
WARN_ON_ONCE(!list_empty(&wb->b_attached));
636+
WARN_ON_ONCE(work_pending(&wb->switch_work));
636637
call_rcu(&wb->rcu, cgwb_free_rcu);
637638
}
638639

@@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
709710
wb->memcg_css = memcg_css;
710711
wb->blkcg_css = blkcg_css;
711712
INIT_LIST_HEAD(&wb->b_attached);
713+
INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
714+
init_llist_head(&wb->switch_wbs_ctxs);
712715
INIT_WORK(&wb->release_work, cgwb_release_workfn);
713716
set_bit(WB_registered, &wb->state);
714717
bdi_get(bdi);
@@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
839842
if (!ret) {
840843
bdi->wb.memcg_css = &root_mem_cgroup->css;
841844
bdi->wb.blkcg_css = blkcg_root_css;
845+
INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
846+
init_llist_head(&bdi->wb.switch_wbs_ctxs);
842847
}
843848
return ret;
844849
}

0 commit comments

Comments
 (0)