@@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
368368}
369369
370370struct inode_switch_wbs_context {
371- struct rcu_work work ;
371+ /* List of queued switching contexts for the wb */
372+ struct llist_node list ;
372373
373374 /*
374375 * Multiple inodes can be switched at once. The switching procedure
@@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
378379 * array embedded into struct inode_switch_wbs_context. Otherwise
379380 * an inode could be left in a non-consistent state.
380381 */
381- struct bdi_writeback * new_wb ;
382382 struct inode * inodes [];
383383};
384384
@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
445445 * Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
446446 * the specific list @inode was on is ignored and the @inode is put on
447447 * ->b_dirty which is always correct including from ->b_dirty_time.
448- * The transfer preserves @inode->dirtied_when ordering. If the @inode
449- * was clean, it means it was on the b_attached list, so move it onto
450- * the b_attached list of @new_wb.
448+ * If the @inode was clean, it means it was on the b_attached list, so
449+ * move it onto the b_attached list of @new_wb.
451450 */
452451 if (!list_empty (& inode -> i_io_list )) {
453452 inode -> i_wb = new_wb ;
454453
455454 if (inode -> i_state & I_DIRTY_ALL ) {
456- struct inode * pos ;
457-
458- list_for_each_entry (pos , & new_wb -> b_dirty , i_io_list )
459- if (time_after_eq (inode -> dirtied_when ,
460- pos -> dirtied_when ))
461- break ;
455+ /*
456+ * We need to keep b_dirty list sorted by
457+ * dirtied_time_when. However properly sorting the
458+ * inode in the list gets too expensive when switching
459+ * many inodes. So just attach inode at the end of the
460+ * dirty list and clobber the dirtied_time_when.
461+ */
462+ inode -> dirtied_time_when = jiffies ;
462463 inode_io_list_move_locked (inode , new_wb ,
463- pos -> i_io_list . prev );
464+ & new_wb -> b_dirty );
464465 } else {
465466 inode_cgwb_move_to_attached (inode , new_wb );
466467 }
@@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
486487 return switched ;
487488}
488489
489- static void inode_switch_wbs_work_fn (struct work_struct * work )
490+ static void process_inode_switch_wbs (struct bdi_writeback * new_wb ,
491+ struct inode_switch_wbs_context * isw )
490492{
491- struct inode_switch_wbs_context * isw =
492- container_of (to_rcu_work (work ), struct inode_switch_wbs_context , work );
493493 struct backing_dev_info * bdi = inode_to_bdi (isw -> inodes [0 ]);
494494 struct bdi_writeback * old_wb = isw -> inodes [0 ]-> i_wb ;
495- struct bdi_writeback * new_wb = isw -> new_wb ;
496495 unsigned long nr_switched = 0 ;
497496 struct inode * * inodep ;
498497
@@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
502501 */
503502 down_read (& bdi -> wb_switch_rwsem );
504503
504+ inodep = isw -> inodes ;
505505 /*
506506 * By the time control reaches here, RCU grace period has passed
507507 * since I_WB_SWITCH assertion and all wb stat update transactions
@@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
512512 * gives us exclusion against all wb related operations on @inode
513513 * including IO list manipulations and stat updates.
514514 */
515+ relock :
515516 if (old_wb < new_wb ) {
516517 spin_lock (& old_wb -> list_lock );
517518 spin_lock_nested (& new_wb -> list_lock , SINGLE_DEPTH_NESTING );
@@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
520521 spin_lock_nested (& old_wb -> list_lock , SINGLE_DEPTH_NESTING );
521522 }
522523
523- for ( inodep = isw -> inodes ; * inodep ; inodep ++ ) {
524+ while ( * inodep ) {
524525 WARN_ON_ONCE ((* inodep )-> i_wb != old_wb );
525526 if (inode_do_switch_wbs (* inodep , old_wb , new_wb ))
526527 nr_switched ++ ;
528+ inodep ++ ;
529+ if (* inodep && need_resched ()) {
530+ spin_unlock (& new_wb -> list_lock );
531+ spin_unlock (& old_wb -> list_lock );
532+ cond_resched ();
533+ goto relock ;
534+ }
527535 }
528536
529537 spin_unlock (& new_wb -> list_lock );
@@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
543551 atomic_dec (& isw_nr_in_flight );
544552}
545553
554+ void inode_switch_wbs_work_fn (struct work_struct * work )
555+ {
556+ struct bdi_writeback * new_wb = container_of (work , struct bdi_writeback ,
557+ switch_work );
558+ struct inode_switch_wbs_context * isw , * next_isw ;
559+ struct llist_node * list ;
560+
561+ /*
562+	 * Grab our reference to wb so that it cannot get freed under us
563+ * after we process all the isw items.
564+ */
565+ wb_get (new_wb );
566+ while (1 ) {
567+ list = llist_del_all (& new_wb -> switch_wbs_ctxs );
568+ /* Nothing to do? */
569+ if (!list )
570+ break ;
571+ /*
572+ * In addition to synchronizing among switchers, I_WB_SWITCH
573+ * tells the RCU protected stat update paths to grab the i_page
574+ * lock so that stat transfer can synchronize against them.
575+ * Let's continue after I_WB_SWITCH is guaranteed to be
576+ * visible.
577+ */
578+ synchronize_rcu ();
579+
580+ llist_for_each_entry_safe (isw , next_isw , list , list )
581+ process_inode_switch_wbs (new_wb , isw );
582+ }
583+ wb_put (new_wb );
584+ }
585+
546586static bool inode_prepare_wbs_switch (struct inode * inode ,
547587 struct bdi_writeback * new_wb )
548588{
@@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
572612 return true;
573613}
574614
615+ static void wb_queue_isw (struct bdi_writeback * wb ,
616+ struct inode_switch_wbs_context * isw )
617+ {
618+ if (llist_add (& isw -> list , & wb -> switch_wbs_ctxs ))
619+ queue_work (isw_wq , & wb -> switch_work );
620+ }
621+
575622/**
576623 * inode_switch_wbs - change the wb association of an inode
577624 * @inode: target inode
@@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
585632 struct backing_dev_info * bdi = inode_to_bdi (inode );
586633 struct cgroup_subsys_state * memcg_css ;
587634 struct inode_switch_wbs_context * isw ;
635+ struct bdi_writeback * new_wb = NULL ;
588636
589637 /* noop if seems to be already in progress */
590638 if (inode -> i_state & I_WB_SWITCH )
@@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
609657 if (!memcg_css )
610658 goto out_free ;
611659
612- isw -> new_wb = wb_get_create (bdi , memcg_css , GFP_ATOMIC );
660+ new_wb = wb_get_create (bdi , memcg_css , GFP_ATOMIC );
613661 css_put (memcg_css );
614- if (!isw -> new_wb )
662+ if (!new_wb )
615663 goto out_free ;
616664
617- if (!inode_prepare_wbs_switch (inode , isw -> new_wb ))
665+ if (!inode_prepare_wbs_switch (inode , new_wb ))
618666 goto out_free ;
619667
620668 isw -> inodes [0 ] = inode ;
621669
622- /*
623- * In addition to synchronizing among switchers, I_WB_SWITCH tells
624- * the RCU protected stat update paths to grab the i_page
625- * lock so that stat transfer can synchronize against them.
626- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
627- */
628- INIT_RCU_WORK (& isw -> work , inode_switch_wbs_work_fn );
629- queue_rcu_work (isw_wq , & isw -> work );
670+ trace_inode_switch_wbs_queue (inode -> i_wb , new_wb , 1 );
671+ wb_queue_isw (new_wb , isw );
630672 return ;
631673
632674out_free :
633675 atomic_dec (& isw_nr_in_flight );
634- if (isw -> new_wb )
635- wb_put (isw -> new_wb );
676+ if (new_wb )
677+ wb_put (new_wb );
636678 kfree (isw );
637679}
638680
639- static bool isw_prepare_wbs_switch (struct inode_switch_wbs_context * isw ,
681+ static bool isw_prepare_wbs_switch (struct bdi_writeback * new_wb ,
682+ struct inode_switch_wbs_context * isw ,
640683 struct list_head * list , int * nr )
641684{
642685 struct inode * inode ;
643686
644687 list_for_each_entry (inode , list , i_io_list ) {
645- if (!inode_prepare_wbs_switch (inode , isw -> new_wb ))
688+ if (!inode_prepare_wbs_switch (inode , new_wb ))
646689 continue ;
647690
648691 isw -> inodes [* nr ] = inode ;
@@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
666709{
667710 struct cgroup_subsys_state * memcg_css ;
668711 struct inode_switch_wbs_context * isw ;
712+ struct bdi_writeback * new_wb ;
669713 int nr ;
670714 bool restart = false;
671715
@@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
678722
679723 for (memcg_css = wb -> memcg_css -> parent ; memcg_css ;
680724 memcg_css = memcg_css -> parent ) {
681- isw -> new_wb = wb_get_create (wb -> bdi , memcg_css , GFP_KERNEL );
682- if (isw -> new_wb )
725+ new_wb = wb_get_create (wb -> bdi , memcg_css , GFP_KERNEL );
726+ if (new_wb )
683727 break ;
684728 }
685- if (unlikely (!isw -> new_wb ))
686- isw -> new_wb = & wb -> bdi -> wb ; /* wb_get() is noop for bdi's wb */
729+ if (unlikely (!new_wb ))
730+ new_wb = & wb -> bdi -> wb ; /* wb_get() is noop for bdi's wb */
687731
688732 nr = 0 ;
689733 spin_lock (& wb -> list_lock );
@@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
695739 * bandwidth restrictions, as writeback of inode metadata is not
696740 * accounted for.
697741 */
698- restart = isw_prepare_wbs_switch (isw , & wb -> b_attached , & nr );
742+ restart = isw_prepare_wbs_switch (new_wb , isw , & wb -> b_attached , & nr );
699743 if (!restart )
700- restart = isw_prepare_wbs_switch (isw , & wb -> b_dirty_time , & nr );
744+ restart = isw_prepare_wbs_switch (new_wb , isw , & wb -> b_dirty_time ,
745+ & nr );
701746 spin_unlock (& wb -> list_lock );
702747
703748 /* no attached inodes? bail out */
704749 if (nr == 0 ) {
705750 atomic_dec (& isw_nr_in_flight );
706- wb_put (isw -> new_wb );
751+ wb_put (new_wb );
707752 kfree (isw );
708753 return restart ;
709754 }
710755
711- /*
712- * In addition to synchronizing among switchers, I_WB_SWITCH tells
713- * the RCU protected stat update paths to grab the i_page
714- * lock so that stat transfer can synchronize against them.
715- * Let's continue after I_WB_SWITCH is guaranteed to be visible.
716- */
717- INIT_RCU_WORK (& isw -> work , inode_switch_wbs_work_fn );
718- queue_rcu_work (isw_wq , & isw -> work );
756+ trace_inode_switch_wbs_queue (wb , new_wb , nr );
757+ wb_queue_isw (new_wb , isw );
719758
720759 return restart ;
721760}