Skip to content

Commit b634aba

Browse files
dchinner
authored and committed
Merge tag 'scrub-drain-intents-6.4_2023-04-11' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into guilt/xfs-for-next
xfs: drain deferred work items when scrubbing [v24.5] The design doc for XFS online fsck contains a long discussion of the eventual consistency models in use for XFS metadata. In that chapter, we note that it is possible for scrub to collide with a chain of deferred space metadata updates, and proposes a lightweight solution: The use of a pending-intents counter so that scrub can wait for the system to drain all chains. This patchset implements that scrub drain. The first patch implements the basic mechanism, and the subsequent patches reduce the runtime overhead by converting the implementation to use sloppy counters and introducing jump labels to avoid walking into scrub hooks when it isn't running. This last paradigm repeats elsewhere in this megaseries. v23.1: make intent items take an active ref to the perag structure and document why we bump and drop the intent counts when we do Signed-off-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com>
2 parents 793f5c2 + 88accf1 commit b634aba

31 files changed

Lines changed: 680 additions & 39 deletions

fs/xfs/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,15 @@ config XFS_RT
9393

9494
If unsure, say N.
9595

96+
config XFS_DRAIN_INTENTS
97+
bool
98+
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
99+
96100
config XFS_ONLINE_SCRUB
97101
bool "XFS online metadata check support"
98102
default n
99103
depends on XFS_FS
104+
select XFS_DRAIN_INTENTS
100105
help
101106
If you say Y here you will be able to check metadata on a
102107
mounted XFS filesystem. This feature is intended to reduce

fs/xfs/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ ifeq ($(CONFIG_MEMORY_FAILURE),y)
136136
xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o
137137
endif
138138

139+
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
140+
139141
# online scrub/repair
140142
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
141143

fs/xfs/libxfs/xfs_ag.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ xfs_free_perag(
260260
spin_unlock(&mp->m_perag_lock);
261261
ASSERT(pag);
262262
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
263+
xfs_defer_drain_free(&pag->pag_intents_drain);
263264

264265
cancel_delayed_work_sync(&pag->pag_blockgc_work);
265266
xfs_buf_hash_destroy(pag);
@@ -385,6 +386,7 @@ xfs_initialize_perag(
385386
spin_lock_init(&pag->pag_state_lock);
386387
INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
387388
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
389+
xfs_defer_drain_init(&pag->pag_intents_drain);
388390
init_waitqueue_head(&pag->pagb_wait);
389391
init_waitqueue_head(&pag->pag_active_wq);
390392
pag->pagb_count = 0;
@@ -421,6 +423,7 @@ xfs_initialize_perag(
421423
return 0;
422424

423425
out_remove_pag:
426+
xfs_defer_drain_free(&pag->pag_intents_drain);
424427
radix_tree_delete(&mp->m_perag_tree, index);
425428
out_free_pag:
426429
kmem_free(pag);
@@ -431,6 +434,7 @@ xfs_initialize_perag(
431434
if (!pag)
432435
break;
433436
xfs_buf_hash_destroy(pag);
437+
xfs_defer_drain_free(&pag->pag_intents_drain);
434438
kmem_free(pag);
435439
}
436440
return error;

fs/xfs/libxfs/xfs_ag.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,14 @@ struct xfs_perag {
101101
/* background prealloc block trimming */
102102
struct delayed_work pag_blockgc_work;
103103

104+
/*
105+
* We use xfs_drain to track the number of deferred log intent items
106+
* that have been queued (but not yet processed) so that waiters (e.g.
107+
* scrub) will not lock resources when other threads are in the middle
108+
* of processing a chain of intent items only to find momentary
109+
* inconsistencies.
110+
*/
111+
struct xfs_defer_drain pag_intents_drain;
104112
#endif /* __KERNEL__ */
105113
};
106114

fs/xfs/libxfs/xfs_defer.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ xfs_defer_cancel_list(
397397
list_for_each_safe(pwi, n, &dfp->dfp_work) {
398398
list_del(pwi);
399399
dfp->dfp_count--;
400+
trace_xfs_defer_cancel_item(mp, dfp, pwi);
400401
ops->cancel_item(pwi);
401402
}
402403
ASSERT(dfp->dfp_count == 0);
@@ -476,6 +477,7 @@ xfs_defer_finish_one(
476477
list_for_each_safe(li, n, &dfp->dfp_work) {
477478
list_del(li);
478479
dfp->dfp_count--;
480+
trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
479481
error = ops->finish_item(tp, dfp->dfp_done, li, &state);
480482
if (error == -EAGAIN) {
481483
int ret;
@@ -623,7 +625,7 @@ xfs_defer_add(
623625
struct list_head *li)
624626
{
625627
struct xfs_defer_pending *dfp = NULL;
626-
const struct xfs_defer_op_type *ops;
628+
const struct xfs_defer_op_type *ops = defer_op_types[type];
627629

628630
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
629631
BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
@@ -636,7 +638,6 @@ xfs_defer_add(
636638
if (!list_empty(&tp->t_dfops)) {
637639
dfp = list_last_entry(&tp->t_dfops,
638640
struct xfs_defer_pending, dfp_list);
639-
ops = defer_op_types[dfp->dfp_type];
640641
if (dfp->dfp_type != type ||
641642
(ops->max_items && dfp->dfp_count >= ops->max_items))
642643
dfp = NULL;
@@ -653,6 +654,7 @@ xfs_defer_add(
653654
}
654655

655656
list_add_tail(li, &dfp->dfp_work);
657+
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
656658
dfp->dfp_count++;
657659
}
658660

fs/xfs/scrub/agheader.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@
1818
#include "scrub/scrub.h"
1919
#include "scrub/common.h"
2020

21+
int
22+
xchk_setup_agheader(
23+
struct xfs_scrub *sc)
24+
{
25+
if (xchk_need_intent_drain(sc))
26+
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
27+
return xchk_setup_fs(sc);
28+
}
29+
2130
/* Superblock */
2231

2332
/* Cross-reference with the other btrees. */

fs/xfs/scrub/alloc.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ int
2424
xchk_setup_ag_allocbt(
2525
struct xfs_scrub *sc)
2626
{
27+
if (xchk_need_intent_drain(sc))
28+
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
29+
2730
return xchk_setup_ag_btree(sc, false);
2831
}
2932

fs/xfs/scrub/bmap.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ xchk_setup_inode_bmap(
3131
{
3232
int error;
3333

34+
if (xchk_need_intent_drain(sc))
35+
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
36+
3437
error = xchk_get_inode(sc);
3538
if (error)
3639
goto out;

fs/xfs/scrub/btree.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ __xchk_btree_process_error(
3636

3737
switch (*error) {
3838
case -EDEADLOCK:
39+
case -ECHRNG:
3940
/* Used to restart an op with deadlock avoidance. */
4041
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
4142
break;

fs/xfs/scrub/common.c

Lines changed: 126 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ __xchk_process_error(
7575
case 0:
7676
return true;
7777
case -EDEADLOCK:
78+
case -ECHRNG:
7879
/* Used to restart an op with deadlock avoidance. */
7980
trace_xchk_deadlock_retry(
8081
sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
@@ -130,6 +131,7 @@ __xchk_fblock_process_error(
130131
case 0:
131132
return true;
132133
case -EDEADLOCK:
134+
case -ECHRNG:
133135
/* Used to restart an op with deadlock avoidance. */
134136
trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
135137
break;
@@ -396,26 +398,19 @@ want_ag_read_header_failure(
396398
}
397399

398400
/*
399-
* Grab the perag structure and all the headers for an AG.
401+
* Grab the AG header buffers for the attached perag structure.
400402
*
401403
* The headers should be released by xchk_ag_free, but as a fail safe we attach
402404
* all the buffers we grab to the scrub transaction so they'll all be freed
403-
* when we cancel it. Returns ENOENT if we can't grab the perag structure.
405+
* when we cancel it.
404406
*/
405-
int
406-
xchk_ag_read_headers(
407+
static inline int
408+
xchk_perag_read_headers(
407409
struct xfs_scrub *sc,
408-
xfs_agnumber_t agno,
409410
struct xchk_ag *sa)
410411
{
411-
struct xfs_mount *mp = sc->mp;
412412
int error;
413413

414-
ASSERT(!sa->pag);
415-
sa->pag = xfs_perag_get(mp, agno);
416-
if (!sa->pag)
417-
return -ENOENT;
418-
419414
error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
420415
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
421416
return error;
@@ -427,6 +422,104 @@ xchk_ag_read_headers(
427422
return 0;
428423
}
429424

425+
/*
426+
* Grab the AG headers for the attached perag structure and wait for pending
427+
* intents to drain.
428+
*/
429+
static int
430+
xchk_perag_drain_and_lock(
431+
struct xfs_scrub *sc)
432+
{
433+
struct xchk_ag *sa = &sc->sa;
434+
int error = 0;
435+
436+
ASSERT(sa->pag != NULL);
437+
ASSERT(sa->agi_bp == NULL);
438+
ASSERT(sa->agf_bp == NULL);
439+
440+
do {
441+
if (xchk_should_terminate(sc, &error))
442+
return error;
443+
444+
error = xchk_perag_read_headers(sc, sa);
445+
if (error)
446+
return error;
447+
448+
/*
449+
* If we've grabbed an inode for scrubbing then we assume that
450+
* holding its ILOCK will suffice to coordinate with any intent
451+
* chains involving this inode.
452+
*/
453+
if (sc->ip)
454+
return 0;
455+
456+
/*
457+
* Decide if this AG is quiet enough for all metadata to be
458+
* consistent with each other. XFS allows the AG header buffer
459+
* locks to cycle across transaction rolls while processing
460+
* chains of deferred ops, which means that there could be
461+
* other threads in the middle of processing a chain of
462+
* deferred ops. For regular operations we are careful about
463+
* ordering operations to prevent collisions between threads
464+
* (which is why we don't need a per-AG lock), but scrub and
465+
* repair have to serialize against chained operations.
466+
*
467+
* We just locked all the AG headers buffers; now take a look
468+
* to see if there are any intents in progress. If there are,
469+
* drop the AG headers and wait for the intents to drain.
470+
* Since we hold all the AG header locks for the duration of
471+
* the scrub, this is the only time we have to sample the
472+
* intents counter; any threads increasing it after this point
473+
* can't possibly be in the middle of a chain of AG metadata
474+
* updates.
475+
*
476+
* Obviously, this should be slanted against scrub and in favor
477+
* of runtime threads.
478+
*/
479+
if (!xfs_perag_intent_busy(sa->pag))
480+
return 0;
481+
482+
if (sa->agf_bp) {
483+
xfs_trans_brelse(sc->tp, sa->agf_bp);
484+
sa->agf_bp = NULL;
485+
}
486+
487+
if (sa->agi_bp) {
488+
xfs_trans_brelse(sc->tp, sa->agi_bp);
489+
sa->agi_bp = NULL;
490+
}
491+
492+
if (!(sc->flags & XCHK_FSGATES_DRAIN))
493+
return -ECHRNG;
494+
error = xfs_perag_intent_drain(sa->pag);
495+
if (error == -ERESTARTSYS)
496+
error = -EINTR;
497+
} while (!error);
498+
499+
return error;
500+
}
501+
502+
/*
503+
* Grab the per-AG structure, grab all AG header buffers, and wait until there
504+
* aren't any pending intents. Returns -ENOENT if we can't grab the perag
505+
* structure.
506+
*/
507+
int
508+
xchk_ag_read_headers(
509+
struct xfs_scrub *sc,
510+
xfs_agnumber_t agno,
511+
struct xchk_ag *sa)
512+
{
513+
struct xfs_mount *mp = sc->mp;
514+
515+
ASSERT(!sa->pag);
516+
sa->pag = xfs_perag_get(mp, agno);
517+
if (!sa->pag)
518+
return -ENOENT;
519+
520+
return xchk_perag_drain_and_lock(sc);
521+
}
522+
430523
/* Release all the AG btree cursors. */
431524
void
432525
xchk_ag_btcur_free(
@@ -916,3 +1009,25 @@ xchk_start_reaping(
9161009
}
9171010
sc->flags &= ~XCHK_REAPING_DISABLED;
9181011
}
1012+
1013+
/*
1014+
* Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1015+
* operation. Callers must not hold any locks that intersect with the CPU
1016+
* hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1017+
* to change kernel code.
1018+
*/
1019+
void
1020+
xchk_fsgates_enable(
1021+
struct xfs_scrub *sc,
1022+
unsigned int scrub_fsgates)
1023+
{
1024+
ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1025+
ASSERT(!(sc->flags & scrub_fsgates));
1026+
1027+
trace_xchk_fsgates_enable(sc, scrub_fsgates);
1028+
1029+
if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1030+
xfs_drain_wait_enable();
1031+
1032+
sc->flags |= scrub_fsgates;
1033+
}

0 commit comments

Comments (0)