Skip to content

Commit 2229276

Browse files
Darrick J. Wongdchinner
authored andcommitted
xfs: use a separate frextents counter for rt extent reservations
As mentioned in the previous commit, the kernel misuses sb_frextents in the incore mount to reflect both incore reservations made by running transactions as well as the actual count of free rt extents on disk. This results in the superblock being written to the log with an underestimate of the number of rt extents that are marked free in the rtbitmap. Teaching XFS to recompute frextents after log recovery avoids operational problems in the current mount, but it doesn't solve the problem of us writing undercounted frextents which are then recovered by an older kernel that doesn't have that fix. Create an incore percpu counter to mirror the ondisk frextents. This new counter will track transaction reservations and the only time we will touch the incore super counter (i.e the one that gets logged) is when those transactions commit updates to the rt bitmap. This is in contrast to the lazysbcount counters (e.g. fdblocks), where we know that log recovery will always fix any incorrect counter that we log. As a bonus, we only take m_sb_lock at transaction commit time. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
1 parent 5a605fd commit 2229276

8 files changed

Lines changed: 99 additions & 47 deletions

File tree

fs/xfs/libxfs/xfs_sb.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -911,6 +911,11 @@ xfs_log_sb(
911911
* reservations that have been taken out percpu counters. If we have an
912912
* unclean shutdown, this will be corrected by log recovery rebuilding
913913
* the counters from the AGF block counts.
914+
*
915+
* Do not update sb_frextents here because it is not part of the lazy
916+
* sb counters, despite having a percpu counter. It is always kept
917+
* consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
918+
* and hence we don't need have to update it here.
914919
*/
915920
if (xfs_has_lazysbcount(mp)) {
916921
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);

fs/xfs/xfs_fsops.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -349,10 +349,7 @@ xfs_fs_counts(
349349
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
350350
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
351351
xfs_fdblocks_unavailable(mp);
352-
353-
spin_lock(&mp->m_sb_lock);
354-
cnt->freertx = mp->m_sb.sb_frextents;
355-
spin_unlock(&mp->m_sb_lock);
352+
cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
356353
}
357354

358355
/*

fs/xfs/xfs_icache.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
19161916
struct xfs_inode *ip)
19171917
{
19181918
struct xfs_mount *mp = ip->i_mount;
1919-
uint64_t freertx;
19201919

19211920
if (!XFS_IS_REALTIME_INODE(ip))
19221921
return false;
19231922

1924-
freertx = READ_ONCE(mp->m_sb.sb_frextents);
1925-
return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
1923+
if (__percpu_counter_compare(&mp->m_frextents,
1924+
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
1925+
XFS_FDBLOCKS_BATCH) < 0)
1926+
return true;
1927+
1928+
return false;
19261929
}
19271930
#else
19281931
# define xfs_inodegc_want_queue_rt_file(ip) (false)

fs/xfs/xfs_mount.c

Lines changed: 21 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,24 +1110,33 @@ xfs_fs_writable(
11101110
return true;
11111111
}
11121112

1113+
/* Adjust m_fdblocks or m_frextents. */
11131114
int
1114-
xfs_mod_fdblocks(
1115+
xfs_mod_freecounter(
11151116
struct xfs_mount *mp,
1117+
struct percpu_counter *counter,
11161118
int64_t delta,
11171119
bool rsvd)
11181120
{
11191121
int64_t lcounter;
11201122
long long res_used;
1123+
uint64_t set_aside = 0;
11211124
s32 batch;
1122-
uint64_t set_aside;
1125+
bool has_resv_pool;
1126+
1127+
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
1128+
has_resv_pool = (counter == &mp->m_fdblocks);
1129+
if (rsvd)
1130+
ASSERT(has_resv_pool);
11231131

11241132
if (delta > 0) {
11251133
/*
11261134
* If the reserve pool is depleted, put blocks back into it
11271135
* first. Most of the time the pool is full.
11281136
*/
1129-
if (likely(mp->m_resblks == mp->m_resblks_avail)) {
1130-
percpu_counter_add(&mp->m_fdblocks, delta);
1137+
if (likely(!has_resv_pool ||
1138+
mp->m_resblks == mp->m_resblks_avail)) {
1139+
percpu_counter_add(counter, delta);
11311140
return 0;
11321141
}
11331142

@@ -1139,7 +1148,7 @@ xfs_mod_fdblocks(
11391148
} else {
11401149
delta -= res_used;
11411150
mp->m_resblks_avail = mp->m_resblks;
1142-
percpu_counter_add(&mp->m_fdblocks, delta);
1151+
percpu_counter_add(counter, delta);
11431152
}
11441153
spin_unlock(&mp->m_sb_lock);
11451154
return 0;
@@ -1153,7 +1162,7 @@ xfs_mod_fdblocks(
11531162
* then make everything serialise as we are real close to
11541163
* ENOSPC.
11551164
*/
1156-
if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
1165+
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
11571166
XFS_FDBLOCKS_BATCH) < 0)
11581167
batch = 1;
11591168
else
@@ -1170,9 +1179,10 @@ xfs_mod_fdblocks(
11701179
* problems (i.e. transaction abort, pagecache discards, etc.) than
11711180
* slightly premature -ENOSPC.
11721181
*/
1173-
set_aside = xfs_fdblocks_unavailable(mp);
1174-
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
1175-
if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
1182+
if (has_resv_pool)
1183+
set_aside = xfs_fdblocks_unavailable(mp);
1184+
percpu_counter_add_batch(counter, delta, batch);
1185+
if (__percpu_counter_compare(counter, set_aside,
11761186
XFS_FDBLOCKS_BATCH) >= 0) {
11771187
/* we had space! */
11781188
return 0;
@@ -1183,8 +1193,8 @@ xfs_mod_fdblocks(
11831193
* that took us to ENOSPC.
11841194
*/
11851195
spin_lock(&mp->m_sb_lock);
1186-
percpu_counter_add(&mp->m_fdblocks, -delta);
1187-
if (!rsvd)
1196+
percpu_counter_add(counter, -delta);
1197+
if (!has_resv_pool || !rsvd)
11881198
goto fdblocks_enospc;
11891199

11901200
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1201,24 +1211,6 @@ xfs_mod_fdblocks(
12011211
return -ENOSPC;
12021212
}
12031213

1204-
int
1205-
xfs_mod_frextents(
1206-
struct xfs_mount *mp,
1207-
int64_t delta)
1208-
{
1209-
int64_t lcounter;
1210-
int ret = 0;
1211-
1212-
spin_lock(&mp->m_sb_lock);
1213-
lcounter = mp->m_sb.sb_frextents + delta;
1214-
if (lcounter < 0)
1215-
ret = -ENOSPC;
1216-
else
1217-
mp->m_sb.sb_frextents = lcounter;
1218-
spin_unlock(&mp->m_sb_lock);
1219-
return ret;
1220-
}
1221-
12221214
/*
12231215
* Used to free the superblock along various error paths.
12241216
*/

fs/xfs/xfs_mount.h

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
183183
struct percpu_counter m_icount; /* allocated inodes counter */
184184
struct percpu_counter m_ifree; /* free inodes counter */
185185
struct percpu_counter m_fdblocks; /* free block counter */
186+
struct percpu_counter m_frextents; /* free rt extent counter */
187+
186188
/*
187189
* Count of data device blocks reserved for delayed allocations,
188190
* including indlen blocks. Does not include allocated CoW staging
@@ -494,9 +496,20 @@ xfs_fdblocks_unavailable(
494496
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
495497
}
496498

497-
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
498-
bool reserved);
499-
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
499+
int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
500+
int64_t delta, bool rsvd);
501+
502+
static inline int
503+
xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
504+
{
505+
return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
506+
}
507+
508+
static inline int
509+
xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
510+
{
511+
return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
512+
}
500513

501514
extern int xfs_readsb(xfs_mount_t *, int);
502515
extern void xfs_freesb(xfs_mount_t *);

fs/xfs/xfs_rtalloc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,6 +1318,7 @@ xfs_rtalloc_reinit_frextents(
13181318
spin_lock(&mp->m_sb_lock);
13191319
mp->m_sb.sb_frextents = val;
13201320
spin_unlock(&mp->m_sb_lock);
1321+
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
13211322
return 0;
13221323
}
13231324

fs/xfs/xfs_super.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -843,9 +843,11 @@ xfs_fs_statfs(
843843

844844
if (XFS_IS_REALTIME_MOUNT(mp) &&
845845
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
846+
s64 freertx;
847+
846848
statp->f_blocks = sbp->sb_rblocks;
847-
statp->f_bavail = statp->f_bfree =
848-
sbp->sb_frextents * sbp->sb_rextsize;
849+
freertx = percpu_counter_sum_positive(&mp->m_frextents);
850+
statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
849851
}
850852

851853
return 0;
@@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
10151017
if (error)
10161018
goto free_fdblocks;
10171019

1020+
error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
1021+
if (error)
1022+
goto free_delalloc;
1023+
10181024
return 0;
10191025

1026+
free_delalloc:
1027+
percpu_counter_destroy(&mp->m_delalloc_blks);
10201028
free_fdblocks:
10211029
percpu_counter_destroy(&mp->m_fdblocks);
10221030
free_ifree:
@@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
10331041
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
10341042
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
10351043
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
1044+
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
10361045
}
10371046

10381047
static void
@@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
10451054
ASSERT(xfs_is_shutdown(mp) ||
10461055
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
10471056
percpu_counter_destroy(&mp->m_delalloc_blks);
1057+
percpu_counter_destroy(&mp->m_frextents);
10481058
}
10491059

10501060
static int

fs/xfs/xfs_trans.c

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -498,10 +498,31 @@ xfs_trans_apply_sb_deltas(
498498
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
499499
}
500500

501-
if (tp->t_frextents_delta)
502-
be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
503-
if (tp->t_res_frextents_delta)
504-
be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
501+
/*
502+
* Updating frextents requires careful handling because it does not
503+
* behave like the lazysb counters because we cannot rely on log
504+
* recovery in older kenels to recompute the value from the rtbitmap.
505+
* This means that the ondisk frextents must be consistent with the
506+
* rtbitmap.
507+
*
508+
* Therefore, log the frextents change to the ondisk superblock and
509+
* update the incore superblock so that future calls to xfs_log_sb
510+
* write the correct value ondisk.
511+
*
512+
* Don't touch m_frextents because it includes incore reservations,
513+
* and those are handled by the unreserve function.
514+
*/
515+
if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
516+
struct xfs_mount *mp = tp->t_mountp;
517+
int64_t rtxdelta;
518+
519+
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
520+
521+
spin_lock(&mp->m_sb_lock);
522+
be64_add_cpu(&sbp->sb_frextents, rtxdelta);
523+
mp->m_sb.sb_frextents += rtxdelta;
524+
spin_unlock(&mp->m_sb_lock);
525+
}
505526

506527
if (tp->t_dblocks_delta) {
507528
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,15 +635,25 @@ xfs_trans_unreserve_and_mod_sb(
614635
if (ifreedelta)
615636
percpu_counter_add(&mp->m_ifree, ifreedelta);
616637

617-
if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
638+
if (rtxdelta) {
639+
error = xfs_mod_frextents(mp, rtxdelta);
640+
ASSERT(!error);
641+
}
642+
643+
if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
618644
return;
619645

620646
/* apply remaining deltas */
621647
spin_lock(&mp->m_sb_lock);
622648
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
623649
mp->m_sb.sb_icount += idelta;
624650
mp->m_sb.sb_ifree += ifreedelta;
625-
mp->m_sb.sb_frextents += rtxdelta;
651+
/*
652+
* Do not touch sb_frextents here because we are dealing with incore
653+
* reservation. sb_frextents is not part of the lazy sb counters so it
654+
* must be consistent with the ondisk rtbitmap and must never include
655+
* incore reservations.
656+
*/
626657
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
627658
mp->m_sb.sb_agcount += tp->t_agcount_delta;
628659
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;

0 commit comments

Comments
 (0)