Skip to content

Commit 1f3a3e2

Browse files
committed
Merge tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba: "A few regular fixes and one fix for space reservation regression since 6.7 that users have been reporting: - fix over-reservation of metadata chunks due to not keeping proper balance between global block reserve and delayed refs reserve; in practice this leaves behind empty metadata block groups, the workaround is to reclaim them by using the '-musage=1' balance filter - other space reservation fixes: - do not delete unused block group if it may be used soon - do not reserve space for checksums for NOCOW files - fix extent map assertion failure when writing out free space inode - reject encoded write if inode has nodatasum flag set - fix chunk map leak when loading block group zone info" * tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: don't refill whole delayed refs block reserve when starting transaction btrfs: zoned: fix chunk map leak when loading block group zone info btrfs: reject encoded write if inode has nodatasum flag set btrfs: don't reserve space for checksums when writing to nocow files btrfs: add new unused block groups to the list of unused block groups btrfs: do not delete unused block group if it may be used soon btrfs: add and use helper to check if block group is used btrfs: don't drop extent_map for free space inode on write error
2 parents 91f842f + 2f6397e commit 1f3a3e2

6 files changed

Lines changed: 131 additions & 50 deletions

File tree

fs/btrfs/block-group.c

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1455,6 +1455,7 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
14551455
*/
14561456
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
14571457
{
1458+
LIST_HEAD(retry_list);
14581459
struct btrfs_block_group *block_group;
14591460
struct btrfs_space_info *space_info;
14601461
struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
14761477

14771478
spin_lock(&fs_info->unused_bgs_lock);
14781479
while (!list_empty(&fs_info->unused_bgs)) {
1480+
u64 used;
14791481
int trimming;
14801482

14811483
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
15111513
goto next;
15121514
}
15131515

1516+
spin_lock(&space_info->lock);
15141517
spin_lock(&block_group->lock);
1515-
if (block_group->reserved || block_group->pinned ||
1516-
block_group->used || block_group->ro ||
1518+
if (btrfs_is_block_group_used(block_group) || block_group->ro ||
15171519
list_is_singular(&block_group->list)) {
15181520
/*
15191521
* We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
15231525
*/
15241526
trace_btrfs_skip_unused_block_group(block_group);
15251527
spin_unlock(&block_group->lock);
1528+
spin_unlock(&space_info->lock);
15261529
up_write(&space_info->groups_sem);
15271530
goto next;
15281531
}
1532+
1533+
/*
1534+
* The block group may be unused but there may be space reserved
1535+
* accounting with the existence of that block group, that is,
1536+
* space_info->bytes_may_use was incremented by a task but no
1537+
* space was yet allocated from the block group by the task.
1538+
* That space may or may not be allocated, as we are generally
1539+
* pessimistic about space reservation for metadata as well as
1540+
* for data when using compression (as we reserve space based on
1541+
* the worst case, when data can't be compressed, and before
1542+
* actually attempting compression, before starting writeback).
1543+
*
1544+
* So check if the total space of the space_info minus the size
1545+
* of this block group is less than the used space of the
1546+
* space_info - if that's the case, then it means we have tasks
1547+
* that might be relying on the block group in order to allocate
1548+
* extents, and add back the block group to the unused list when
1549+
* we finish, so that we retry later in case no tasks ended up
1550+
* needing to allocate extents from the block group.
1551+
*/
1552+
used = btrfs_space_info_used(space_info, true);
1553+
if (space_info->total_bytes - block_group->length < used) {
1554+
/*
1555+
* Add a reference for the list, compensate for the ref
1556+
* drop under the "next" label for the
1557+
* fs_info->unused_bgs list.
1558+
*/
1559+
btrfs_get_block_group(block_group);
1560+
list_add_tail(&block_group->bg_list, &retry_list);
1561+
1562+
trace_btrfs_skip_unused_block_group(block_group);
1563+
spin_unlock(&block_group->lock);
1564+
spin_unlock(&space_info->lock);
1565+
up_write(&space_info->groups_sem);
1566+
goto next;
1567+
}
1568+
15291569
spin_unlock(&block_group->lock);
1570+
spin_unlock(&space_info->lock);
15301571

15311572
/* We don't want to force the issue, only flip if it's ok. */
15321573
ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
16501691
btrfs_put_block_group(block_group);
16511692
spin_lock(&fs_info->unused_bgs_lock);
16521693
}
1694+
list_splice_tail(&retry_list, &fs_info->unused_bgs);
16531695
spin_unlock(&fs_info->unused_bgs_lock);
16541696
mutex_unlock(&fs_info->reclaim_bgs_lock);
16551697
return;
16561698

16571699
flip_async:
16581700
btrfs_end_transaction(trans);
1701+
spin_lock(&fs_info->unused_bgs_lock);
1702+
list_splice_tail(&retry_list, &fs_info->unused_bgs);
1703+
spin_unlock(&fs_info->unused_bgs_lock);
16591704
mutex_unlock(&fs_info->reclaim_bgs_lock);
16601705
btrfs_put_block_group(block_group);
16611706
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
26842729
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
26852730
list_del_init(&block_group->bg_list);
26862731
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
2732+
2733+
/*
2734+
* If the block group is still unused, add it to the list of
2735+
* unused block groups. The block group may have been created in
2736+
* order to satisfy a space reservation, in which case the
2737+
* extent allocation only happens later. But often we don't
2738+
* actually need to allocate space that we previously reserved,
2739+
* so the block group may become unused for a long time. For
2740+
* example for metadata we generally reserve space for a worst
2741+
* possible scenario, but then don't end up allocating all that
2742+
* space or none at all (due to no need to COW, extent buffers
2743+
* were already COWed in the current transaction and still
2744+
* unwritten, tree heights lower than the maximum possible
2745+
* height, etc). For data we generally reserve the exact amount
2746+
* of space we are going to allocate later, the exception is
2747+
* when using compression, as we must reserve space based on the
2748+
* uncompressed data size, because the compression is only done
2749+
* when writeback is triggered and we don't know how much space we
2750+
* are actually going to need, so we reserve the uncompressed
2751+
* size because the data may be uncompressible in the worst case.
2752+
*/
2753+
if (ret == 0) {
2754+
bool used;
2755+
2756+
spin_lock(&block_group->lock);
2757+
used = btrfs_is_block_group_used(block_group);
2758+
spin_unlock(&block_group->lock);
2759+
2760+
if (!used)
2761+
btrfs_mark_bg_unused(block_group);
2762+
}
26872763
}
26882764
btrfs_trans_release_chunk_metadata(trans);
26892765
}

fs/btrfs/block-group.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
257257
return (block_group->start + block_group->length);
258258
}
259259

260+
static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
261+
{
262+
lockdep_assert_held(&bg->lock);
263+
264+
return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
265+
}
266+
260267
static inline bool btrfs_is_block_group_data_only(
261268
struct btrfs_block_group *block_group)
262269
{

fs/btrfs/delalloc-space.c

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
245245
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
246246
u64 reserve_size = 0;
247247
u64 qgroup_rsv_size = 0;
248-
u64 csum_leaves;
249248
unsigned outstanding_extents;
250249

251250
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
260259
outstanding_extents);
261260
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
262261
}
263-
csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
264-
inode->csum_bytes);
265-
reserve_size += btrfs_calc_insert_metadata_size(fs_info,
266-
csum_leaves);
262+
if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
263+
u64 csum_leaves;
264+
265+
csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
266+
reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
267+
}
267268
/*
268269
* For qgroup rsv, the calculation is very simple:
269270
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
278279
spin_unlock(&block_rsv->lock);
279280
}
280281

281-
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
282+
static void calc_inode_reservations(struct btrfs_inode *inode,
282283
u64 num_bytes, u64 disk_num_bytes,
283284
u64 *meta_reserve, u64 *qgroup_reserve)
284285
{
286+
struct btrfs_fs_info *fs_info = inode->root->fs_info;
285287
u64 nr_extents = count_max_extents(fs_info, num_bytes);
286-
u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
288+
u64 csum_leaves;
287289
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
288290

291+
if (inode->flags & BTRFS_INODE_NODATASUM)
292+
csum_leaves = 0;
293+
else
294+
csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
295+
289296
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
290297
nr_extents + csum_leaves);
291298

@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
337344
* everything out and try again, which is bad. This way we just
338345
* over-reserve slightly, and clean up the mess when we are done.
339346
*/
340-
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
347+
calc_inode_reservations(inode, num_bytes, disk_num_bytes,
341348
&meta_reserve, &qgroup_reserve);
342349
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
343350
noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
359366
nr_extents = count_max_extents(fs_info, num_bytes);
360367
spin_lock(&inode->lock);
361368
btrfs_mod_outstanding_extents(inode, nr_extents);
362-
inode->csum_bytes += disk_num_bytes;
369+
if (!(inode->flags & BTRFS_INODE_NODATASUM))
370+
inode->csum_bytes += disk_num_bytes;
363371
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
364372
spin_unlock(&inode->lock);
365373

@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
393401

394402
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
395403
spin_lock(&inode->lock);
396-
inode->csum_bytes -= num_bytes;
404+
if (!(inode->flags & BTRFS_INODE_NODATASUM))
405+
inode->csum_bytes -= num_bytes;
397406
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
398407
spin_unlock(&inode->lock);
399408

fs/btrfs/inode.c

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3184,8 +3184,23 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
31843184
unwritten_start += logical_len;
31853185
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
31863186

3187-
/* Drop extent maps for the part of the extent we didn't write. */
3188-
btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
3187+
/*
3188+
* Drop extent maps for the part of the extent we didn't write.
3189+
*
3190+
* We have an exception here for the free_space_inode, this is
3191+
* because when we do btrfs_get_extent() on the free space inode
3192+
* we will search the commit root. If this is a new block group
3193+
* we won't find anything, and we will trip over the assert in
3194+
* writepage where we do ASSERT(em->block_start !=
3195+
* EXTENT_MAP_HOLE).
3196+
*
3197+
* Theoretically we could also skip this for any NOCOW extent as
3198+
* we don't mess with the extent map tree in the NOCOW case, but
3199+
* for now simply skip this if we are the free space inode.
3200+
*/
3201+
if (!btrfs_is_free_space_inode(inode))
3202+
btrfs_drop_extent_map_range(inode, unwritten_start,
3203+
end, false);
31893204

31903205
/*
31913206
* If the ordered extent had an IOERR or something else went
@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1027310288
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
1027410289
return -EINVAL;
1027510290

10291+
/*
10292+
* Compressed extents should always have checksums, so error out if we
10293+
* have a NOCOW file or inode was created while mounted with NODATASUM.
10294+
*/
10295+
if (inode->flags & BTRFS_INODE_NODATASUM)
10296+
return -EINVAL;
10297+
1027610298
orig_count = iov_iter_count(from);
1027710299

1027810300
/* The extent size must be sane. */

fs/btrfs/transaction.c

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
564564
u64 num_bytes,
565565
u64 *delayed_refs_bytes)
566566
{
567-
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
568567
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
569-
u64 extra_delayed_refs_bytes = 0;
570-
u64 bytes;
568+
u64 bytes = num_bytes + *delayed_refs_bytes;
571569
int ret;
572570

573-
/*
574-
* If there's a gap between the size of the delayed refs reserve and
575-
* its reserved space, than some tasks have added delayed refs or bumped
576-
* its size otherwise (due to block group creation or removal, or block
577-
* group item update). Also try to allocate that gap in order to prevent
578-
* using (and possibly abusing) the global reserve when committing the
579-
* transaction.
580-
*/
581-
if (flush == BTRFS_RESERVE_FLUSH_ALL &&
582-
!btrfs_block_rsv_full(delayed_refs_rsv)) {
583-
spin_lock(&delayed_refs_rsv->lock);
584-
if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
585-
extra_delayed_refs_bytes = delayed_refs_rsv->size -
586-
delayed_refs_rsv->reserved;
587-
spin_unlock(&delayed_refs_rsv->lock);
588-
}
589-
590-
bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
591-
592571
/*
593572
* We want to reserve all the bytes we may need all at once, so we only
594573
* do 1 enospc flushing cycle per transaction start.
595574
*/
596575
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
597-
if (ret == 0) {
598-
if (extra_delayed_refs_bytes > 0)
599-
btrfs_migrate_to_delayed_refs_rsv(fs_info,
600-
extra_delayed_refs_bytes);
601-
return 0;
602-
}
603-
604-
if (extra_delayed_refs_bytes > 0) {
605-
bytes -= extra_delayed_refs_bytes;
606-
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
607-
if (ret == 0)
608-
return 0;
609-
}
610576

611577
/*
612578
* If we are an emergency flush, which can steal from the global block
613579
* reserve, then attempt to not reserve space for the delayed refs, as
614580
* we will consume space for them from the global block reserve.
615581
*/
616-
if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
582+
if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
617583
bytes -= *delayed_refs_bytes;
618584
*delayed_refs_bytes = 0;
619585
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);

fs/btrfs/zoned.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1670,6 +1670,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
16701670
}
16711671
bitmap_free(active);
16721672
kfree(zone_info);
1673+
btrfs_free_chunk_map(map);
16731674

16741675
return ret;
16751676
}

0 commit comments

Comments
 (0)