Skip to content

Commit b291ad4

Browse files
jinbaohong authored and kdave committed
btrfs: fix transaction commit blocking during trim of unallocated space
When trimming unallocated space, btrfs_trim_fs() holds the device_list_mutex for the entire duration while iterating through all devices. On large filesystems with significant unallocated space, this operation can take minutes to hours on large storage systems.

This causes a problem because btrfs_run_dev_stats(), which is called during transaction commit, also requires device_list_mutex:

  btrfs_trim_fs()
    mutex_lock(&fs_devices->device_list_mutex)
    list_for_each_entry(device, ...)
      btrfs_trim_free_extents(device)
    mutex_unlock(&fs_devices->device_list_mutex)

  commit_transaction()
    btrfs_run_dev_stats()
      mutex_lock(&fs_devices->device_list_mutex)  // blocked!
      ...

While trim is running, all transaction commits are blocked waiting for the mutex.

Fix this by refactoring btrfs_trim_free_extents() to process devices in bounded chunks (up to 2GB per iteration) and release device_list_mutex between chunks.

Signed-off-by: robbieko <robbieko@synology.com>
Signed-off-by: jinbaohong <jinbaohong@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent bfb670b commit b291ad4

2 files changed

Lines changed: 140 additions & 22 deletions

File tree

fs/btrfs/extent-tree.c

Lines changed: 134 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6513,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
65136513
* it while performing the free space search since we have already
65146514
* held back allocations.
65156515
*/
6516-
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
6516+
static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
6517+
u64 *trimmed, u64 pos, u64 *ret_next_pos)
65176518
{
6518-
u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
65196519
int ret;
6520+
u64 start = pos;
6521+
u64 trim_len = 0;
65206522

65216523
*trimmed = 0;
65226524

@@ -6536,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
65366538

65376539
while (1) {
65386540
struct btrfs_fs_info *fs_info = device->fs_info;
6541+
u64 cur_start;
6542+
u64 end;
6543+
u64 len;
65396544
u64 bytes;
65406545

65416546
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
65426547
if (ret)
65436548
break;
65446549

6550+
cur_start = start;
65456551
btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
65466552
&start, &end,
65476553
CHUNK_TRIMMED | CHUNK_ALLOCATED);
6554+
start = max(start, cur_start);
65486555

65496556
/* Check if there are any CHUNK_* bits left */
65506557
if (start > device->total_bytes) {
@@ -6570,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
65706577
end = min(end, device->total_bytes - 1);
65716578

65726579
len = end - start + 1;
6580+
len = min(len, BTRFS_MAX_TRIM_LENGTH);
65736581

65746582
/* We didn't find any extents */
65756583
if (!len) {
@@ -6590,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
65906598

65916599
start += len;
65926600
*trimmed += bytes;
6601+
trim_len += len;
6602+
if (trim_len >= BTRFS_MAX_TRIM_LENGTH) {
6603+
*ret_next_pos = start;
6604+
ret = -EAGAIN;
6605+
break;
6606+
}
65936607

65946608
if (btrfs_trim_interrupted()) {
65956609
ret = -ERESTARTSYS;
@@ -6602,6 +6616,122 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
66026616
return ret;
66036617
}
66046618

6619+
static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed,
6620+
u64 *dev_failed, int *dev_ret)
6621+
{
6622+
struct btrfs_device *dev;
6623+
struct btrfs_device *working_dev = NULL;
6624+
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6625+
u8 uuid[BTRFS_UUID_SIZE];
6626+
u64 start = BTRFS_DEVICE_RANGE_RESERVED;
6627+
6628+
*trimmed = 0;
6629+
*dev_failed = 0;
6630+
*dev_ret = 0;
6631+
6632+
/* Find the device with the smallest UUID to start. */
6633+
mutex_lock(&fs_devices->device_list_mutex);
6634+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
6635+
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
6636+
continue;
6637+
if (!working_dev ||
6638+
memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
6639+
working_dev = dev;
6640+
}
6641+
if (working_dev)
6642+
memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
6643+
mutex_unlock(&fs_devices->device_list_mutex);
6644+
6645+
if (!working_dev)
6646+
return 0;
6647+
6648+
while (1) {
6649+
u64 group_trimmed = 0;
6650+
u64 next_pos = 0;
6651+
int ret = 0;
6652+
6653+
mutex_lock(&fs_devices->device_list_mutex);
6654+
6655+
/* Find and trim the current device. */
6656+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
6657+
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
6658+
continue;
6659+
if (dev == working_dev) {
6660+
ret = btrfs_trim_free_extents_throttle(working_dev,
6661+
&group_trimmed, start, &next_pos);
6662+
break;
6663+
}
6664+
}
6665+
6666+
/* Throttle: continue the same device from the new position. */
6667+
if (ret == -EAGAIN && next_pos > start) {
6668+
mutex_unlock(&fs_devices->device_list_mutex);
6669+
*trimmed += group_trimmed;
6670+
start = next_pos;
6671+
cond_resched();
6672+
continue;
6673+
}
6674+
6675+
/* User interrupted. */
6676+
if (ret == -ERESTARTSYS || ret == -EINTR) {
6677+
mutex_unlock(&fs_devices->device_list_mutex);
6678+
*trimmed += group_trimmed;
6679+
return ret;
6680+
}
6681+
6682+
/*
6683+
* Device completed (ret == 0), failed, or EAGAIN with no progress.
6684+
* Record error if any, then move to next device.
6685+
*/
6686+
if (ret == -EAGAIN) {
6687+
/* No progress - log and skip device. */
6688+
btrfs_warn(fs_info,
6689+
"trim throttle: no progress, offset=%llu device %s, skipping",
6690+
start, btrfs_dev_name(working_dev));
6691+
(*dev_failed)++;
6692+
if (!*dev_ret)
6693+
*dev_ret = ret;
6694+
} else if (ret) {
6695+
/* Device failed with error. */
6696+
(*dev_failed)++;
6697+
if (!*dev_ret)
6698+
*dev_ret = ret;
6699+
}
6700+
6701+
/*
6702+
* Find next device: smallest UUID larger than current.
6703+
* Devices added during trim with smaller UUID will be skipped.
6704+
*/
6705+
working_dev = NULL;
6706+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
6707+
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
6708+
continue;
6709+
/* Must larger than current UUID. */
6710+
if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0)
6711+
continue;
6712+
/* Find the smallest. */
6713+
if (!working_dev ||
6714+
memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
6715+
working_dev = dev;
6716+
}
6717+
if (working_dev)
6718+
memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
6719+
6720+
mutex_unlock(&fs_devices->device_list_mutex);
6721+
6722+
*trimmed += group_trimmed;
6723+
start = BTRFS_DEVICE_RANGE_RESERVED;
6724+
6725+
/* No more devices. */
6726+
if (!working_dev)
6727+
break;
6728+
6729+
cond_resched();
6730+
}
6731+
6732+
return 0;
6733+
}
6734+
66056735
/*
66066736
* Trim the whole filesystem by:
66076737
* 1) trimming the free space in each block group
@@ -6613,9 +6743,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
66136743
*/
66146744
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
66156745
{
6616-
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
66176746
struct btrfs_block_group *cache = NULL;
6618-
struct btrfs_device *device;
66196747
u64 group_trimmed;
66206748
u64 range_end = U64_MAX;
66216749
u64 start;
@@ -6686,24 +6814,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
66866814
if (ret == -ERESTARTSYS || ret == -EINTR)
66876815
return ret;
66886816

6689-
mutex_lock(&fs_devices->device_list_mutex);
6690-
list_for_each_entry(device, &fs_devices->devices, dev_list) {
6691-
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6692-
continue;
6693-
6694-
ret = btrfs_trim_free_extents(device, &group_trimmed);
6695-
6696-
trimmed += group_trimmed;
6697-
if (ret == -ERESTARTSYS || ret == -EINTR)
6698-
break;
6699-
if (ret) {
6700-
dev_failed++;
6701-
if (!dev_ret)
6702-
dev_ret = ret;
6703-
continue;
6704-
}
6705-
}
6706-
mutex_unlock(&fs_devices->device_list_mutex);
6817+
ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret);
6818+
trimmed += group_trimmed;
67076819

67086820
if (dev_failed)
67096821
btrfs_warn(fs_info,

fs/btrfs/fs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ struct btrfs_space_info;
6565

6666
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
6767

68+
/*
69+
* Maximum length to trim in a single iteration to avoid holding device list
70+
* mutex for too long.
71+
*/
72+
#define BTRFS_MAX_TRIM_LENGTH SZ_2G
73+
6874
#define BTRFS_OLDEST_GENERATION 0ULL
6975

7076
#define BTRFS_EMPTY_DIR_SIZE 0

0 commit comments

Comments
 (0)