Skip to content

Commit d4e175f

Browse files
committed
Merge tag 'vfs-6.7.super' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs
Pull vfs superblock updates from Christian Brauner: "This contains the work to make block device opening functions return a struct bdev_handle instead of just a struct block_device. The same struct bdev_handle is then also passed to block device closing functions. This allows us to propagate context from opening to closing a block device without having to modify all users every time. Sidenote, in the future we might even want to try and have block device opening functions return a struct file directly but that's a series on top of this. These are further preparatory changes to be able to count writable opens and blocking writes to mounted block devices. That's a separate piece of work for next cycle and for that we absolutely need the changes to btrfs that have been quietly dropped somehow. Originally the series contained a patch that removed the old blkdev_*() helpers. But since this would've caused needless churn in -next for bcachefs we ended up delaying it. The second piece of work addresses one of the major annoyances about the work last cycle, namely that we required dropping s_umount whenever we used the superblock and fs_holder_ops for a block device. The reason for that requirement had been that in some codepaths s_umount could've been taken under disk->open_mutex (that's always been the case, at least theoretically). For example, on surprise block device removal or media change. And opening and closing block devices required grabbing disk->open_mutex as well. So we did the work and went through the block layer and fixed all those places so that s_umount is never taken under disk->open_mutex. This means no more brittle games where we yield and reacquire s_umount during block device opening and closing and no more requirements where block devices need to be closed. Filesystems don't need to care about this. 
There's a bunch of other follow-up work such as moving block device freezing and thawing to holder operations which makes it work for all block devices and not just the main block device just as we did for surprise removal. But that is for next cycle. Tested with fstests for all major fses, blktests, LTP" * tag 'vfs-6.7.super' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs: (37 commits) porting: update locking requirements fs: assert that open_mutex isn't held over holder ops block: assert that we're not holding open_mutex over blk_report_disk_dead block: move bdev_mark_dead out of disk_check_media_change block: WARN_ON_ONCE() when we remove active partitions block: simplify bdev_del_partition() fs: Avoid grabbing sb->s_umount under bdev->bd_holder_lock jfs: fix log->bdev_handle null ptr deref in lbmStartIO bcache: Fixup error handling in register_cache() xfs: Convert to bdev_open_by_path() reiserfs: Convert to bdev_open_by_dev/path() ocfs2: Convert to use bdev_open_by_dev() nfs/blocklayout: Convert to use bdev_open_by_dev/path() jfs: Convert to bdev_open_by_dev() f2fs: Convert to bdev_open_by_dev/path() ext4: Convert to bdev_open_by_dev() erofs: Convert to use bdev_open_by_path() btrfs: Convert to bdev_open_by_path() fs: Convert to bdev_open_by_dev() mm/swap: Convert to use bdev_open_by_dev() ...
2 parents ffc2532 + 5aa9130 commit d4e175f

71 files changed

Lines changed: 854 additions & 684 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Documentation/filesystems/porting.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,3 +1045,10 @@ filesystem type is now moved to a later point when the devices are closed:
10451045
As this is a VFS level change it has no practical consequences for filesystems
10461046
other than that all of them must use one of the provided kill_litter_super(),
10471047
kill_anon_super(), or kill_block_super() helpers.
1048+
1049+
---
1050+
1051+
**mandatory**
1052+
1053+
Lock ordering has been changed so that s_umount ranks above open_mutex again.
1054+
All places where s_umount was taken under open_mutex have been fixed up.

block/bdev.c

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,28 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
829829
}
830830
EXPORT_SYMBOL(blkdev_get_by_dev);
831831

832+
struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
833+
const struct blk_holder_ops *hops)
834+
{
835+
struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
836+
struct block_device *bdev;
837+
838+
if (!handle)
839+
return ERR_PTR(-ENOMEM);
840+
bdev = blkdev_get_by_dev(dev, mode, holder, hops);
841+
if (IS_ERR(bdev)) {
842+
kfree(handle);
843+
return ERR_CAST(bdev);
844+
}
845+
handle->bdev = bdev;
846+
handle->holder = holder;
847+
if (holder)
848+
mode |= BLK_OPEN_EXCL;
849+
handle->mode = mode;
850+
return handle;
851+
}
852+
EXPORT_SYMBOL(bdev_open_by_dev);
853+
832854
/**
833855
* blkdev_get_by_path - open a block device by name
834856
* @path: path to the block device to open
@@ -867,6 +889,28 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
867889
}
868890
EXPORT_SYMBOL(blkdev_get_by_path);
869891

892+
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
893+
void *holder, const struct blk_holder_ops *hops)
894+
{
895+
struct bdev_handle *handle;
896+
dev_t dev;
897+
int error;
898+
899+
error = lookup_bdev(path, &dev);
900+
if (error)
901+
return ERR_PTR(error);
902+
903+
handle = bdev_open_by_dev(dev, mode, holder, hops);
904+
if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
905+
bdev_read_only(handle->bdev)) {
906+
bdev_release(handle);
907+
return ERR_PTR(-EACCES);
908+
}
909+
910+
return handle;
911+
}
912+
EXPORT_SYMBOL(bdev_open_by_path);
913+
870914
void blkdev_put(struct block_device *bdev, void *holder)
871915
{
872916
struct gendisk *disk = bdev->bd_disk;
@@ -903,6 +947,13 @@ void blkdev_put(struct block_device *bdev, void *holder)
903947
}
904948
EXPORT_SYMBOL(blkdev_put);
905949

950+
void bdev_release(struct bdev_handle *handle)
951+
{
952+
blkdev_put(handle->bdev, handle->holder);
953+
kfree(handle);
954+
}
955+
EXPORT_SYMBOL(bdev_release);
956+
906957
/**
907958
* lookup_bdev() - Look up a struct block_device by name.
908959
* @pathname: Name of the block device in the filesystem.
@@ -961,20 +1012,20 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
9611012
mutex_lock(&bdev->bd_holder_lock);
9621013
if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
9631014
bdev->bd_holder_ops->mark_dead(bdev, surprise);
964-
else
1015+
else {
1016+
mutex_unlock(&bdev->bd_holder_lock);
9651017
sync_blockdev(bdev);
966-
mutex_unlock(&bdev->bd_holder_lock);
1018+
}
9671019

9681020
invalidate_bdev(bdev);
9691021
}
970-
#ifdef CONFIG_DASD_MODULE
9711022
/*
972-
* Drivers should not use this directly, but the DASD driver has historically
973-
* had a shutdown to offline mode that doesn't actually remove the gendisk
974-
* that otherwise looks a lot like a safe device removal.
1023+
* New drivers should not use this directly. There are some drivers however
1024+
* that need this for historical reasons. For example, the DASD driver has
1025+
* historically had a shutdown to offline mode that doesn't actually remove the
1026+
* gendisk that otherwise looks a lot like a safe device removal.
9751027
*/
9761028
EXPORT_SYMBOL_GPL(bdev_mark_dead);
977-
#endif
9781029

9791030
void sync_bdevs(bool wait)
9801031
{

block/disk-events.c

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -266,24 +266,20 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
266266
* disk_check_media_change - check if a removable media has been changed
267267
* @disk: gendisk to check
268268
*
269-
* Check whether a removable media has been changed, and attempt to free all
270-
* dentries and inodes and invalidates all block device page cache entries in
271-
* that case.
272-
*
273-
* Returns %true if the media has changed, or %false if not.
269+
* Returns %true and marks the disk for a partition rescan if a removable
270+
* media has been changed, and %false if the media did not change.
274271
*/
275272
bool disk_check_media_change(struct gendisk *disk)
276273
{
277274
unsigned int events;
278275

279276
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
280277
DISK_EVENT_EJECT_REQUEST);
281-
if (!(events & DISK_EVENT_MEDIA_CHANGE))
282-
return false;
283-
284-
bdev_mark_dead(disk->part0, true);
285-
set_bit(GD_NEED_PART_SCAN, &disk->state);
286-
return true;
278+
if (events & DISK_EVENT_MEDIA_CHANGE) {
279+
set_bit(GD_NEED_PART_SCAN, &disk->state);
280+
return true;
281+
}
282+
return false;
287283
}
288284
EXPORT_SYMBOL(disk_check_media_change);
289285

block/fops.c

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -542,15 +542,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
542542
return error;
543543
}
544544

545+
/**
546+
* file_to_blk_mode - get block open flags from file flags
547+
* @file: file whose open flags should be converted
548+
*
549+
* Look at file open flags and generate corresponding block open flags from
550+
* them. The function works both for file just being open (e.g. during ->open
551+
* callback) and for file that is already open. This is actually non-trivial
552+
* (see comment in the function).
553+
*/
545554
blk_mode_t file_to_blk_mode(struct file *file)
546555
{
547556
blk_mode_t mode = 0;
557+
struct bdev_handle *handle = file->private_data;
548558

549559
if (file->f_mode & FMODE_READ)
550560
mode |= BLK_OPEN_READ;
551561
if (file->f_mode & FMODE_WRITE)
552562
mode |= BLK_OPEN_WRITE;
553-
if (file->private_data)
563+
/*
564+
* do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
565+
* determine whether the open was exclusive for already open files.
566+
*/
567+
if (handle)
568+
mode |= handle->mode & BLK_OPEN_EXCL;
569+
else if (file->f_flags & O_EXCL)
554570
mode |= BLK_OPEN_EXCL;
555571
if (file->f_flags & O_NDELAY)
556572
mode |= BLK_OPEN_NDELAY;
@@ -568,7 +584,8 @@ blk_mode_t file_to_blk_mode(struct file *file)
568584

569585
static int blkdev_open(struct inode *inode, struct file *filp)
570586
{
571-
struct block_device *bdev;
587+
struct bdev_handle *handle;
588+
blk_mode_t mode;
572589

573590
/*
574591
* Preserve backwards compatibility and allow large file access
@@ -579,29 +596,24 @@ static int blkdev_open(struct inode *inode, struct file *filp)
579596
filp->f_flags |= O_LARGEFILE;
580597
filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
581598

582-
/*
583-
* Use the file private data to store the holder for exclusive openes.
584-
* file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL.
585-
*/
586-
if (filp->f_flags & O_EXCL)
587-
filp->private_data = filp;
588-
589-
bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp),
590-
filp->private_data, NULL);
591-
if (IS_ERR(bdev))
592-
return PTR_ERR(bdev);
599+
mode = file_to_blk_mode(filp);
600+
handle = bdev_open_by_dev(inode->i_rdev, mode,
601+
mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
602+
if (IS_ERR(handle))
603+
return PTR_ERR(handle);
593604

594-
if (bdev_nowait(bdev))
605+
if (bdev_nowait(handle->bdev))
595606
filp->f_mode |= FMODE_NOWAIT;
596607

597-
filp->f_mapping = bdev->bd_inode->i_mapping;
608+
filp->f_mapping = handle->bdev->bd_inode->i_mapping;
598609
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
610+
filp->private_data = handle;
599611
return 0;
600612
}
601613

602614
static int blkdev_release(struct inode *inode, struct file *filp)
603615
{
604-
blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data);
616+
bdev_release(filp->private_data);
605617
return 0;
606618
}
607619

block/genhd.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent);
342342

343343
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
344344
{
345-
struct block_device *bdev;
345+
struct bdev_handle *handle;
346346
int ret = 0;
347347

348348
if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
@@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
366366
}
367367

368368
set_bit(GD_NEED_PART_SCAN, &disk->state);
369-
bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
370-
NULL);
371-
if (IS_ERR(bdev))
372-
ret = PTR_ERR(bdev);
369+
handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
370+
NULL);
371+
if (IS_ERR(handle))
372+
ret = PTR_ERR(handle);
373373
else
374-
blkdev_put(bdev, NULL);
374+
bdev_release(handle);
375375

376376
/*
377377
* If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
@@ -559,6 +559,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
559559
struct block_device *bdev;
560560
unsigned long idx;
561561

562+
/*
563+
* On surprise disk removal, bdev_mark_dead() may call into file
564+
* systems below. Make it clear that we're expecting to not hold
565+
* disk->open_mutex.
566+
*/
567+
lockdep_assert_not_held(&disk->open_mutex);
568+
562569
rcu_read_lock();
563570
xa_for_each(&disk->part_tbl, idx, bdev) {
564571
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))

block/ioctl.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -370,9 +370,10 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
370370
mutex_lock(&bdev->bd_holder_lock);
371371
if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
372372
bdev->bd_holder_ops->sync(bdev);
373-
else
373+
else {
374+
mutex_unlock(&bdev->bd_holder_lock);
374375
sync_blockdev(bdev);
375-
mutex_unlock(&bdev->bd_holder_lock);
376+
}
376377

377378
invalidate_bdev(bdev);
378379
return 0;
@@ -467,6 +468,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
467468
int __user *argp)
468469
{
469470
int ret, n;
471+
struct bdev_handle *handle;
470472

471473
if (!capable(CAP_SYS_ADMIN))
472474
return -EACCES;
@@ -478,10 +480,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
478480
if (mode & BLK_OPEN_EXCL)
479481
return set_blocksize(bdev, n);
480482

481-
if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL)))
483+
handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
484+
if (IS_ERR(handle))
482485
return -EBUSY;
483486
ret = set_blocksize(bdev, n);
484-
blkdev_put(bdev, &bdev);
487+
bdev_release(handle);
485488

486489
return ret;
487490
}

block/partitions/core.c

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -274,17 +274,6 @@ void drop_partition(struct block_device *part)
274274
put_device(&part->bd_device);
275275
}
276276

277-
static void delete_partition(struct block_device *part)
278-
{
279-
/*
280-
* Remove the block device from the inode hash, so that it cannot be
281-
* looked up any more even when openers still hold references.
282-
*/
283-
remove_inode_hash(part->bd_inode);
284-
bdev_mark_dead(part, false);
285-
drop_partition(part);
286-
}
287-
288277
static ssize_t whole_disk_show(struct device *dev,
289278
struct device_attribute *attr, char *buf)
290279
{
@@ -485,7 +474,18 @@ int bdev_del_partition(struct gendisk *disk, int partno)
485474
if (atomic_read(&part->bd_openers))
486475
goto out_unlock;
487476

488-
delete_partition(part);
477+
/*
478+
* We verified that @part->bd_openers is zero above and so
479+
* @part->bd_holder{_ops} can't be set. And since we hold
480+
* @disk->open_mutex the device can't be claimed by anyone.
481+
*
482+
* So no need to call @part->bd_holder_ops->mark_dead() here.
483+
* Just delete the partition and invalidate it.
484+
*/
485+
486+
remove_inode_hash(part->bd_inode);
487+
invalidate_bdev(part);
488+
drop_partition(part);
489489
ret = 0;
490490
out_unlock:
491491
mutex_unlock(&disk->open_mutex);
@@ -663,8 +663,23 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
663663
sync_blockdev(disk->part0);
664664
invalidate_bdev(disk->part0);
665665

666-
xa_for_each_start(&disk->part_tbl, idx, part, 1)
667-
delete_partition(part);
666+
xa_for_each_start(&disk->part_tbl, idx, part, 1) {
667+
/*
668+
* Remove the block device from the inode hash, so that
669+
* it cannot be looked up any more even when openers
670+
* still hold references.
671+
*/
672+
remove_inode_hash(part->bd_inode);
673+
674+
/*
675+
* If @disk->open_partitions isn't elevated but there's
676+
* still an active holder of that block device things
677+
* are broken.
678+
*/
679+
WARN_ON_ONCE(atomic_read(&part->bd_openers));
680+
invalidate_bdev(part);
681+
drop_partition(part);
682+
}
668683
clear_bit(GD_NEED_PART_SCAN, &disk->state);
669684

670685
/*

drivers/block/ataflop.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1760,8 +1760,10 @@ static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode,
17601760
/* invalidate the buffer track to force a reread */
17611761
BufferDrive = -1;
17621762
set_bit(drive, &fake_change);
1763-
if (disk_check_media_change(disk))
1763+
if (disk_check_media_change(disk)) {
1764+
bdev_mark_dead(disk->part0, true);
17641765
floppy_revalidate(disk);
1766+
}
17651767
return 0;
17661768
default:
17671769
return -EINVAL;

drivers/block/drbd/drbd_int.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,9 @@ struct drbd_md {
524524

525525
struct drbd_backing_dev {
526526
struct block_device *backing_bdev;
527+
struct bdev_handle *backing_bdev_handle;
527528
struct block_device *md_bdev;
529+
struct bdev_handle *md_bdev_handle;
528530
struct drbd_md md;
529531
struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
530532
sector_t known_size; /* last known size of that backing device */

0 commit comments

Comments
 (0)