Hi Jens, this series fixes up a possible race with the block_device lookup changes, and the finishes off the conversion to stop using the inode refcount for block devices. Changes since v1: - clean up btrfs even more by storing a bdev instead of the disk - keep a persistent disk reference in the bdev - a bunch of cleanups to make the above change easier Diffstat: block/genhd.c | 12 +----- block/partitions/core.c | 34 ++++++++++--------- drivers/block/loop.c | 5 -- fs/block_dev.c | 83 ++++++++++++++---------------------------------- fs/btrfs/inode.c | 2 - fs/btrfs/ordered-data.c | 2 - fs/btrfs/ordered-data.h | 3 - fs/btrfs/zoned.c | 12 ++---- include/linux/blkdev.h | 2 - 9 files changed, 52 insertions(+), 103 deletions(-)
blkdev_get_no_open acquires a reference to the block_device through the block device inode and then tries to acquire a device model reference to the gendisk. But at this point the disk migh already be freed (although the race is free). Fix this by only freeing the gendisk from the whole device bdevs ->free_inode callback as well. Fixes: 22ae8ce8b892 ("block: simplify bdev/disk lookup in blkdev_get") Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> --- block/genhd.c | 3 +-- fs/block_dev.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index af4d2ab4a633..298ee78c1bda 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1079,10 +1079,9 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - bdput(disk->part0); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - kfree(disk); + bdput(disk->part0); /* frees the disk */ } struct class block_class = { .name = "block", diff --git a/fs/block_dev.c b/fs/block_dev.c index 0c424a0cadaa..9ef4f1fc2cb0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -812,6 +812,8 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); + if (!bdev_is_partition(bdev)) + kfree(bdev->bd_disk); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } -- 2.30.2
Add a lockdep assert instead of the outdated locking comment. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: Ming Lei <ming.lei@redhat.com> --- block/partitions/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 4230d4f71879..9902b1635b7d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,12 +281,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -/* - * Must be called either with open_mutex held, before a disk can be opened or - * after all disk users are gone. - */ static void delete_partition(struct block_device *part) { + lockdep_assert_held(&part->bd_disk->open_mutex); + fsync_bdev(part); __invalidate_device(part, true); -- 2.30.2
Unhash the whole device inode early in del_gendisk. This allows to remove the first GENHD_FL_UP check in the open path as we simply won't find a just removed inode. The second non-racy check after taking open_mutex is still kept. Signed-off-by: Christoph Hellwig <hch@lst.de> --- block/genhd.c | 7 +------ fs/block_dev.c | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 298ee78c1bda..716f5ca479ad 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,7 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); mutex_lock(&disk->open_mutex); + remove_inode_hash(disk->part0->bd_inode); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); @@ -592,12 +593,6 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(disk->part0->bd_inode); - set_capacity(disk, 0); if (!(disk->flags & GENHD_FL_HIDDEN)) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..932f4034ad66 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1340,7 +1340,7 @@ struct block_device *blkdev_get_no_open(dev_t dev) disk = bdev->bd_disk; if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + if (disk->flags & GENHD_FL_HIDDEN) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; -- 2.30.2
Move the allocation of bd_meta_info after initializing the struct device to avoid the special bdput error handling path. Signed-off-by: Christoph Hellwig <hch@lst.de> --- block/partitions/core.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 9902b1635b7d..09c58a110a89 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -356,13 +356,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - if (info) { - err = -ENOMEM; - bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); - if (!bdev->bd_meta_info) - goto out_bdput; - } - pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) @@ -386,6 +379,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -415,9 +415,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; -out_bdput: - bdput(bdev); - return ERR_PTR(err); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); -- 2.30.2
Instead of acquiring an inode reference on open make sure partitions always hold device model references to the disk while alive, and switch open to grab only a device model reference to the opened block device. If that is a partition the disk reference is transitively held by the partition already. --- block/partitions/core.c | 9 ++++++- fs/block_dev.c | 60 ++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 38 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 09c58a110a89..4f7a1a9cd544 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -261,6 +261,7 @@ static void part_release(struct device *dev) { if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); + put_disk(dev_to_bdev(dev)->bd_disk); bdput(dev_to_bdev(dev)); } @@ -349,9 +350,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); + + err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) - return ERR_PTR(-ENOMEM); + goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); @@ -420,6 +425,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); return ERR_PTR(err); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 932f4034ad66..4a6c8c0a3bc9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,16 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -1282,16 +1272,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1306,9 +1294,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1307,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if (disk->flags & GENHD_FL_HIDDEN) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** -- 2.30.2
Store the block device instead of the gendisk in the btrfs_ordered_extent structure intead of acquiring a reference to it later. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 2 -- fs/btrfs/ordered-data.h | 3 +-- fs/btrfs/zoned.c | 12 ++++-------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f60314c36c5..0117d867ecf8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2992,7 +2992,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (ordered_extent->disk) + if (ordered_extent->bdev) btrfs_rewrite_logical_zoned(ordered_extent); btrfs_free_io_failure_record(inode, start, end); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6eb41b7c0c84..5c0f8481e25e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -190,8 +190,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; entry->physical = (u64)-1; - entry->disk = NULL; - entry->partno = (u8)-1; ASSERT(type == BTRFS_ORDERED_REGULAR || type == BTRFS_ORDERED_NOCOW || diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 566472004edd..b2d88aba8420 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -145,8 +145,7 @@ struct btrfs_ordered_extent { * command in a workqueue context */ u64 physical; - struct gendisk *disk; - u8 partno; + struct block_device *bdev; }; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 297c0b1c0634..907c2cc45c9c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1349,8 +1349,7 @@ void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, return; ordered->physical = physical; - ordered->disk = bio->bi_bdev->bd_disk; - ordered->partno = bio->bi_bdev->bd_partno; + ordered->bdev = bio->bi_bdev; btrfs_put_ordered_extent(ordered); } @@ -1362,18 +1361,16 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) struct extent_map_tree *em_tree; struct extent_map *em; struct btrfs_ordered_sum *sum; - struct block_device *bdev; u64 orig_logical = ordered->disk_bytenr; u64 *logical = NULL; int nr, stripe_len; /* Zoned devices should not have partitions. So, we can assume it is 0 */ - ASSERT(ordered->partno == 0); - bdev = bdgrab(ordered->disk->part0); - if (WARN_ON(!bdev)) + ASSERT(!bdev_is_partition(ordered->bdev)); + if (WARN_ON(!ordered->bdev)) return; - if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev, + if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, ordered->physical, &logical, &nr, &stripe_len))) goto out; @@ -1402,7 +1399,6 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) out: kfree(logical); - bdput(bdev); } bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, -- 2.30.2
The whole device block device won't be removed while the disk is still alive, so don't bother to grab a reference to it. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Josef Bacik <josef@toxicpanda.com> --- drivers/block/loop.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f37b9e3d833c..62c5120cf744 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1249,10 +1249,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); mutex_unlock(&lo->lo_mutex); if (partscan) loop_reread_partitions(lo); @@ -1332,7 +1328,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); if (bdev) { - bdput(bdev); invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; } -- 2.30.2
All callers are gone, and no one should grab a pure inode reference to a block device anymore. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Josef Bacik <josef@toxicpanda.com> --- fs/block_dev.c | 15 --------------- include/linux/blkdev.h | 1 - 2 files changed, 16 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a6c8c0a3bc9..4f2c4e9e84f5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,21 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. - */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3177181c4326..98772da38bb1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); -- 2.30.2
Now that we've stopped using inode references for anything meaninful in the block layer get rid of the helper to put it and just open code the call to iput on the block_device inode. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Josef Bacik <josef@toxicpanda.com> --- block/genhd.c | 4 ++-- block/partitions/core.c | 2 +- fs/block_dev.c | 6 ------ include/linux/blkdev.h | 1 - 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 716f5ca479ad..5dbb99b57b33 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1076,7 +1076,7 @@ static void disk_release(struct device *dev) xa_destroy(&disk->part_tbl); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - bdput(disk->part0); /* frees the disk */ + iput(disk->part0->bd_inode); /* frees the disk */ } struct class block_class = { .name = "block", @@ -1261,7 +1261,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); - bdput(disk->part0); + iput(disk->part0->bd_inode); out_free_disk: kfree(disk); return NULL; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4f7a1a9cd544..2415bffc2771 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -262,7 +262,7 @@ static void part_release(struct device *dev) if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); - bdput(dev_to_bdev(dev)); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4f2c4e9e84f5..6658f40ae492 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -934,12 +934,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98772da38bb1..b94de1d194b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); -- 2.30.2
On Thu, Jul 22, 2021 at 09:53:56AM +0200, Christoph Hellwig wrote:
> Unhash the whole device inode early in del_gendisk. This allows to
> remove the first GENHD_FL_UP check in the open path as we simply
> won't find a just removed inode. The second non-racy check after
> taking open_mutex is still kept.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> block/genhd.c | 7 +------
> fs/block_dev.c | 2 +-
> 2 files changed, 2 insertions(+), 7 deletions(-)
>
> diff --git a/block/genhd.c b/block/genhd.c
> index 298ee78c1bda..716f5ca479ad 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -585,6 +585,7 @@ void del_gendisk(struct gendisk *disk)
> disk_del_events(disk);
>
> mutex_lock(&disk->open_mutex);
> + remove_inode_hash(disk->part0->bd_inode);
> disk->flags &= ~GENHD_FL_UP;
> blk_drop_partitions(disk);
> mutex_unlock(&disk->open_mutex);
> @@ -592,12 +593,6 @@ void del_gendisk(struct gendisk *disk)
> fsync_bdev(disk->part0);
> __invalidate_device(disk->part0, true);
>
> - /*
> - * Unhash the bdev inode for this device so that it can't be looked
> - * up any more even if openers still hold references to it.
> - */
> - remove_inode_hash(disk->part0->bd_inode);
> -
> set_capacity(disk, 0);
>
> if (!(disk->flags & GENHD_FL_HIDDEN)) {
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 9ef4f1fc2cb0..932f4034ad66 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1340,7 +1340,7 @@ struct block_device *blkdev_get_no_open(dev_t dev)
> disk = bdev->bd_disk;
> if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
> goto bdput;
> - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
> + if (disk->flags & GENHD_FL_HIDDEN)
But del_gendisk() can be called just between bdget() and checking GENHD_FL_UP.
And not see difference by moving remove_inode_hash() with disk open_mutex held.
Thanks,
Ming
On Thu, Jul 22, 2021 at 09:53:57AM +0200, Christoph Hellwig wrote:
> Move the allocation of bd_meta_info after initializing the struct device
> to avoid the special bdput error handling path.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
--
Ming
On Thu, Jul 22, 2021 at 09:53:58AM +0200, Christoph Hellwig wrote:
> Instead of acquiring an inode reference on open make sure partitions
> always hold device model references to the disk while alive, and switch
> open to grab only a device model reference to the opened block device.
> If that is a partition the disk reference is transitively held by the
> partition already.
> ---
> block/partitions/core.c | 9 ++++++-
> fs/block_dev.c | 60 ++++++++++++++++-------------------------
> 2 files changed, 31 insertions(+), 38 deletions(-)
>
> diff --git a/block/partitions/core.c b/block/partitions/core.c
> index 09c58a110a89..4f7a1a9cd544 100644
> --- a/block/partitions/core.c
> +++ b/block/partitions/core.c
> @@ -261,6 +261,7 @@ static void part_release(struct device *dev)
> {
> if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
> blk_free_ext_minor(MINOR(dev->devt));
> + put_disk(dev_to_bdev(dev)->bd_disk);
> bdput(dev_to_bdev(dev));
> }
>
> @@ -349,9 +350,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
> if (xa_load(&disk->part_tbl, partno))
> return ERR_PTR(-EBUSY);
>
> + /* ensure we always have a reference to the whole disk */
> + get_device(disk_to_dev(disk));
> +
> + err = -ENOMEM;
> bdev = bdev_alloc(disk, partno);
> if (!bdev)
> - return ERR_PTR(-ENOMEM);
> + goto out_put_disk;
>
> bdev->bd_start_sect = start;
> bdev_set_nr_sectors(bdev, len);
> @@ -420,6 +425,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
> device_del(pdev);
> out_put:
> put_device(pdev);
> +out_put_disk:
> + put_disk(disk);
put_disk() is only needed for failure of bdev_alloc(). Once bdev->bd_device
is initialized, the disk reference will be dropped via part_release().
Thanks,
Ming
On Thu, Jul 22, 2021 at 09:54:00AM +0200, Christoph Hellwig wrote:
> The whole device block device won't be removed while the disk is still
> alive, so don't bother to grab a reference to it.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Ming Lei <ming.lei@rehat.com>
--
Ming
On Thu, Jul 22, 2021 at 09:53:59AM +0200, Christoph Hellwig wrote: > Store the block device instead of the gendisk in the btrfs_ordered_extent > structure intead of acquiring a reference to it later. instead > > Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: David Sterba <dsterba@suse.com> I can add the patch to the next pull request so you can rebase your series on top of it and don't need to carry it until the next merge window.
On Thu, Jul 22, 2021 at 02:58:59PM +0200, David Sterba wrote:
> On Thu, Jul 22, 2021 at 09:53:59AM +0200, Christoph Hellwig wrote:
> > Store the block device instead of the gendisk in the btrfs_ordered_extent
> > structure intead of acquiring a reference to it later.
> instead
>
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> Reviewed-by: David Sterba <dsterba@suse.com>
>
> I can add the patch to the next pull request so you can rebase your
> series on top of it and don't need to carry it until the next merge
> window.
Assuming Jens is fine with starting the block tree on -rc3 that would
be great.
On Thu, Jul 22, 2021 at 04:19:42PM +0800, Ming Lei wrote:
> > goto bdput;
> > - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
> > + if (disk->flags & GENHD_FL_HIDDEN)
>
> But del_gendisk() can be called just between bdget() and checking GENHD_FL_UP.
>
> And not see difference by moving remove_inode_hash() with disk open_mutex held.
The difference is not about having the open_mutex held, but about doing
it earlier.
The only check that matters is the GENHD_FL_UP check in blkdev_get_by_dev.
The earlier check just reduces the amount of work we're doing for a disk
already being delete. With the early unhash there is no need for that
check as we won't even find the inode for a disk in del_gendisk. We still
need the non-racy check under the lock, but the patch doesn't touch that
one.
Maybe I need to split this into two patches and improve the commit log.
On Thu, Jul 22, 2021 at 04:41:28PM +0800, Ming Lei wrote:
> > device_del(pdev);
> > out_put:
> > put_device(pdev);
> > +out_put_disk:
> > + put_disk(disk);
>
> put_disk() is only needed for failure of bdev_alloc(). Once bdev->bd_device
> is initialized, the disk reference will be dropped via part_release().
Indeed.
On 7/22/2021 12:53 AM, Christoph Hellwig wrote:
> Add a lockdep assert instead of the outdated locking comment.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Josef Bacik <josef@toxicpanda.com>
> Reviewed-by: Ming Lei <ming.lei@redhat.com>
> ---
> block/partitions/core.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/block/partitions/core.c b/block/partitions/core.c
> index 4230d4f71879..9902b1635b7d 100644
> --- a/block/partitions/core.c
> +++ b/block/partitions/core.c
> @@ -281,12 +281,10 @@ struct device_type part_type = {
> .uevent = part_uevent,
> };
>
> -/*
> - * Must be called either with open_mutex held, before a disk can be opened or
> - * after all disk users are gone.
> - */
> static void delete_partition(struct block_device *part)
> {
> + lockdep_assert_held(&part->bd_disk->open_mutex);
> +
> fsync_bdev(part);
> __invalidate_device(part, true);
>
>
Reviewed-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
--
-ck
On 7/22/2021 12:54 AM, Christoph Hellwig wrote:
> The whole device block device won't be removed while the disk is still
> alive, so don't bother to grab a reference to it.
>
> Signed-off-by: Christoph Hellwig<hch@lst.de>
> Reviewed-by: Josef Bacik<josef@toxicpanda.com>
Reviewed-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
--
-ck
On 7/22/2021 12:54 AM, Christoph Hellwig wrote:
> Now that we've stopped using inode references for anything meaninful
> in the block layer get rid of the helper to put it and just open code
> the call to iput on the block_device inode.
>
> Signed-off-by: Christoph Hellwig<hch@lst.de>
> Reviewed-by: Josef Bacik<josef@toxicpanda.com>
Reviewed-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
--
-ck
On 7/22/21 1:53 AM, Christoph Hellwig wrote:
> Hi Jens,
>
> this series fixes up a possible race with the block_device lookup
> changes, and the finishes off the conversion to stop using the inode
> refcount for block devices.
Dropped the btrfs patch since it's in -git, and applied #1 to block-5.14,
rest for 5.15.
--
Jens Axboe