All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ming Lei <ming.lei@redhat.com>
To: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>,
	Jeffle Xu <jefflexu@linux.alibaba.com>,
	Damien Le Moal <Damien.LeMoal@wdc.com>,
	Keith Busch <kbusch@kernel.org>, Sagi Grimberg <sagi@grimberg.me>,
	"Wunderlich, Mark" <mark.wunderlich@intel.com>,
	"Vasudevan, Anil" <anil.vasudevan@intel.com>,
	linux-block@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH 12/15] block: switch polling to be bio based
Date: Wed, 28 Apr 2021 10:28:10 +0800	[thread overview]
Message-ID: <YIjIOgYS29GvcoIm@T590> (raw)
In-Reply-To: <20210427161619.1294399-13-hch@lst.de>

On Tue, Apr 27, 2021 at 06:16:16PM +0200, Christoph Hellwig wrote:
> Replace the blk_poll interface that requires the caller to keep a queue
> and cookie from the submissions with polling based on the bio.
> 
> Polling for the bio itself leads to a few advantages:
> 
>  - the cookie construction can made entirely private in blk-mq.c
>  - the caller does not need to remember the request_queue and cookie
>    separately and thus sidesteps their lifetime issues
>  - keeping the device and the cookie inside the bio allows to trivially
>    support polling BIOs remapping by stacking drivers
>  - a lot of code to propagate the cookie back up the submission path can
>    be removed entirely.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  arch/m68k/emu/nfblock.c             |   3 +-
>  arch/xtensa/platforms/iss/simdisk.c |   3 +-
>  block/bio.c                         |   1 +
>  block/blk-core.c                    | 116 ++++++++++++++++++++--------
>  block/blk-mq.c                      |  73 ++++++-----------
>  block/blk-mq.h                      |   2 +
>  drivers/block/brd.c                 |  12 ++-
>  drivers/block/drbd/drbd_int.h       |   2 +-
>  drivers/block/drbd/drbd_req.c       |   3 +-
>  drivers/block/n64cart.c             |  12 ++-
>  drivers/block/null_blk/main.c       |   3 +-
>  drivers/block/pktcdvd.c             |   7 +-
>  drivers/block/ps3vram.c             |   6 +-
>  drivers/block/rsxx/dev.c            |   7 +-
>  drivers/block/zram/zram_drv.c       |  10 +--
>  drivers/lightnvm/pblk-init.c        |   6 +-
>  drivers/md/bcache/request.c         |  13 ++--
>  drivers/md/bcache/request.h         |   4 +-
>  drivers/md/dm.c                     |  28 +++----
>  drivers/md/md.c                     |  10 +--
>  drivers/nvdimm/blk.c                |   5 +-
>  drivers/nvdimm/btt.c                |   5 +-
>  drivers/nvdimm/pmem.c               |   3 +-
>  drivers/nvme/host/core.c            |   2 +-
>  drivers/nvme/host/multipath.c       |   6 +-
>  drivers/s390/block/dcssblk.c        |   7 +-
>  drivers/s390/block/xpram.c          |   5 +-
>  fs/block_dev.c                      |  25 ++----
>  fs/btrfs/inode.c                    |   8 +-
>  fs/ext4/file.c                      |   2 +-
>  fs/gfs2/file.c                      |   4 +-
>  fs/iomap/direct-io.c                |  39 ++++------
>  fs/xfs/xfs_file.c                   |   2 +-
>  fs/zonefs/super.c                   |   2 +-
>  include/linux/bio.h                 |   2 +-
>  include/linux/blk-mq.h              |  15 +---
>  include/linux/blk_types.h           |  12 ++-
>  include/linux/blkdev.h              |   8 +-
>  include/linux/fs.h                  |   6 +-
>  include/linux/iomap.h               |   3 +-
>  mm/page_io.c                        |   8 +-
>  41 files changed, 219 insertions(+), 271 deletions(-)
> 
> diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
> index ba808543161a..dd36808f0d5e 100644
> --- a/arch/m68k/emu/nfblock.c
> +++ b/arch/m68k/emu/nfblock.c
> @@ -59,7 +59,7 @@ struct nfhd_device {
>  	struct gendisk *disk;
>  };
>  
> -static blk_qc_t nfhd_submit_bio(struct bio *bio)
> +static void nfhd_submit_bio(struct bio *bio)
>  {
>  	struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data;
>  	struct bio_vec bvec;
> @@ -77,7 +77,6 @@ static blk_qc_t nfhd_submit_bio(struct bio *bio)
>  		sec += len;
>  	}
>  	bio_endio(bio);
> -	return BLK_QC_T_NONE;
>  }
>  
>  static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
> diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
> index fc09be7b1347..182825d639e2 100644
> --- a/arch/xtensa/platforms/iss/simdisk.c
> +++ b/arch/xtensa/platforms/iss/simdisk.c
> @@ -101,7 +101,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector,
>  	spin_unlock(&dev->lock);
>  }
>  
> -static blk_qc_t simdisk_submit_bio(struct bio *bio)
> +static void simdisk_submit_bio(struct bio *bio)
>  {
>  	struct simdisk *dev = bio->bi_bdev->bd_disk->private_data;
>  	struct bio_vec bvec;
> @@ -119,7 +119,6 @@ static blk_qc_t simdisk_submit_bio(struct bio *bio)
>  	}
>  
>  	bio_endio(bio);
> -	return BLK_QC_T_NONE;
>  }
>  
>  static int simdisk_open(struct block_device *bdev, fmode_t mode)
> diff --git a/block/bio.c b/block/bio.c
> index de5505d7018e..986908cc99d4 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -250,6 +250,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>  	memset(bio, 0, sizeof(*bio));
>  	atomic_set(&bio->__bi_remaining, 1);
>  	atomic_set(&bio->__bi_cnt, 1);
> +	bio->bi_cookie = BLK_QC_T_NONE;
>  
>  	bio->bi_io_vec = table;
>  	bio->bi_max_vecs = max_vecs;
> diff --git a/block/blk-core.c b/block/blk-core.c
> index adfab5976be0..305fb8722871 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -910,18 +910,18 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  	return false;
>  }
>  
> -static blk_qc_t __submit_bio(struct bio *bio)
> +static void __submit_bio(struct bio *bio)
>  {
>  	struct gendisk *disk = bio->bi_bdev->bd_disk;
> -	blk_qc_t ret = BLK_QC_T_NONE;
>  
>  	if (blk_crypto_bio_prep(&bio)) {
> -		if (!disk->fops->submit_bio)
> -			return blk_mq_submit_bio(bio);
> -		ret = disk->fops->submit_bio(bio);
> +		if (!disk->fops->submit_bio) {
> +			blk_mq_submit_bio(bio);
> +			return;
> +		}
> +		disk->fops->submit_bio(bio);
>  	}
>  	blk_queue_exit(disk->queue);
> -	return ret;
>  }
>  
>  /*
> @@ -943,10 +943,9 @@ static blk_qc_t __submit_bio(struct bio *bio)
>   * bio_list_on_stack[1] contains bios that were submitted before the current
>   *	->submit_bio_bio, but that haven't been processed yet.
>   */
> -static blk_qc_t __submit_bio_noacct(struct bio *bio)
> +static void __submit_bio_noacct(struct bio *bio)
>  {
>  	struct bio_list bio_list_on_stack[2];
> -	blk_qc_t ret = BLK_QC_T_NONE;
>  
>  	BUG_ON(bio->bi_next);
>  
> @@ -966,7 +965,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  		bio_list_on_stack[1] = bio_list_on_stack[0];
>  		bio_list_init(&bio_list_on_stack[0]);
>  
> -		ret = __submit_bio(bio);
> +		__submit_bio(bio);
>  
>  		/*
>  		 * Sort new bios into those for a lower level and those for the
> @@ -989,13 +988,11 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
>  	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
>  
>  	current->bio_list = NULL;
> -	return ret;
>  }
>  
> -static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
> +static void __submit_bio_noacct_mq(struct bio *bio)
>  {
>  	struct bio_list bio_list[2] = { };
> -	blk_qc_t ret = BLK_QC_T_NONE;
>  
>  	current->bio_list = bio_list;
>  
> @@ -1007,15 +1004,13 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>  
>  		if (!blk_crypto_bio_prep(&bio)) {
>  			blk_queue_exit(disk->queue);
> -			ret = BLK_QC_T_NONE;
>  			continue;
>  		}
>  
> -		ret = blk_mq_submit_bio(bio);
> +		blk_mq_submit_bio(bio);
>  	} while ((bio = bio_list_pop(&bio_list[0])));
>  
>  	current->bio_list = NULL;
> -	return ret;
>  }
>  
>  /**
> @@ -1027,10 +1022,10 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
>   * systems and other upper level users of the block layer should use
>   * submit_bio() instead.
>   */
> -blk_qc_t submit_bio_noacct(struct bio *bio)
> +void submit_bio_noacct(struct bio *bio)
>  {
>  	if (!submit_bio_checks(bio))
> -		return BLK_QC_T_NONE;
> +		return;
>  
>  	/*
>  	 * We only want one ->submit_bio to be active at a time, else stack
> @@ -1038,14 +1033,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio)
>  	 * to collect a list of requests submited by a ->submit_bio method while
>  	 * it is active, and then process them after it returned.
>  	 */
> -	if (current->bio_list) {
> +	if (current->bio_list)
>  		bio_list_add(&current->bio_list[0], bio);
> -		return BLK_QC_T_NONE;
> -	}
> -
> -	if (!bio->bi_bdev->bd_disk->fops->submit_bio)
> -		return __submit_bio_noacct_mq(bio);
> -	return __submit_bio_noacct(bio);
> +	else if (!bio->bi_bdev->bd_disk->fops->submit_bio)
> +		__submit_bio_noacct_mq(bio);
> +	else
> +		__submit_bio_noacct(bio);
>  }
>  EXPORT_SYMBOL(submit_bio_noacct);
>  
> @@ -1062,10 +1055,10 @@ EXPORT_SYMBOL(submit_bio_noacct);
>   * in @bio.  The bio must NOT be touched by thecaller until ->bi_end_io() has
>   * been called.
>   */
> -blk_qc_t submit_bio(struct bio *bio)
> +void submit_bio(struct bio *bio)
>  {
>  	if (blkcg_punt_bio_submit(bio))
> -		return BLK_QC_T_NONE;
> +		return;
>  
>  	/*
>  	 * If it's a regular read/write or a barrier with data attached,
> @@ -1106,19 +1099,80 @@ blk_qc_t submit_bio(struct bio *bio)
>  	if (unlikely(bio_op(bio) == REQ_OP_READ &&
>  	    bio_flagged(bio, BIO_WORKINGSET))) {
>  		unsigned long pflags;
> -		blk_qc_t ret;
>  
>  		psi_memstall_enter(&pflags);
> -		ret = submit_bio_noacct(bio);
> +		submit_bio_noacct(bio);
>  		psi_memstall_leave(&pflags);
> -
> -		return ret;
> +		return;
>  	}
>  
> -	return submit_bio_noacct(bio);
> +	submit_bio_noacct(bio);
>  }
>  EXPORT_SYMBOL(submit_bio);
>  
> +/**
> + * bio_poll - poll for BIO completions
> + * @bio: bio to poll for
> + * @flags: BLK_POLL_* flags that control the behavior
> + *
> + * Poll for completions on queue associated with the bio. Returns number of
> + * completed entries found.
> + *
> + * Note: the caller must either be the context that submitted @bio, or
> + * be in a RCU critical section to prevent freeing of @bio.
> + */
> +int bio_poll(struct bio *bio, unsigned int flags)
> +{
> +	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> +	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
> +
> +	if (cookie == BLK_QC_T_NONE ||
> +	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
> +		return 0;
> +
> +	if (current->plug)
> +		blk_flush_plug_list(current->plug, false);
> +
> +	/* not yet implemented, so this should not happen */
> +	if (WARN_ON_ONCE(!queue_is_mq(q)))
> +		return 0;
> +	return blk_mq_poll(q, cookie, flags);
> +}
> +EXPORT_SYMBOL_GPL(bio_poll);
> +
> +/*
> + * Helper to implement file_operations.iopoll.  Requires the bio to be stored
> + * in iocb->private, and cleared before freeing the bio.
> + */
> +int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags)
> +{
> +	struct bio *bio;
> +	int ret = 0;
> +
> +	/*
> +	 * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
> +	 * point to a freshly allocated bio at this point.  If that happens
> +	 * we have a few cases to consider:
> +	 *
> +	 *  1) the bio is beeing initialized and bi_bdev is NULL.  We can just
> +	 *     simply nothing in this case
> +	 *  2) the bio points to a not poll enabled device.  bio_poll will catch
> +	 *     this and return 0
> +	 *  3) the bio points to a poll capable device, including but not
> +	 *     limited to the one that the original bio pointed to.  In this
> +	 *     case we will call into the actual poll method and poll for I/O,
> +	 *     even if we don't need to, but it won't cause harm either.
> +	 */
> +	rcu_read_lock();
> +	bio = READ_ONCE(kiocb->private);
> +	if (bio && bio->bi_bdev)

->bi_bdev and associated disk/request_queue/hctx/... refrerred in bio_poll()
may have being freed now, so there is UAF risk.


Thanks,
Ming


  reply	other threads:[~2021-04-28  2:28 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-27 16:16 switch block layer polling to a bio based model v2 Christoph Hellwig
2021-04-27 16:16 ` [PATCH 01/15] direct-io: remove blk_poll support Christoph Hellwig
2021-04-27 16:16 ` [PATCH 02/15] block: don't try to poll multi-bio I/Os in __blkdev_direct_IO Christoph Hellwig
2021-04-27 16:16 ` [PATCH 03/15] iomap: don't try to poll multi-bio I/Os in __iomap_dio_rw Christoph Hellwig
2021-04-27 16:16 ` [PATCH 04/15] blk-mq: factor out a blk_qc_to_hctx helper Christoph Hellwig
2021-04-27 16:16 ` [PATCH 05/15] blk-mq: factor out a "classic" poll helper Christoph Hellwig
2021-04-27 16:16 ` [PATCH 06/15] blk-mq: remove blk_qc_t_to_tag and blk_qc_t_is_internal Christoph Hellwig
2021-04-27 16:16 ` [PATCH 07/15] blk-mq: remove blk_qc_t_valid Christoph Hellwig
2021-04-27 16:16 ` [PATCH 08/15] io_uring: don't sleep when polling for I/O Christoph Hellwig
2021-04-27 16:16 ` [PATCH 09/15] block: rename REQ_HIPRI to REQ_POLLED Christoph Hellwig
2021-04-27 16:16 ` [PATCH 10/15] block: use SLAB_TYPESAFE_BY_RCU for the bio slab Christoph Hellwig
2021-04-27 16:16 ` [PATCH 11/15] block: define 'struct bvec_iter' as packed Christoph Hellwig
2021-04-27 16:16 ` [PATCH 12/15] block: switch polling to be bio based Christoph Hellwig
2021-04-28  2:28   ` Ming Lei [this message]
2021-04-29  7:20     ` Christoph Hellwig
2021-04-29  7:36       ` Ming Lei
2021-04-29  9:50         ` Christoph Hellwig
2021-04-29 14:42           ` Ming Lei
2021-04-27 16:16 ` [PATCH 13/15] block: add a QUEUE_FLAG_POLL_CAPABLE flag Christoph Hellwig
2021-04-27 16:16 ` [PATCH 14/15] nvme-multipath: set QUEUE_FLAG_NOWAIT Christoph Hellwig
2021-04-28  2:32   ` Ming Lei
2021-04-29  7:27     ` Christoph Hellwig
2021-04-29  7:37       ` Ming Lei
2021-04-29  9:46         ` Christoph Hellwig
2021-05-06 21:23           ` Sagi Grimberg
2021-04-27 16:16 ` [PATCH 15/15] nvme-multipath: enable polled I/O Christoph Hellwig
2021-04-28  2:36   ` Ming Lei
2021-05-12 13:15 switch block layer polling to a bio based model v3 Christoph Hellwig
2021-05-12 13:15 ` [PATCH 12/15] block: switch polling to be bio based Christoph Hellwig
2021-05-12 13:15   ` Christoph Hellwig
2021-05-12 22:03   ` Sagi Grimberg
2021-05-12 22:03     ` Sagi Grimberg
2021-05-12 22:12     ` Keith Busch
2021-05-12 22:12       ` Keith Busch
2021-05-14  2:50       ` Keith Busch
2021-05-14  2:50         ` Keith Busch
2021-05-14 16:26         ` Keith Busch
2021-05-14 16:26           ` Keith Busch
2021-05-14 16:28           ` Christoph Hellwig
2021-05-14 16:28             ` Christoph Hellwig
2021-05-13  1:26     ` Ming Lei
2021-05-13  1:26       ` Ming Lei
2021-05-13  1:23   ` Ming Lei
2021-05-13  1:23     ` Ming Lei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YIjIOgYS29GvcoIm@T590 \
    --to=ming.lei@redhat.com \
    --cc=Damien.LeMoal@wdc.com \
    --cc=anil.vasudevan@intel.com \
    --cc=axboe@kernel.dk \
    --cc=hch@lst.de \
    --cc=jefflexu@linux.alibaba.com \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=mark.wunderlich@intel.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.