* [PATCH 1/2] blk-mq: export request polling helpers
@ 2023-03-24 21:28 Keith Busch
  2023-03-24 21:28 ` [PATCH 2/2] nvme: use blk-mq polling for uring commands Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-24 21:28 UTC (permalink / raw)
  To: linux-block, linux-nvme, axboe, hch; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

Export blk_mq_poll() and make blk_rq_to_qc() available outside the block
core so drivers can poll for completion of requests they own. The nvme
driver will use these in the next patch.
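
A caller that owns a request allocated from a polled hardware queue is
expected to combine the two helpers roughly as in the sketch below. This
is illustrative only and not part of this patch; the function name is
made up, and the caller is assumed to hold a queue usage reference, as
the next patch does via percpu_ref_tryget().

#include <linux/blk-mq.h>

/* Poll once for the completion of a request the caller owns. */
static int example_poll_owned_request(struct request *rq,
				      struct io_comp_batch *iob,
				      unsigned int poll_flags)
{
	/* Only requests sitting on a polled hctx can be polled. */
	if (!blk_rq_is_poll(rq))
		return 0;

	/* blk_rq_to_qc() maps the request to its hardware queue cookie. */
	return blk_mq_poll(rq->q, blk_rq_to_qc(rq), iob, poll_flags);
}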

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq.c         | 6 +-----
 block/blk-mq.h         | 2 --
 include/linux/blk-mq.h | 7 +++++++
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 37d8a2f4d5da8..34ac95fc43a66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -52,11 +52,6 @@ static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
 	return xa_load(&q->hctx_table, qc);
 }
 
-static inline blk_qc_t blk_rq_to_qc(struct request *rq)
-{
-	return rq->mq_hctx->queue_num;
-}
-
 /*
  * Check if any of the ctx, dispatch list or elevator
  * have pending work in this hardware queue.
@@ -4744,6 +4739,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *
 	__set_current_state(TASK_RUNNING);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_mq_poll);
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ef59fee62780d..92bc058fba3e5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,8 +31,6 @@ struct blk_mq_ctx {
 } ____cacheline_aligned_in_smp;
 
 void blk_mq_submit_bio(struct bio *bio);
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
-		unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1dacb2c81fdda..ed10b5e380103 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -438,6 +438,11 @@ struct blk_mq_hw_ctx {
 	struct list_head	hctx_list;
 };
 
+static inline blk_qc_t blk_rq_to_qc(struct request *rq)
+{
+	return rq->mq_hctx->queue_num;
+}
+
 /**
  * struct blk_mq_queue_map - Map software queues to hardware queues
  * @mq_map:       CPU ID to hardware queue index map. This is an array
@@ -716,6 +721,8 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_free_request(struct request *rq);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags);
 
 bool blk_mq_queue_inflight(struct request_queue *q);
 
-- 
2.34.1



* [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-24 21:28 [PATCH 1/2] blk-mq: export request polling helpers Keith Busch
@ 2023-03-24 21:28 ` Keith Busch
  2023-03-25  2:50   ` kernel test robot
                     ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Keith Busch @ 2023-03-24 21:28 UTC (permalink / raw)
  To: linux-block, linux-nvme, axboe, hch; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

The first advantage is that unshared and multipath namespaces can use
the same polling callback.

The other advantage is that we don't need a bio payload in order to
poll, allowing commands like 'flush' and 'write zeroes' to be submitted
on the same high priority queue as read and write commands.

This can also allow for a future optimization for the driver since we no
longer need to create special hidden block devices to back nvme-generic
char dev's with unsupported command sets.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
 drivers/nvme/host/multipath.c |  2 +-
 drivers/nvme/host/nvme.h      |  2 -
 3 files changed, 28 insertions(+), 55 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 723e7d5b778f2..369e8519b87a2 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -503,7 +503,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -516,9 +515,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_cb(ioucmd);
-	else
+	} else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 
 	return RQ_END_IO_FREE;
@@ -529,7 +529,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	void *cookie = READ_ONCE(ioucmd->cookie);
 
 	req->bio = pdu->bio;
 	pdu->req = req;
@@ -538,9 +537,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (cookie != NULL && blk_rq_is_poll(req))
+	if (blk_rq_is_poll(req)) {
+		WRITE_ONCE(ioucmd->cookie, NULL);
 		nvme_uring_task_meta_cb(ioucmd);
-	else
+	} else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
 
 	return RQ_END_IO_NONE;
@@ -597,7 +597,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (issue_flags & IO_URING_F_IOPOLL)
 		rq_flags |= REQ_POLLED;
 
-retry:
 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -611,17 +610,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			return ret;
 	}
 
-	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
-		if (unlikely(!req->bio)) {
-			/* we can't poll this, so alloc regular req instead */
-			blk_mq_free_request(req);
-			rq_flags &= ~REQ_POLLED;
-			goto retry;
-		} else {
-			WRITE_ONCE(ioucmd->cookie, req->bio);
-			req->bio->bi_opf |= REQ_POLLED;
-		}
-	}
+	if (blk_rq_is_poll(req))
+		WRITE_ONCE(ioucmd->cookie, req);
+
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
 	pdu->meta_len = d.metadata_len;
@@ -780,18 +771,27 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 				 struct io_comp_batch *iob,
 				 unsigned int poll_flags)
 {
-	struct bio *bio;
+	struct request *req;
 	int ret = 0;
-	struct nvme_ns *ns;
-	struct request_queue *q;
 
+	/*
+	 * The rcu lock ensures the 'req' in the command cookie will not be
+	 * freed until after the unlock. The queue must be frozen to free the
+	 * request, and the freeze requires an rcu grace period. The cookie is
+	 * cleared before the request is completed, so we're fine even if a
+	 * competing polling thread completes this thread's request.
+	 */
 	rcu_read_lock();
-	bio = READ_ONCE(ioucmd->cookie);
-	ns = container_of(file_inode(ioucmd->file)->i_cdev,
-			struct nvme_ns, cdev);
-	q = ns->queue;
-	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
-		ret = bio_poll(bio, iob, poll_flags);
+	req = READ_ONCE(ioucmd->cookie);
+	if (req) {
+		struct request_queue *q = req->q;
+
+		if (percpu_ref_tryget(&q->q_usage_counter)) {
+			ret = blk_mq_poll(q, blk_rq_to_qc(req), iob,
+					  poll_flags);
+			blk_queue_exit(q);
+		}
+	}
 	rcu_read_unlock();
 	return ret;
 }
@@ -883,31 +883,6 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
-
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-				      struct io_comp_batch *iob,
-				      unsigned int poll_flags)
-{
-	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
-	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
-	int srcu_idx = srcu_read_lock(&head->srcu);
-	struct nvme_ns *ns = nvme_find_path(head);
-	struct bio *bio;
-	int ret = 0;
-	struct request_queue *q;
-
-	if (ns) {
-		rcu_read_lock();
-		bio = READ_ONCE(ioucmd->cookie);
-		q = ns->queue;
-		if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
-				&& bio->bi_bdev)
-			ret = bio_poll(bio, iob, poll_flags);
-		rcu_read_unlock();
-	}
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return ret;
-}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index fc39d01e7b63b..fcecb731c8bd9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -470,7 +470,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
-	.uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll,
+	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1e..ca4ea89333660 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -847,8 +847,6 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 		struct io_comp_batch *iob, unsigned int poll_flags);
-int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
-		struct io_comp_batch *iob, unsigned int poll_flags);
 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
-- 
2.34.1



* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-24 21:28 ` [PATCH 2/2] nvme: use blk-mq polling for uring commands Keith Busch
@ 2023-03-25  2:50   ` kernel test robot
  2023-03-26 13:01   ` Sagi Grimberg
  2023-03-27 13:58   ` Kanchan Joshi
  2 siblings, 0 replies; 15+ messages in thread
From: kernel test robot @ 2023-03-25  2:50 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, axboe, hch
  Cc: oe-kbuild-all, Keith Busch

Hi Keith,

I love your patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[cannot apply to linus/master v6.3-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/nvme-use-blk-mq-polling-for-uring-commands/20230325-052914
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20230324212803.1837554-2-kbusch%40meta.com
patch subject: [PATCH 2/2] nvme: use blk-mq polling for uring commands
config: m68k-allmodconfig (https://download.01.org/0day-ci/archive/20230325/202303251022.c5FPQHwt-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/62483898b54af99f656f5a16a65ff26940b817a8
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Keith-Busch/nvme-use-blk-mq-polling-for-uring-commands/20230325-052914
        git checkout 62483898b54af99f656f5a16a65ff26940b817a8
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303251022.c5FPQHwt-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "blk_queue_exit" [drivers/nvme/host/nvme-core.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-24 21:28 ` [PATCH 2/2] nvme: use blk-mq polling for uring commands Keith Busch
  2023-03-25  2:50   ` kernel test robot
@ 2023-03-26 13:01   ` Sagi Grimberg
  2023-03-27 15:29     ` Keith Busch
  2023-03-27 13:58   ` Kanchan Joshi
  2 siblings, 1 reply; 15+ messages in thread
From: Sagi Grimberg @ 2023-03-26 13:01 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, axboe, hch; +Cc: Keith Busch



On 3/25/23 00:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The first advantage is that unshared and multipath namespaces can use
> the same polling callback.
> 
> The other advantage is that we don't need a bio payload in order to
> poll, allowing commands like 'flush' and 'write zeroes' to be submitted
> on the same high priority queue as read and write commands.
> 
> This can also allow for a future optimization for the driver since we no
> longer need to create special hidden block devices to back nvme-generic
> char dev's with unsupported command sets.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
>   drivers/nvme/host/multipath.c |  2 +-
>   drivers/nvme/host/nvme.h      |  2 -
>   3 files changed, 28 insertions(+), 55 deletions(-)
> 
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 723e7d5b778f2..369e8519b87a2 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -503,7 +503,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>   {
>   	struct io_uring_cmd *ioucmd = req->end_io_data;
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> -	void *cookie = READ_ONCE(ioucmd->cookie);
>   
>   	req->bio = pdu->bio;
>   	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> @@ -516,9 +515,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>   	 * For iopoll, complete it directly.
>   	 * Otherwise, move the completion to task work.
>   	 */
> -	if (cookie != NULL && blk_rq_is_poll(req))
> +	if (blk_rq_is_poll(req)) {
> +		WRITE_ONCE(ioucmd->cookie, NULL);
>   		nvme_uring_task_cb(ioucmd);
> -	else
> +	} else
>   		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>   
>   	return RQ_END_IO_FREE;
> @@ -529,7 +529,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
>   {
>   	struct io_uring_cmd *ioucmd = req->end_io_data;
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> -	void *cookie = READ_ONCE(ioucmd->cookie);
>   
>   	req->bio = pdu->bio;
>   	pdu->req = req;
> @@ -538,9 +537,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
>   	 * For iopoll, complete it directly.
>   	 * Otherwise, move the completion to task work.
>   	 */
> -	if (cookie != NULL && blk_rq_is_poll(req))
> +	if (blk_rq_is_poll(req)) {
> +		WRITE_ONCE(ioucmd->cookie, NULL);
>   		nvme_uring_task_meta_cb(ioucmd);
> -	else
> +	} else
>   		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
>   
>   	return RQ_END_IO_NONE;
> @@ -597,7 +597,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>   	if (issue_flags & IO_URING_F_IOPOLL)
>   		rq_flags |= REQ_POLLED;
>   
> -retry:
>   	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
>   	if (IS_ERR(req))
>   		return PTR_ERR(req);
> @@ -611,17 +610,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>   			return ret;
>   	}
>   
> -	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
> -		if (unlikely(!req->bio)) {
> -			/* we can't poll this, so alloc regular req instead */
> -			blk_mq_free_request(req);
> -			rq_flags &= ~REQ_POLLED;
> -			goto retry;
> -		} else {
> -			WRITE_ONCE(ioucmd->cookie, req->bio);
> -			req->bio->bi_opf |= REQ_POLLED;
> -		}
> -	}
> +	if (blk_rq_is_poll(req))
> +		WRITE_ONCE(ioucmd->cookie, req);

Why aren't we always setting the cookie to point at the req?


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-24 21:28 ` [PATCH 2/2] nvme: use blk-mq polling for uring commands Keith Busch
  2023-03-25  2:50   ` kernel test robot
  2023-03-26 13:01   ` Sagi Grimberg
@ 2023-03-27 13:58   ` Kanchan Joshi
  2023-03-27 15:20     ` Keith Busch
  2 siblings, 1 reply; 15+ messages in thread
From: Kanchan Joshi @ 2023-03-27 13:58 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, axboe, hch, Keith Busch


On Fri, Mar 24, 2023 at 02:28:03PM -0700, Keith Busch wrote:
>From: Keith Busch <kbusch@kernel.org>
>
>The first advantage is that unshared and multipath namespaces can use
>the same polling callback.
>
>The other advantage is that we don't need a bio payload in order to
>poll, allowing commands like 'flush' and 'write zeroes' to be submitted
>on the same high priority queue as read and write commands.
>
>This can also allow for a future optimization for the driver since we no
>longer need to create special hidden block devices to back nvme-generic
>char dev's with unsupported command sets.
>
>Signed-off-by: Keith Busch <kbusch@kernel.org>
>---
> drivers/nvme/host/ioctl.c     | 79 ++++++++++++-----------------------
> drivers/nvme/host/multipath.c |  2 +-
> drivers/nvme/host/nvme.h      |  2 -
> 3 files changed, 28 insertions(+), 55 deletions(-)
>
>diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>index 723e7d5b778f2..369e8519b87a2 100644
>--- a/drivers/nvme/host/ioctl.c
>+++ b/drivers/nvme/host/ioctl.c
>@@ -503,7 +503,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
> {
> 	struct io_uring_cmd *ioucmd = req->end_io_data;
> 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>-	void *cookie = READ_ONCE(ioucmd->cookie);
>
> 	req->bio = pdu->bio;
> 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
>@@ -516,9 +515,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
> 	 * For iopoll, complete it directly.
> 	 * Otherwise, move the completion to task work.
> 	 */
>-	if (cookie != NULL && blk_rq_is_poll(req))
>+	if (blk_rq_is_poll(req)) {
>+		WRITE_ONCE(ioucmd->cookie, NULL);
> 		nvme_uring_task_cb(ioucmd);
>-	else
>+	} else
> 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>
> 	return RQ_END_IO_FREE;
>@@ -529,7 +529,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> {
> 	struct io_uring_cmd *ioucmd = req->end_io_data;
> 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>-	void *cookie = READ_ONCE(ioucmd->cookie);
>
> 	req->bio = pdu->bio;
> 	pdu->req = req;
>@@ -538,9 +537,10 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> 	 * For iopoll, complete it directly.
> 	 * Otherwise, move the completion to task work.
> 	 */
>-	if (cookie != NULL && blk_rq_is_poll(req))
>+	if (blk_rq_is_poll(req)) {
>+		WRITE_ONCE(ioucmd->cookie, NULL);
> 		nvme_uring_task_meta_cb(ioucmd);
>-	else
>+	} else
> 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
>
> 	return RQ_END_IO_NONE;
>@@ -597,7 +597,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 	if (issue_flags & IO_URING_F_IOPOLL)
> 		rq_flags |= REQ_POLLED;
>
>-retry:
> 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
> 	if (IS_ERR(req))
> 		return PTR_ERR(req);
>@@ -611,17 +610,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> 			return ret;
> 	}
>
>-	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
>-		if (unlikely(!req->bio)) {
>-			/* we can't poll this, so alloc regular req instead */
>-			blk_mq_free_request(req);
>-			rq_flags &= ~REQ_POLLED;
>-			goto retry;
>-		} else {
>-			WRITE_ONCE(ioucmd->cookie, req->bio);
>-			req->bio->bi_opf |= REQ_POLLED;
>-		}
>-	}
>+	if (blk_rq_is_poll(req))
>+		WRITE_ONCE(ioucmd->cookie, req);

blk_rq_is_poll(req) warns on a null "req->bio" and returns false in that
case. That defeats one of the purposes of the series, i.e. polling on
no-payload commands such as flush/write-zeroes.
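
For reference, blk_rq_is_poll() currently looks roughly like this
(paraphrased from include/linux/blk-mq.h of this period, so double-check
the exact tree):

static inline bool blk_rq_is_poll(struct request *rq)
{
	if (!rq->mq_hctx)
		return false;
	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
		return false;
	if (WARN_ON_ONCE(!rq->bio))
		return false;
	return true;
}

so a bio-less flush or write-zeroes request trips the last check.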


> 	/* to free bio on completion, as req->bio will be null at that time */
> 	pdu->bio = req->bio;
> 	pdu->meta_len = d.metadata_len;
>@@ -780,18 +771,27 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
> 				 struct io_comp_batch *iob,
> 				 unsigned int poll_flags)
> {
>-	struct bio *bio;
>+	struct request *req;
> 	int ret = 0;
>-	struct nvme_ns *ns;
>-	struct request_queue *q;
>
>+	/*
>+	 * The rcu lock ensures the 'req' in the command cookie will not be
>+	 * freed until after the unlock. The queue must be frozen to free the
>+	 * request, and the freeze requires an rcu grace period. The cookie is
>+	 * cleared before the request is completed, so we're fine even if a
>+	 * competing polling thread completes this thread's request.
>+	 */
> 	rcu_read_lock();
>-	bio = READ_ONCE(ioucmd->cookie);
>-	ns = container_of(file_inode(ioucmd->file)->i_cdev,
>-			struct nvme_ns, cdev);
>-	q = ns->queue;
>-	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>-		ret = bio_poll(bio, iob, poll_flags);
>+	req = READ_ONCE(ioucmd->cookie);
>+	if (req) {

This is risky. We are not sure that the cookie actually holds a "req" at
this moment. If the driver is loaded without poll queues, we will not be
able to store the req in ioucmd->cookie during submission (in
nvme_uring_cmd_io). Therefore, the original code checked QUEUE_FLAG_POLL
before treating ioucmd->cookie as a bio here.

This should handle it (on top of your patch):

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 369e8519b87a..89eef5da4b5c 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -773,6 +773,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 {
        struct request *req;
        int ret = 0;
+       struct request_queue *q;
+       struct nvme_ns *ns;

        /*
         * The rcu lock ensures the 'req' in the command cookie will not be
@@ -783,8 +785,10 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
         */
        rcu_read_lock();
        req = READ_ONCE(ioucmd->cookie);
-       if (req) {
-               struct request_queue *q = req->q;
+       ns = container_of(file_inode(ioucmd->file)->i_cdev, struct nvme_ns,
+                       cdev);
+       q = ns->queue;
+       if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && req) {

                if (percpu_ref_tryget(&q->q_usage_counter)) {
                        ret = blk_mq_poll(q, blk_rq_to_qc(req), iob,






* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-27 13:58   ` Kanchan Joshi
@ 2023-03-27 15:20     ` Keith Busch
  2023-03-27 17:20       ` Kanchan Joshi
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-27 15:20 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: Keith Busch, linux-block, linux-nvme, axboe, hch

On Mon, Mar 27, 2023 at 07:28:10PM +0530, Kanchan Joshi wrote:
> > -	}
> > +	if (blk_rq_is_poll(req))
> > +		WRITE_ONCE(ioucmd->cookie, req);
> 
> blk_rq_is_poll(req) warns for null "req->bio" and returns false if that
> is the case. That defeats one of the purpose of the series i.e. poll on
> no-payload commands such as flush/write-zeroes.

Sorry, I'm sending out various patches piecemeal. This patch here depends on
this one sent out earlier:

  https://lore.kernel.org/linux-block/3f670ca7-908d-db55-3da1-4090f116005d@nvidia.com/T/#mbc6174ce3f9dbae38ae2ca646518be4bf105f6e4

> > 	rcu_read_lock();
> > -	bio = READ_ONCE(ioucmd->cookie);
> > -	ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > -			struct nvme_ns, cdev);
> > -	q = ns->queue;
> > -	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > -		ret = bio_poll(bio, iob, poll_flags);
> > +	req = READ_ONCE(ioucmd->cookie);
> > +	if (req) {
> 
> This is risky. We are not sure if the cookie is actually "req" at this
> moment.

What else could it be? It's either a real request from a polled hctx tag, or
NULL at this point.

It's safe to check the cookie like this and rely on its contents. The
queue's hctxs can't change within an rcu section, and the cookie is
cleared in the completion path before the request is freed. In the worst
case, we're racing another polling thread completing our request while
the hctxs are simultaneously being renumbered, but the request and the
hctx it points to remain reliable if we see a non-NULL cookie.
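
To spell out the lifetime argument in one place (a sketch of the
reasoning above, not code from the series):

/*
 * 1. Poller:     rcu_read_lock(); req = READ_ONCE(ioucmd->cookie);
 * 2. Completion: WRITE_ONCE(ioucmd->cookie, NULL), then complete the req.
 * 3. Free:       the request memory is only freed after a queue freeze,
 *                and the freeze waits for an RCU grace period, so a
 *                non-NULL 'req' seen in step 1 stays valid (even if it
 *                has already been completed) until rcu_read_unlock().
 */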

> If driver is loaded without the poll-queues, we will not be able
> to set req into ioucmd->cookie during the submission (in
> nvme_uring_cmd_io). Therefore, the original code checked for QUEUE_FLAG_POLL
> before treating ioucmd->cookie as bio here.

You don't need to check the queue's QUEUE_FLAG_POLL after the request is
allocated. The user can't change it directly, and the flag can't be
changed with requests in flight, so blk_rq_is_poll() is the only check we
need to rely on.

> This should handle it (on top of your patch):

This doesn't work with multipath.


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-26 13:01   ` Sagi Grimberg
@ 2023-03-27 15:29     ` Keith Busch
  2023-03-28  8:35       ` Sagi Grimberg
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-27 15:29 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Keith Busch, linux-block, linux-nvme, axboe, hch

On Sun, Mar 26, 2023 at 04:01:46PM +0300, Sagi Grimberg wrote:
> On 3/25/23 00:28, Keith Busch wrote:
> > +	if (blk_rq_is_poll(req))
> > +		WRITE_ONCE(ioucmd->cookie, req);
> 
> Why aren't we always setting the cookie to point at the req?

As in unconditionally setting the cookie even for non-polled requests?
We need to see that the cookie is NULL to avoid polling interrupt queues
when an io_uring polled command is dispatched to a device without polled
hctxs.


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-27 15:20     ` Keith Busch
@ 2023-03-27 17:20       ` Kanchan Joshi
  2023-03-28  0:48         ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Kanchan Joshi @ 2023-03-27 17:20 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch

On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Mon, Mar 27, 2023 at 07:28:10PM +0530, Kanchan Joshi wrote:
> > > -   }
> > > +   if (blk_rq_is_poll(req))
> > > +           WRITE_ONCE(ioucmd->cookie, req);
> >
> > blk_rq_is_poll(req) warns for null "req->bio" and returns false if that
> > is the case. That defeats one of the purpose of the series i.e. poll on
> > no-payload commands such as flush/write-zeroes.
>
> Sorry, I'm sending out various patches piecemeal. This patch here depends on
> this one sent out earlier:
>
>   https://lore.kernel.org/linux-block/3f670ca7-908d-db55-3da1-4090f116005d@nvidia.com/T/#mbc6174ce3f9dbae38ae2ca646518be4bf105f6e4

That clears it up, thanks.

> > >     rcu_read_lock();
> > > -   bio = READ_ONCE(ioucmd->cookie);
> > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > -                   struct nvme_ns, cdev);
> > > -   q = ns->queue;
> > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > -           ret = bio_poll(bio, iob, poll_flags);
> > > +   req = READ_ONCE(ioucmd->cookie);
> > > +   if (req) {
> >
> > This is risky. We are not sure if the cookie is actually "req" at this
> > moment.
>
> What else could it be? It's either a real request from a polled hctx tag, or
> NULL at this point.

It can also be a function pointer that gets assigned on irq-driven
completion. See "struct io_uring_cmd" - we are tight on cacheline space,
so cookie and task_work_cb share the same storage.
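
For reference, the relevant part of the struct looks roughly like this
(paraphrased from include/linux/io_uring.h of this period; the exact
callback signature may differ):

struct io_uring_cmd {
	struct file	*file;
	const void	*cmd;
	union {
		/* callback to defer completions to task context */
		void (*task_work_cb)(struct io_uring_cmd *cmd);
		/* used for polled completion */
		void *cookie;
	};
	u32		cmd_op;
	u32		flags;
	u8		pdu[32];	/* available inline for free use */
};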

> It's safe to check the cookie like this and rely on its contents.
Hence it is not safe. Please try running this without poll queues (at
the nvme level); you'll see failures.


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-27 17:20       ` Kanchan Joshi
@ 2023-03-28  0:48         ` Keith Busch
  2023-03-28  7:49           ` Kanchan Joshi
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-28  0:48 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch

On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
> On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
> > > >     rcu_read_lock();
> > > > -   bio = READ_ONCE(ioucmd->cookie);
> > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > > -                   struct nvme_ns, cdev);
> > > > -   q = ns->queue;
> > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > > -           ret = bio_poll(bio, iob, poll_flags);
> > > > +   req = READ_ONCE(ioucmd->cookie);
> > > > +   if (req) {
> > >
> > > This is risky. We are not sure if the cookie is actually "req" at this
> > > moment.
> >
> > What else could it be? It's either a real request from a polled hctx tag, or
> > NULL at this point.
> 
> It can also be a function pointer that gets assigned on irq-driven completion.
> See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
> and task_work_cb share the storage.
> 
> > It's safe to check the cookie like this and rely on its contents.
> Hence not safe. Please try running this without poll-queues (at nvme
> level), you'll see failures.

Okay, you have an io_uring polling instance used with a file that has
poll capabilities but doesn't have any polling hctxs. It would be nice to
exclude these from io_uring's polling since they're wasting CPU time, but
that doesn't look easy to do. This simple patch on top should work
though.

---
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 369e8519b87a2..e3ff019404816 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
 	if (blk_rq_is_poll(req))
 		WRITE_ONCE(ioucmd->cookie, req);
+	else if (issue_flags & IO_URING_F_IOPOLL)
+		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
 
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
@@ -774,6 +776,9 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 	struct request *req;
 	int ret = 0;
 
+	if (ioucmd->flags & IORING_URING_CMD_NOPOLL)
+		return 0;
+
 	/*
 	 * The rcu lock ensures the 'req' in the command cookie will not be
 	 * freed until after the unlock. The queue must be frozen to free the
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 934e5dd4ccc08..2abf45491b3df 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_IOPOLL		= (1 << 10),
 };
 
+#define IORING_URING_CMD_NOPOLL	(1U << 31)
+
 struct io_uring_cmd {
 	struct file	*file;
 	const void	*cmd;
--


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-28  0:48         ` Keith Busch
@ 2023-03-28  7:49           ` Kanchan Joshi
  2023-03-28 14:52             ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Kanchan Joshi @ 2023-03-28  7:49 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch


On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
>On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
>> On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>> > > >     rcu_read_lock();
>> > > > -   bio = READ_ONCE(ioucmd->cookie);
>> > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
>> > > > -                   struct nvme_ns, cdev);
>> > > > -   q = ns->queue;
>> > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>> > > > -           ret = bio_poll(bio, iob, poll_flags);
>> > > > +   req = READ_ONCE(ioucmd->cookie);
>> > > > +   if (req) {
>> > >
>> > > This is risky. We are not sure if the cookie is actually "req" at this
>> > > moment.
>> >
>> > What else could it be? It's either a real request from a polled hctx tag, or
>> > NULL at this point.
>>
>> It can also be a function pointer that gets assigned on irq-driven completion.
>> See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
>> and task_work_cb share the storage.
>>
>> > It's safe to check the cookie like this and rely on its contents.
>> Hence not safe. Please try running this without poll-queues (at nvme
>> level), you'll see failures.
>
>Okay, you have a iouring polling instance used with a file that has poll
>capabilities, but doesn't have any polling hctx's. It would be nice to exclude
>these from io_uring's polling since they're wasting CPU time, but that doesn't
>look easily done.

Do you mean having the ring with IOPOLL set, and yet skipping the
attempt to actively reap completions for certain IOs?

>This simple patch atop should work though.
>
>---
>diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>index 369e8519b87a2..e3ff019404816 100644
>--- a/drivers/nvme/host/ioctl.c
>+++ b/drivers/nvme/host/ioctl.c
>@@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>
> 	if (blk_rq_is_poll(req))
> 		WRITE_ONCE(ioucmd->cookie, req);
>+	else if (issue_flags & IO_URING_F_IOPOLL)
>+		ioucmd->flags |= IORING_URING_CMD_NOPOLL;

If IO_URING_F_IOPOLL had come here as part of "ioucmd->flags", we could
have just cleared it here. That would avoid the need for the NOPOLL flag.
That said, I don't feel strongly about the new flag either. You decide.





* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-27 15:29     ` Keith Busch
@ 2023-03-28  8:35       ` Sagi Grimberg
  0 siblings, 0 replies; 15+ messages in thread
From: Sagi Grimberg @ 2023-03-28  8:35 UTC (permalink / raw)
  To: Keith Busch; +Cc: Keith Busch, linux-block, linux-nvme, axboe, hch


>>> +	if (blk_rq_is_poll(req))
>>> +		WRITE_ONCE(ioucmd->cookie, req);
>>
>> Why aren't we always setting the cookie to point at the req?
> 
> As in unconditionally set the cookie even for non-polled requests? We need to
> see that the cookie is NULL to prevent polling interrupt queues when an
> io_uring polled command is dispatched to a device without polled hctx's.

Why don't we express that with an explicit flag? It would make it
cleaner, I think.


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-28  7:49           ` Kanchan Joshi
@ 2023-03-28 14:52             ` Keith Busch
  2023-03-29  8:46               ` Kanchan Joshi
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-28 14:52 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch

On Tue, Mar 28, 2023 at 01:19:39PM +0530, Kanchan Joshi wrote:
> On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
> > On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
> > > On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
> > > > > >     rcu_read_lock();
> > > > > > -   bio = READ_ONCE(ioucmd->cookie);
> > > > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
> > > > > > -                   struct nvme_ns, cdev);
> > > > > > -   q = ns->queue;
> > > > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
> > > > > > -           ret = bio_poll(bio, iob, poll_flags);
> > > > > > +   req = READ_ONCE(ioucmd->cookie);
> > > > > > +   if (req) {
> > > > >
> > > > > This is risky. We are not sure if the cookie is actually "req" at this
> > > > > moment.
> > > >
> > > > What else could it be? It's either a real request from a polled hctx tag, or
> > > > NULL at this point.
> > > 
> > > It can also be a function pointer that gets assigned on irq-driven completion.
> > > See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
> > > and task_work_cb share the storage.
> > > 
> > > > It's safe to check the cookie like this and rely on its contents.
> > > Hence not safe. Please try running this without poll-queues (at nvme
> > > level), you'll see failures.
> > 
> > Okay, you have a iouring polling instance used with a file that has poll
> > capabilities, but doesn't have any polling hctx's. It would be nice to exclude
> > these from io_uring's polling since they're wasting CPU time, but that doesn't
> > look easily done.
> 
> Do you mean having the ring with IOPOLL set, and yet skip the attempt of
> actively reaping the completion for certain IOs?

Yes, exactly. It'd be great if non-polled requests didn't get added to
the ctx->iopoll_list in the first place.
 
> > This simple patch atop should work though.
> > 
> > ---
> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> > index 369e8519b87a2..e3ff019404816 100644
> > --- a/drivers/nvme/host/ioctl.c
> > +++ b/drivers/nvme/host/ioctl.c
> > @@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> > 
> > 	if (blk_rq_is_poll(req))
> > 		WRITE_ONCE(ioucmd->cookie, req);
> > +	else if (issue_flags & IO_URING_F_IOPOLL)
> > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> 
> If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> could have just cleared that here. That would avoid the need of NOPOLL flag.
> That said, I don't feel strongly about new flag too. You decide.

IO_URING_F_IOPOLL, while named in an enum that sounds suspiciously like
it is part of ioucmd->flags, is actually a ctx flag, which is a little
confusing. And we need to be a little careful here: the existing
ioucmd->flags is used for uapi flags.
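
To make the two flag spaces concrete (paraphrased from the uapi and
in-kernel headers of this period, so treat names and values as
approximate):

/* uapi flag space (include/uapi/linux/io_uring.h): what userspace sets
 * in sqe->uring_cmd_flags is what ends up in ioucmd->flags; only one bit
 * is defined there at the moment: */
#define IORING_URING_CMD_FIXED		(1U << 0)

/* in-kernel flag space (include/linux/io_uring.h): IO_URING_F_IOPOLL and
 * friends only arrive as the issue_flags argument to ->uring_cmd() and
 * are never stored in ioucmd->flags. */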


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-28 14:52             ` Keith Busch
@ 2023-03-29  8:46               ` Kanchan Joshi
  2023-03-29 16:11                 ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Kanchan Joshi @ 2023-03-29  8:46 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch


On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
>On Tue, Mar 28, 2023 at 01:19:39PM +0530, Kanchan Joshi wrote:
>> On Mon, Mar 27, 2023 at 06:48:30PM -0600, Keith Busch wrote:
>> > On Mon, Mar 27, 2023 at 10:50:47PM +0530, Kanchan Joshi wrote:
>> > > On Mon, Mar 27, 2023 at 8:59 PM Keith Busch <kbusch@kernel.org> wrote:
>> > > > > >     rcu_read_lock();
>> > > > > > -   bio = READ_ONCE(ioucmd->cookie);
>> > > > > > -   ns = container_of(file_inode(ioucmd->file)->i_cdev,
>> > > > > > -                   struct nvme_ns, cdev);
>> > > > > > -   q = ns->queue;
>> > > > > > -   if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
>> > > > > > -           ret = bio_poll(bio, iob, poll_flags);
>> > > > > > +   req = READ_ONCE(ioucmd->cookie);
>> > > > > > +   if (req) {
>> > > > >
>> > > > > This is risky. We are not sure if the cookie is actually "req" at this
>> > > > > moment.
>> > > >
>> > > > What else could it be? It's either a real request from a polled hctx tag, or
>> > > > NULL at this point.
>> > >
>> > > It can also be a function pointer that gets assigned on irq-driven completion.
>> > > See the "struct io_uring_cmd" - we are tight on cacheline, so cookie
>> > > and task_work_cb share the storage.
>> > >
>> > > > It's safe to check the cookie like this and rely on its contents.
>> > > Hence not safe. Please try running this without poll-queues (at nvme
>> > > level), you'll see failures.
>> >
>> > Okay, you have a iouring polling instance used with a file that has poll
>> > capabilities, but doesn't have any polling hctx's. It would be nice to exclude
>> > these from io_uring's polling since they're wasting CPU time, but that doesn't
>> > look easily done.
>>
>> Do you mean having the ring with IOPOLL set, and yet skip the attempt of
>> actively reaping the completion for certain IOs?
>
>Yes, exactly. It'd be great if non-polled requests don't get added to the
>ctx->iopoll_list in the first place.
>
>> > This simple patch atop should work though.
>> >
>> > ---
>> > diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
>> > index 369e8519b87a2..e3ff019404816 100644
>> > --- a/drivers/nvme/host/ioctl.c
>> > +++ b/drivers/nvme/host/ioctl.c
>> > @@ -612,6 +612,8 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>> >
>> > 	if (blk_rq_is_poll(req))
>> > 		WRITE_ONCE(ioucmd->cookie, req);
>> > +	else if (issue_flags & IO_URING_F_IOPOLL)
>> > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
>>
>> If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
>> could have just cleared that here. That would avoid the need of NOPOLL flag.
>> That said, I don't feel strongly about new flag too. You decide.
>
>IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
>part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
>need to be a litle careful here: the existing ioucmd->flags is used with uapi
>flags.

Indeed. If this is getting crufty, the series can just enable polling on
no-payload requests. Reducing the nvme handlers can wait for another day.





* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-29  8:46               ` Kanchan Joshi
@ 2023-03-29 16:11                 ` Keith Busch
  2023-04-03 12:42                   ` Kanchan Joshi
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2023-03-29 16:11 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch

On Wed, Mar 29, 2023 at 02:16:18PM +0530, Kanchan Joshi wrote:
> On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
> > > > +	else if (issue_flags & IO_URING_F_IOPOLL)
> > > > +		ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> > > 
> > > If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> > > could have just cleared that here. That would avoid the need of NOPOLL flag.
> > > That said, I don't feel strongly about new flag too. You decide.
> > 
> > IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
> > part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
> > need to be a litle careful here: the existing ioucmd->flags is used with uapi
> > flags.
> 
> Indeed. If this is getting crufty, series can just enable polling on
> no-payload requests. Reducing nvme handlers - for another day.

Well, something needs to be done about multipath since it's broken
today: if the path changes between submission and poll, we'll consult the
wrong queue when checking whether polling is enabled. This could cause a
missed polling opportunity, polling a pointer that isn't a bio, or
polling an irq-enabled cq. All are bad.


* Re: [PATCH 2/2] nvme: use blk-mq polling for uring commands
  2023-03-29 16:11                 ` Keith Busch
@ 2023-04-03 12:42                   ` Kanchan Joshi
  0 siblings, 0 replies; 15+ messages in thread
From: Kanchan Joshi @ 2023-04-03 12:42 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Keith Busch, linux-block, linux-nvme, axboe, hch

On Wed, Mar 29, 2023 at 9:41 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Wed, Mar 29, 2023 at 02:16:18PM +0530, Kanchan Joshi wrote:
> > On Tue, Mar 28, 2023 at 08:52:53AM -0600, Keith Busch wrote:
> > > > > +       else if (issue_flags & IO_URING_F_IOPOLL)
> > > > > +               ioucmd->flags |= IORING_URING_CMD_NOPOLL;
> > > >
> > > > If IO_URING_F_IOPOLL would have come here as part of "ioucmd->flags", we
> > > > could have just cleared that here. That would avoid the need of NOPOLL flag.
> > > > That said, I don't feel strongly about new flag too. You decide.
> > >
> > > IO_URING_F_IOPOLL, while named in an enum that sounds suspiciouly like it is
> > > part of ioucmd->flags, is actually ctx flags, so a little confusing. And we
> > > need to be a litle careful here: the existing ioucmd->flags is used with uapi
> > > flags.
> >
> > Indeed. If this is getting crufty, series can just enable polling on
> > no-payload requests. Reducing nvme handlers - for another day.
>
> Well something needs to be done about multipath since it's broken today: if the
> path changes between submission and poll, we'll consult the wrong queue for
> polling enabled. This could cause a missed polling opprotunity, polling a
> pointer that isn't a bio, or poll an irq enabled cq. All are bad.

I see, that is because "nvme_find_path" was used.
How about using the top 4 bits of "ioucmd->flags" for internal flags?
That means we can support at most 28 flags in "sqe->uring_cmd_flags".
That is perhaps not too bad, given that we use only one bit at the moment?
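
Something like the following, purely to illustrate the idea (none of
these masks exist in the tree; the names here are made up):

#include <linux/bits.h>

/* Hypothetical split of ioucmd->flags as suggested above. */
#define IORING_URING_CMD_UAPI_MASK	GENMASK(27, 0)	/* from sqe->uring_cmd_flags */
#define IORING_URING_CMD_KERNEL_MASK	GENMASK(31, 28)	/* kernel-internal bits */
#define IORING_URING_CMD_NOPOLL		(1U << 31)	/* e.g. the flag proposed earlier */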

