* [PATCHSET 0/4] Add support for list issue
@ 2021-11-17  3:38 Jens Axboe
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
                   ` (3 more replies)
  0 siblings, 4 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17  3:38 UTC (permalink / raw)
  To: linux-block; +Cc: hch

Hi,

With the support in 5.16-rc1 for allocating and completing batches of
IO, the one missing piece is passing down a list of requests for issue.
Drivers can take advantage of this by defining an mq_ops->queue_rqs()
hook.

This implements it for NVMe, allowing multiple commands to be copied in
one swoop.

This is good for around a 500K IOPS/core improvement in my testing,
which is around a 5-6% improvement in efficiency.

-- 
Jens Axboe
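For readers not yet familiar with the 5.16 plumbing, the plug list referred
to above is a singly linked chain of requests linked through rq->rq_next and
passed around as a struct request **. The sketch below (illustrative only;
handle_one() is hypothetical) shows how the rq_list_*() helpers used
throughout this series peek at, drain, and test such a list:

static void rq_list_example(struct request **rqlist)
{
	struct request *req;

	/* rq_list_peek() looks at the head without unlinking it */
	if (!rq_list_peek(rqlist))
		return;

	/* rq_list_pop() unlinks and returns the head, or NULL when empty */
	while ((req = rq_list_pop(rqlist))) {
		/* rq_list_empty() is true once the final entry is popped */
		bool last = rq_list_empty(*rqlist);

		handle_one(req, last);	/* hypothetical consumer */
	}
}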




* [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  3:38 [PATCHSET 0/4] Add support for list issue Jens Axboe
@ 2021-11-17  3:38 ` Jens Axboe
  2021-11-17  6:25   ` Christoph Hellwig
                     ` (2 more replies)
  2021-11-17  3:38 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17  3:38 UTC (permalink / raw)
  To: linux-block; +Cc: hch, Jens Axboe

If we have a list of requests in our plug list, send it to the driver in
one go, if possible. The driver must set mq_ops->queue_rqs() to support
this; if not, the usual one-by-one path is used.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 17 +++++++++++++++++
 include/linux/blk-mq.h |  8 ++++++++
 2 files changed, 25 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9b4e79e2ac1e..005715206b16 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 	int queued = 0;
 	int errors = 0;
 
+	/*
+	 * Peek first request and see if we have a ->queue_rqs() hook. If we
+	 * do, we can dispatch the whole plug list in one go. We already know
+	 * at this point that all requests belong to the same queue, caller
+	 * must ensure that's the case.
+	 */
+	rq = rq_list_peek(&plug->mq_list);
+	if (rq->q->mq_ops->queue_rqs) {
+		rq->q->mq_ops->queue_rqs(&plug->mq_list);
+		if (rq_list_empty(plug->mq_list))
+			return;
+	}
+
 	while ((rq = rq_list_pop(&plug->mq_list))) {
 		bool last = rq_list_empty(plug->mq_list);
 		blk_status_t ret;
@@ -2256,6 +2269,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 	if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
 		blk_mq_plug_issue_direct(plug, false);
+		/*
+		 * Expected case, all requests got dispatched. If not, fall
+		 * through to individual dispatch of the remainder.
+		 */
 		if (rq_list_empty(plug->mq_list))
 			return;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 3ba1e750067b..897cf475e7eb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -503,6 +503,14 @@ struct blk_mq_ops {
 	 */
 	void (*commit_rqs)(struct blk_mq_hw_ctx *);
 
+	/**
+	 * @queue_rqs: Queue a list of new requests. Driver is guaranteed
+	 * that each request belongs to the same queue. If the driver doesn't
+	 * empty the @rqlist completely, then the rest will be queued
+	 * individually by the block layer upon return.
+	 */
+	void (*queue_rqs)(struct request **rqlist);
+
 	/**
 	 * @get_budget: Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
-- 
2.33.1
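To make the contract in the new kernel-doc concrete, a minimal driver-side
implementation could look like the sketch below (illustrative only, not part
of this series; mydrv_issue_one() is hypothetical): issue what you can, and
put anything you cannot issue back on the list so the block layer's
one-by-one fallback picks it up.

static void mydrv_queue_rqs(struct request **rqlist)
{
	struct request *requeue_list = NULL, *req;

	while ((req = rq_list_pop(rqlist))) {
		if (mydrv_issue_one(req))	/* hypothetical issue helper */
			continue;
		/* could not issue: hand it back for individual dispatch */
		rq_list_add(&requeue_list, req);
	}

	/* whatever is left on @rqlist is re-dispatched one by one */
	*rqlist = requeue_list;
}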



* [PATCH 2/4] nvme: split command copy into a helper
  2021-11-17  3:38 [PATCHSET 0/4] Add support for list issue Jens Axboe
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
@ 2021-11-17  3:38 ` Jens Axboe
  2021-11-17  6:15   ` Christoph Hellwig
  2021-11-18  7:54   ` Chaitanya Kulkarni
  2021-11-17  3:38 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
  2021-11-17  3:38 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
  3 siblings, 2 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17  3:38 UTC (permalink / raw)
  To: linux-block; +Cc: hch, Jens Axboe

We'll need it for batched submit as well.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c89f74ea00d4..c33cd1177b37 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -501,6 +501,15 @@ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
 	nvmeq->last_sq_tail = nvmeq->sq_tail;
 }
 
+static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
+				 struct nvme_command *cmd)
+{
+	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), cmd,
+		sizeof(*cmd));
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+}
+
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -511,10 +520,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
 			    bool write_sq)
 {
 	spin_lock(&nvmeq->sq_lock);
-	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
-	       cmd, sizeof(*cmd));
-	if (++nvmeq->sq_tail == nvmeq->q_depth)
-		nvmeq->sq_tail = 0;
+	nvme_copy_cmd(nvmeq, cmd);
 	nvme_write_sq_db(nvmeq, write_sq);
 	spin_unlock(&nvmeq->sq_lock);
 }
-- 
2.33.1
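For reference, the arithmetic nvme_copy_cmd() inherits from nvme_submit_cmd():
nvmeq->sqes holds the log2 of the submission queue entry size (normally 6,
i.e. 64-byte commands), so sq_tail << sqes is the byte offset of the next free
slot, and the tail wraps to zero when it reaches q_depth. With assumed values:

	/*
	 * Assumed for illustration: sqes = 6 (64-byte entries), q_depth = 1024,
	 * sq_tail = 3. The copy lands at byte offset 3 << 6 = 192 and the tail
	 * advances to 4; only the step from 1023 to 1024 wraps back to 0.
	 */
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;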



* [PATCH 3/4] nvme: separate command prep and issue
  2021-11-17  3:38 [PATCHSET 0/4] Add support for list issue Jens Axboe
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
  2021-11-17  3:38 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
@ 2021-11-17  3:38 ` Jens Axboe
  2021-11-17  6:17   ` Christoph Hellwig
  2021-11-17  3:38 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
  3 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2021-11-17  3:38 UTC (permalink / raw)
  To: linux-block; +Cc: hch, Jens Axboe

Add a nvme_prep_rq() helper to set up a command, and adapt nvme_queue_rq()
to use this helper.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 54 +++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c33cd1177b37..d2b654fc3603 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -937,18 +937,10 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
-			 const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
+				 struct request *req, struct nvme_command *cmnd)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
-	struct nvme_queue *nvmeq = hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-	struct request *req = bd->rq;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_command *cmnd = &iod->cmd;
 	blk_status_t ret;
 
 	iod->aborted = false;
@@ -956,16 +948,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	iod->npages = -1;
 	iod->nents = 0;
 
-	/*
-	 * We should not need to do this, but we're still using this to
-	 * ensure we can drain requests on a dying queue.
-	 */
-	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return BLK_STS_IOERR;
-
-	if (!nvme_check_ready(&dev->ctrl, req, true))
-		return nvme_fail_nonready_command(&dev->ctrl, req);
-
 	ret = nvme_setup_cmd(ns, req);
 	if (ret)
 		return ret;
@@ -983,7 +965,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	blk_mq_start_request(req);
-	nvme_submit_cmd(nvmeq, cmnd, bd->last);
 	return BLK_STS_OK;
 out_unmap_data:
 	nvme_unmap_data(dev, req);
@@ -992,6 +973,37 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	if (!nvme_check_ready(&dev->ctrl, req, true))
+		return nvme_fail_nonready_command(&dev->ctrl, req);
+
+	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
+	if (ret == BLK_STS_OK) {
+		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
+		return BLK_STS_OK;
+	}
+	return ret;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
2.33.1



* [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
  2021-11-17  3:38 [PATCHSET 0/4] Add support for list issue Jens Axboe
                   ` (2 preceding siblings ...)
  2021-11-17  3:38 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
@ 2021-11-17  3:38 ` Jens Axboe
  2021-11-17  8:39   ` Christoph Hellwig
  2021-11-17 19:41   ` Keith Busch
  3 siblings, 2 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17  3:38 UTC (permalink / raw)
  To: linux-block; +Cc: hch, Jens Axboe

This enables the block layer to send us a full plug list of requests
that need submitting. The block layer guarantees that they all belong
to the same queue, but we do have to check the hardware queue mapping
for each request.

If errors are encountered, leave them in the passed in list. Then the
block layer will handle them individually.

This is good for about a 4% improvement in peak performance, taking us
from 9.6M to 10M IOPS/core.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 67 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d2b654fc3603..2eedd04b1f90 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1004,6 +1004,72 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+{
+	spin_lock(&nvmeq->sq_lock);
+	while (!rq_list_empty(*rqlist)) {
+		struct request *req = rq_list_pop(rqlist);
+		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+		nvme_copy_cmd(nvmeq, absolute_pointer(&iod->cmd));
+	}
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_queue_rqs(struct request **rqlist)
+{
+	struct request *requeue_list = NULL, *req, *prev = NULL;
+	struct blk_mq_hw_ctx *hctx;
+	struct nvme_queue *nvmeq;
+	struct nvme_ns *ns;
+
+restart:
+	req = rq_list_peek(rqlist);
+	hctx = req->mq_hctx;
+	nvmeq = hctx->driver_data;
+	ns = hctx->queue->queuedata;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return;
+
+	rq_list_for_each(rqlist, req) {
+		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+		blk_status_t ret;
+
+		if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+			goto requeue;
+
+		if (req->mq_hctx != hctx) {
+			/* detach rest of list, and submit */
+			prev->rq_next = NULL;
+			nvme_submit_cmds(nvmeq, rqlist);
+			/* req now start of new list for this hw queue */
+			*rqlist = req;
+			goto restart;
+		}
+
+		hctx->tags->rqs[req->tag] = req;
+		ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
+		if (ret == BLK_STS_OK) {
+			prev = req;
+			continue;
+		}
+requeue:
+		/* detach 'req' and add to remainder list */
+		if (prev)
+			prev->rq_next = req->rq_next;
+		rq_list_add(&requeue_list, req);
+	}
+
+	nvme_submit_cmds(nvmeq, rqlist);
+	*rqlist = requeue_list;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1741,6 +1807,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
+	.queue_rqs	= nvme_queue_rqs,
 	.complete	= nvme_pci_complete_rq,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
-- 
2.33.1



* Re: [PATCH 2/4] nvme: split command copy into a helper
  2021-11-17  3:38 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
@ 2021-11-17  6:15   ` Christoph Hellwig
  2021-11-17 15:44     ` Jens Axboe
  2021-11-18  7:54   ` Chaitanya Kulkarni
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2021-11-17  6:15 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

On Tue, Nov 16, 2021 at 08:38:05PM -0700, Jens Axboe wrote:
>  /**
>   * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
>   * @nvmeq: The queue to use
> @@ -511,10 +520,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
>  			    bool write_sq)
>  {
>  	spin_lock(&nvmeq->sq_lock);
> -	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
> -	       cmd, sizeof(*cmd));
> -	if (++nvmeq->sq_tail == nvmeq->q_depth)
> -		nvmeq->sq_tail = 0;
> +	nvme_copy_cmd(nvmeq, cmd);
>  	nvme_write_sq_db(nvmeq, write_sq);
>  	spin_unlock(&nvmeq->sq_lock);

Given that nvme_submit_cmd only has two callers, I'd be tempted to just
open code it in the callers rather than creating a deep callchain here.


* Re: [PATCH 3/4] nvme: separate command prep and issue
  2021-11-17  3:38 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
@ 2021-11-17  6:17   ` Christoph Hellwig
  2021-11-17 15:45     ` Jens Axboe
  2021-11-18  7:59     ` Chaitanya Kulkarni
  0 siblings, 2 replies; 31+ messages in thread
From: Christoph Hellwig @ 2021-11-17  6:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

On Tue, Nov 16, 2021 at 08:38:06PM -0700, Jens Axboe wrote:
> +	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
> +	if (ret == BLK_STS_OK) {
> +		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
> +		return BLK_STS_OK;
> +	}
> +	return ret;

I'd prefer the traditional handle errors outside the straight path
order here:

	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
	if (ret)
		return ret;
	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
	return BLK_STS_OK;



* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
@ 2021-11-17  6:25   ` Christoph Hellwig
  2021-11-17 15:41     ` Jens Axboe
  2021-11-17  8:20   ` Ming Lei
  2021-11-17 20:41   ` Keith Busch
  2 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2021-11-17  6:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>  	int queued = 0;
>  	int errors = 0;
>  
> +	/*
> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
> +	 * do, we can dispatch the whole plug list in one go. We already know
> +	 * at this point that all requests belong to the same queue, caller
> +	 * must ensure that's the case.
> +	 */
> +	rq = rq_list_peek(&plug->mq_list);
> +	if (rq->q->mq_ops->queue_rqs) {
> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
> +		if (rq_list_empty(plug->mq_list))
> +			return;
> +	}

I'd move this straight into the caller to simplify the flow, something
like the patch below.  Also I wonder if the slow path might want to
be moved from blk_mq_flush_plug_list into a separate helper to keep
the fast path as straight as possible?

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ab34c4f20daf..370f4b2ab4511 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2255,6 +2255,14 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	plug->rq_count = 0;
 
 	if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+		struct request_queue *q = rq_list_peek(&plug->mq_list)->q;
+
+		if (q->mq_ops->queue_rqs) {
+			q->mq_ops->queue_rqs(&plug->mq_list);
+			if (rq_list_empty(plug->mq_list))
+				return;
+		}
+
 		blk_mq_plug_issue_direct(plug, false);
 		if (rq_list_empty(plug->mq_list))
 			return;


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
  2021-11-17  6:25   ` Christoph Hellwig
@ 2021-11-17  8:20   ` Ming Lei
  2021-11-17 15:43     ` Jens Axboe
  2021-11-17 20:41   ` Keith Busch
  2 siblings, 1 reply; 31+ messages in thread
From: Ming Lei @ 2021-11-17  8:20 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> If we have a list of requests in our plug list, send it to the driver in
> one go, if possible. The driver must set mq_ops->queue_rqs() to support
> this, if not the usual one-by-one path is used.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq.c         | 17 +++++++++++++++++
>  include/linux/blk-mq.h |  8 ++++++++
>  2 files changed, 25 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 9b4e79e2ac1e..005715206b16 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>  	int queued = 0;
>  	int errors = 0;
>  
> +	/*
> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
> +	 * do, we can dispatch the whole plug list in one go. We already know
> +	 * at this point that all requests belong to the same queue, caller
> +	 * must ensure that's the case.
> +	 */
> +	rq = rq_list_peek(&plug->mq_list);
> +	if (rq->q->mq_ops->queue_rqs) {
> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
> +		if (rq_list_empty(plug->mq_list))
> +			return;
> +	}
> +

Then BLK_MQ_F_TAG_QUEUE_SHARED isn't handled as before for multiple NVMe
NS.


thanks,
Ming



* Re: [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
  2021-11-17  3:38 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
@ 2021-11-17  8:39   ` Christoph Hellwig
  2021-11-17 15:55     ` Jens Axboe
  2021-11-17 19:41   ` Keith Busch
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2021-11-17  8:39 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.
> 
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
> 
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.

The concept looks sensible, but the loop in nvme_queue_rqs is a complete
mess to follow. What about something like this (untested) on top?

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 13722cc400c2c..555a7609580c7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
 		nvmeq->sq_tail = 0;
 }
 
-/**
- * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
- * @nvmeq: The queue to use
- * @cmd: The command to send
- * @write_sq: whether to write to the SQ doorbell
- */
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
-			    bool write_sq)
-{
-	spin_lock(&nvmeq->sq_lock);
-	nvme_copy_cmd(nvmeq, cmd);
-	nvme_write_sq_db(nvmeq, write_sq);
-	spin_unlock(&nvmeq->sq_lock);
-}
-
 static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -918,8 +903,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
-				 struct request *req, struct nvme_command *cmnd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
@@ -928,18 +912,18 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 	iod->npages = -1;
 	iod->nents = 0;
 
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
@@ -959,7 +943,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct nvme_ns *ns,
 static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
@@ -976,12 +959,15 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!nvme_check_ready(&dev->ctrl, req, true))
 		return nvme_fail_nonready_command(&dev->ctrl, req);
 
-	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
-	if (ret == BLK_STS_OK) {
-		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
-		return BLK_STS_OK;
-	}
-	return ret;
+	ret = nvme_prep_rq(dev, req);
+	if (ret)
+		return ret;
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, bd->last);
+	spin_unlock(&nvmeq->sq_lock);
+	return BLK_STS_OK;
 }
 
 static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
@@ -997,56 +983,47 @@ static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 {
-	struct request *requeue_list = NULL, *req, *prev = NULL;
-	struct blk_mq_hw_ctx *hctx;
-	struct nvme_queue *nvmeq;
-	struct nvme_ns *ns;
-
-restart:
-	req = rq_list_peek(rqlist);
-	hctx = req->mq_hctx;
-	nvmeq = hctx->driver_data;
-	ns = hctx->queue->queuedata;
-
 	/*
 	 * We should not need to do this, but we're still using this to
 	 * ensure we can drain requests on a dying queue.
 	 */
 	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return;
+		return false;
+	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
+		return false;
 
-	rq_list_for_each(rqlist, req) {
-		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-		blk_status_t ret;
+	req->mq_hctx->tags->rqs[req->tag] = req;
+	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+}
 
-		if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
-			goto requeue;
+static void nvme_queue_rqs(struct request **rqlist)
+{
+	struct request *req = rq_list_peek(rqlist), *prev = NULL;
+	struct request *requeue_list = NULL;
+
+	do {
+		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+
+		if (!nvme_prep_rq_batch(nvmeq, req)) {
+			/* detach 'req' and add to remainder list */
+			if (prev)
+				prev->rq_next = req->rq_next;
+			rq_list_add(&requeue_list, req);
+		} else {
+			prev = req;
+		}
 
-		if (req->mq_hctx != hctx) {
+		req = rq_list_next(req);
+		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
 			/* detach rest of list, and submit */
 			prev->rq_next = NULL;
 			nvme_submit_cmds(nvmeq, rqlist);
-			/* req now start of new list for this hw queue */
 			*rqlist = req;
-			goto restart;
-		}
-
-		hctx->tags->rqs[req->tag] = req;
-		ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
-		if (ret == BLK_STS_OK) {
-			prev = req;
-			continue;
 		}
-requeue:
-		/* detach 'req' and add to remainder list */
-		if (prev)
-			prev->rq_next = req->rq_next;
-		rq_list_add(&requeue_list, req);
-	}
+	} while (req);
 
-	nvme_submit_cmds(nvmeq, rqlist);
 	*rqlist = requeue_list;
 }
 
@@ -1224,7 +1201,11 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-	nvme_submit_cmd(nvmeq, &c, true);
+
+	spin_lock(&nvmeq->sq_lock);
+	nvme_copy_cmd(nvmeq, &c);
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
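Read flat rather than as a diff, and reconstructed purely from the '+' lines
above (untested, exactly as noted), the proposed helpers come out as:

static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
{
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return false;
	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
		return false;

	req->mq_hctx->tags->rqs[req->tag] = req;
	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}

static void nvme_queue_rqs(struct request **rqlist)
{
	struct request *req = rq_list_peek(rqlist), *prev = NULL;
	struct request *requeue_list = NULL;

	do {
		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;

		if (!nvme_prep_rq_batch(nvmeq, req)) {
			/* detach 'req' and add to remainder list */
			if (prev)
				prev->rq_next = req->rq_next;
			rq_list_add(&requeue_list, req);
		} else {
			prev = req;
		}

		req = rq_list_next(req);
		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
			/* detach rest of list, and submit */
			prev->rq_next = NULL;
			nvme_submit_cmds(nvmeq, rqlist);
			*rqlist = req;
		}
	} while (req);

	*rqlist = requeue_list;
}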


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  6:25   ` Christoph Hellwig
@ 2021-11-17 15:41     ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 11/16/21 11:25 PM, Christoph Hellwig wrote:
>> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>>  	int queued = 0;
>>  	int errors = 0;
>>  
>> +	/*
>> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
>> +	 * do, we can dispatch the whole plug list in one go. We already know
>> +	 * at this point that all requests belong to the same queue, caller
>> +	 * must ensure that's the case.
>> +	 */
>> +	rq = rq_list_peek(&plug->mq_list);
>> +	if (rq->q->mq_ops->queue_rqs) {
>> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
>> +		if (rq_list_empty(plug->mq_list))
>> +			return;
>> +	}
> 
> I'd move this straight into the caller to simplify the follow, something
> like the patch below.  Also I wonder if the slow path might want to
> be moved from blk_mq_flush_plug_list into a separate helper to keep
> the fast path as straight as possible?

Yes, I think that's better. I'll fold that in.

-- 
Jens Axboe



* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  8:20   ` Ming Lei
@ 2021-11-17 15:43     ` Jens Axboe
  2021-11-17 20:48       ` Keith Busch
  0 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:43 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, hch

On 11/17/21 1:20 AM, Ming Lei wrote:
> On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
>> If we have a list of requests in our plug list, send it to the driver in
>> one go, if possible. The driver must set mq_ops->queue_rqs() to support
>> this, if not the usual one-by-one path is used.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  block/blk-mq.c         | 17 +++++++++++++++++
>>  include/linux/blk-mq.h |  8 ++++++++
>>  2 files changed, 25 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 9b4e79e2ac1e..005715206b16 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>>  	int queued = 0;
>>  	int errors = 0;
>>  
>> +	/*
>> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
>> +	 * do, we can dispatch the whole plug list in one go. We already know
>> +	 * at this point that all requests belong to the same queue, caller
>> +	 * must ensure that's the case.
>> +	 */
>> +	rq = rq_list_peek(&plug->mq_list);
>> +	if (rq->q->mq_ops->queue_rqs) {
>> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
>> +		if (rq_list_empty(plug->mq_list))
>> +			return;
>> +	}
>> +
> 
> Then BLK_MQ_F_TAG_QUEUE_SHARED isn't handled as before for multiple NVMe
> NS.

Can you expand? If we have multiple namespaces in the plug list, we have
multiple queues. There's no direct issue of the list if that's the case.
Or maybe I'm missing what you mean here?

-- 
Jens Axboe



* Re: [PATCH 2/4] nvme: split command copy into a helper
  2021-11-17  6:15   ` Christoph Hellwig
@ 2021-11-17 15:44     ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 11/16/21 11:15 PM, Christoph Hellwig wrote:
> On Tue, Nov 16, 2021 at 08:38:05PM -0700, Jens Axboe wrote:
>>  /**
>>   * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
>>   * @nvmeq: The queue to use
>> @@ -511,10 +520,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
>>  			    bool write_sq)
>>  {
>>  	spin_lock(&nvmeq->sq_lock);
>> -	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
>> -	       cmd, sizeof(*cmd));
>> -	if (++nvmeq->sq_tail == nvmeq->q_depth)
>> -		nvmeq->sq_tail = 0;
>> +	nvme_copy_cmd(nvmeq, cmd);
>>  	nvme_write_sq_db(nvmeq, write_sq);
>>  	spin_unlock(&nvmeq->sq_lock);
> 
> Given that nvme_submit_cmd only has two callers, I'd be tempted to just
> open code in the callers rather than creating a deep callchain here.

It's inlined, but if you prefer duplicating the code, then I can just
drop this patch.

-- 
Jens Axboe



* Re: [PATCH 3/4] nvme: separate command prep and issue
  2021-11-17  6:17   ` Christoph Hellwig
@ 2021-11-17 15:45     ` Jens Axboe
  2021-11-18  7:59     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 11/16/21 11:17 PM, Christoph Hellwig wrote:
> On Tue, Nov 16, 2021 at 08:38:06PM -0700, Jens Axboe wrote:
>> +	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
>> +	if (ret == BLK_STS_OK) {
>> +		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
>> +		return BLK_STS_OK;
>> +	}
>> +	return ret;
> 
> I'd prefer the traditional handle errors outside the straight path
> order here:
> 
> 	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
> 	if (ret)
> 		return ret;
> 	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
> 	return BLK_STS_OK;

Sure, changed.

-- 
Jens Axboe



* Re: [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
  2021-11-17  8:39   ` Christoph Hellwig
@ 2021-11-17 15:55     ` Jens Axboe
  2021-11-17 15:58       ` Jens Axboe
  0 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 11/17/21 1:39 AM, Christoph Hellwig wrote:
> On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
>> This enables the block layer to send us a full plug list of requests
>> that need submitting. The block layer guarantees that they all belong
>> to the same queue, but we do have to check the hardware queue mapping
>> for each request.
>>
>> If errors are encountered, leave them in the passed in list. Then the
>> block layer will handle them individually.
>>
>> This is good for about a 4% improvement in peak performance, taking us
>> from 9.6M to 10M IOPS/core.
> 
> The concept looks sensible, but the loop in nvme_queue_rqs is a complete
> mess to follow. What about something like this (untested) on top?

Let me take a closer look.

> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 13722cc400c2c..555a7609580c7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -509,21 +509,6 @@ static inline void nvme_copy_cmd(struct nvme_queue *nvmeq,
>  		nvmeq->sq_tail = 0;
>  }
>  
> -/**
> - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
> - * @nvmeq: The queue to use
> - * @cmd: The command to send
> - * @write_sq: whether to write to the SQ doorbell
> - */
> -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
> -			    bool write_sq)
> -{
> -	spin_lock(&nvmeq->sq_lock);
> -	nvme_copy_cmd(nvmeq, cmd);
> -	nvme_write_sq_db(nvmeq, write_sq);
> -	spin_unlock(&nvmeq->sq_lock);
> -}

You really don't like helpers? Code generation wise it doesn't matter,
but without this and the copy helper we do end up having some trivial
duplicated code...

-- 
Jens Axboe



* Re: [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
  2021-11-17 15:55     ` Jens Axboe
@ 2021-11-17 15:58       ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-11-17 15:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 11/17/21 8:55 AM, Jens Axboe wrote:
> On 11/17/21 1:39 AM, Christoph Hellwig wrote:
>> On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
>>> This enables the block layer to send us a full plug list of requests
>>> that need submitting. The block layer guarantees that they all belong
>>> to the same queue, but we do have to check the hardware queue mapping
>>> for each request.
>>>
>>> If errors are encountered, leave them in the passed in list. Then the
>>> block layer will handle them individually.
>>>
>>> This is good for about a 4% improvement in peak performance, taking us
>>> from 9.6M to 10M IOPS/core.
>>
>> The concept looks sensible, but the loop in nvme_queue_rqs is a complete
>> mess to follow. What about something like this (untested) on top?
> 
> Let me take a closer look.

Something changed, efficiency is way down:

     2.26%     +4.34%  [nvme]            [k] nvme_queue_rqs


-- 
Jens Axboe



* Re: [PATCH 4/4] nvme: add support for mq_ops->queue_rqs()
  2021-11-17  3:38 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
  2021-11-17  8:39   ` Christoph Hellwig
@ 2021-11-17 19:41   ` Keith Busch
  1 sibling, 0 replies; 31+ messages in thread
From: Keith Busch @ 2021-11-17 19:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

[adding linux-nvme]

On Tue, Nov 16, 2021 at 08:38:07PM -0700, Jens Axboe wrote:
> This enables the block layer to send us a full plug list of requests
> that need submitting. The block layer guarantees that they all belong
> to the same queue, but we do have to check the hardware queue mapping
> for each request.

So this means that the nvme namespace will always be the same for all
the requests in the list, but the rqlist may contain requests allocated
from different CPUs?
 
> If errors are encountered, leave them in the passed in list. Then the
> block layer will handle them individually.
> 
> This is good for about a 4% improvement in peak performance, taking us
> from 9.6M to 10M IOPS/core.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 67 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 67 insertions(+)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d2b654fc3603..2eedd04b1f90 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1004,6 +1004,72 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>  
> +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
> +{
> +	spin_lock(&nvmeq->sq_lock);
> +	while (!rq_list_empty(*rqlist)) {
> +		struct request *req = rq_list_pop(rqlist);
> +		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +		nvme_copy_cmd(nvmeq, absolute_pointer(&iod->cmd));
> +	}
> +	nvme_write_sq_db(nvmeq, true);
> +	spin_unlock(&nvmeq->sq_lock);
> +}
> +
> +static void nvme_queue_rqs(struct request **rqlist)
> +{
> +	struct request *requeue_list = NULL, *req, *prev = NULL;
> +	struct blk_mq_hw_ctx *hctx;
> +	struct nvme_queue *nvmeq;
> +	struct nvme_ns *ns;
> +
> +restart:
> +	req = rq_list_peek(rqlist);
> +	hctx = req->mq_hctx;
> +	nvmeq = hctx->driver_data;
> +	ns = hctx->queue->queuedata;
> +
> +	/*
> +	 * We should not need to do this, but we're still using this to
> +	 * ensure we can drain requests on a dying queue.
> +	 */
> +	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
> +		return;
> +
> +	rq_list_for_each(rqlist, req) {
> +		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +		blk_status_t ret;
> +
> +		if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
> +			goto requeue;
> +
> +		if (req->mq_hctx != hctx) {
> +			/* detach rest of list, and submit */
> +			prev->rq_next = NULL;
> +			nvme_submit_cmds(nvmeq, rqlist);
> +			/* req now start of new list for this hw queue */
> +			*rqlist = req;
> +			goto restart;
> +		}
> +
> +		hctx->tags->rqs[req->tag] = req;

After checking how this is handled previously, it appears this new
.queue_rqs() skips incrementing active requests, and bypasses the
hctx_lock(). Won't that break quiesce?

> +		ret = nvme_prep_rq(nvmeq->dev, ns, req, &iod->cmd);
> +		if (ret == BLK_STS_OK) {
> +			prev = req;
> +			continue;
> +		}
> +requeue:
> +		/* detach 'req' and add to remainder list */
> +		if (prev)
> +			prev->rq_next = req->rq_next;
> +		rq_list_add(&requeue_list, req);
> +	}
> +
> +	nvme_submit_cmds(nvmeq, rqlist);
> +	*rqlist = requeue_list;
> +}
> +
>  static __always_inline void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -1741,6 +1807,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
>  
>  static const struct blk_mq_ops nvme_mq_ops = {
>  	.queue_rq	= nvme_queue_rq,
> +	.queue_rqs	= nvme_queue_rqs,
>  	.complete	= nvme_pci_complete_rq,
>  	.commit_rqs	= nvme_commit_rqs,
>  	.init_hctx	= nvme_init_hctx,
> -- 


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
  2021-11-17  6:25   ` Christoph Hellwig
  2021-11-17  8:20   ` Ming Lei
@ 2021-11-17 20:41   ` Keith Busch
  2021-11-18  0:18     ` Ming Lei
  2 siblings, 1 reply; 31+ messages in thread
From: Keith Busch @ 2021-11-17 20:41 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, hch

On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> If we have a list of requests in our plug list, send it to the driver in
> one go, if possible. The driver must set mq_ops->queue_rqs() to support
> this, if not the usual one-by-one path is used.

It looks like we still need to sync with the request_queue quiesce flag.
Something like the following (untested) on top of this patch should do
it:

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 688ebf6a7a7b..447d0b77375d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -263,6 +263,9 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
 	unsigned int i;
 	bool rcu = false;
 
+	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
+		synchronize_srcu(q->srcu);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
 			synchronize_srcu(hctx->srcu);
@@ -2201,6 +2204,25 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
 	*queued = 0;
 }
 
+static void queue_lock(struct request_queue *q, int *srcu_idx)
+	__acquires(q->srcu)
+{
+	if (!(q->tag_set->flags & BLK_MQ_F_BLOCKING)) {
+		*srcu_idx = 0;
+		rcu_read_lock();
+	} else
+		*srcu_idx = srcu_read_lock(q->srcu);
+}
+
+static void queue_unlock(struct request_queue *q, int srcu_idx)
+	__releases(q->srcu)
+{
+	if (!(q->tag_set->flags & BLK_MQ_F_BLOCKING))
+		rcu_read_unlock();
+	else
+		srcu_read_unlock(q->srcu, srcu_idx);
+}
+
 static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 {
 	struct blk_mq_hw_ctx *hctx = NULL;
@@ -2216,7 +2238,14 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 	 */
 	rq = rq_list_peek(&plug->mq_list);
 	if (rq->q->mq_ops->queue_rqs) {
-		rq->q->mq_ops->queue_rqs(&plug->mq_list);
+		struct request_queue *q = rq->q;
+		int srcu_idx;
+
+		queue_lock(q, &srcu_idx);
+		if (!blk_queue_quiesced(q))
+			q->mq_ops->queue_rqs(&plug->mq_list);
+		queue_unlock(q, srcu_idx);
+
 		if (rq_list_empty(plug->mq_list))
 			return;
 	}
@@ -3727,6 +3756,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
 	q->tag_set = set;
+	if (set->flags & BLK_MQ_F_BLOCKING)
+		init_srcu_struct(q->srcu);
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 	if (set->nr_maps > HCTX_TYPE_POLL &&
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd4370baccca..ae7591dc9cbb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -373,6 +373,8 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	struct srcu_struct	srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
--


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17 15:43     ` Jens Axboe
@ 2021-11-17 20:48       ` Keith Busch
  2021-11-17 23:59         ` Ming Lei
  0 siblings, 1 reply; 31+ messages in thread
From: Keith Busch @ 2021-11-17 20:48 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Ming Lei, linux-block, hch

On Wed, Nov 17, 2021 at 08:43:02AM -0700, Jens Axboe wrote:
> On 11/17/21 1:20 AM, Ming Lei wrote:
> > On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> >> If we have a list of requests in our plug list, send it to the driver in
> >> one go, if possible. The driver must set mq_ops->queue_rqs() to support
> >> this, if not the usual one-by-one path is used.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> >>  block/blk-mq.c         | 17 +++++++++++++++++
> >>  include/linux/blk-mq.h |  8 ++++++++
> >>  2 files changed, 25 insertions(+)
> >>
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index 9b4e79e2ac1e..005715206b16 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
> >>  	int queued = 0;
> >>  	int errors = 0;
> >>  
> >> +	/*
> >> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
> >> +	 * do, we can dispatch the whole plug list in one go. We already know
> >> +	 * at this point that all requests belong to the same queue, caller
> >> +	 * must ensure that's the case.
> >> +	 */
> >> +	rq = rq_list_peek(&plug->mq_list);
> >> +	if (rq->q->mq_ops->queue_rqs) {
> >> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
> >> +		if (rq_list_empty(plug->mq_list))
> >> +			return;
> >> +	}
> >> +
> > 
> > Then BLK_MQ_F_TAG_QUEUE_SHARED isn't handled as before for multiple NVMe
> > NS.
> 
> Can you expand? If we have multiple namespaces in the plug list, we have
> multiple queues. There's no direct issue of the list if that's the case.
> Or maybe I'm missing what you mean here?

If the plug list only has requests for one namespace, I think Ming is
referring to the special accounting for BLK_MQ_F_TAG_QUEUE_SHARED in
__blk_mq_get_driver_tag() that normally gets called before dispatching
to the driver, but isn't getting called when using .queue_rqs().
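For readers following along, the accounting being referenced looks roughly
like the sketch below; this is a paraphrase of the 5.16-era blk-mq code from
memory, not a verbatim copy, so treat the helper names as approximate. On the
normal dispatch path, assigning a driver tag on a shared tag set marks the
request in-flight and bumps the hctx active count, which is what the
fair-sharing limit is computed from; a ->queue_rqs() path that bypasses that
step leaves the count untouched.

static void shared_tag_accounting_sketch(struct blk_mq_hw_ctx *hctx,
					 struct request *rq)
{
	if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
	    !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
		rq->rq_flags |= RQF_MQ_INFLIGHT;
		__blk_mq_inc_active_requests(hctx);
	}
}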


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17 20:48       ` Keith Busch
@ 2021-11-17 23:59         ` Ming Lei
  0 siblings, 0 replies; 31+ messages in thread
From: Ming Lei @ 2021-11-17 23:59 UTC (permalink / raw)
  To: Keith Busch; +Cc: Jens Axboe, linux-block, hch

On Wed, Nov 17, 2021 at 12:48:14PM -0800, Keith Busch wrote:
> On Wed, Nov 17, 2021 at 08:43:02AM -0700, Jens Axboe wrote:
> > On 11/17/21 1:20 AM, Ming Lei wrote:
> > > On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> > >> If we have a list of requests in our plug list, send it to the driver in
> > >> one go, if possible. The driver must set mq_ops->queue_rqs() to support
> > >> this, if not the usual one-by-one path is used.
> > >>
> > >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > >> ---
> > >>  block/blk-mq.c         | 17 +++++++++++++++++
> > >>  include/linux/blk-mq.h |  8 ++++++++
> > >>  2 files changed, 25 insertions(+)
> > >>
> > >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> > >> index 9b4e79e2ac1e..005715206b16 100644
> > >> --- a/block/blk-mq.c
> > >> +++ b/block/blk-mq.c
> > >> @@ -2208,6 +2208,19 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
> > >>  	int queued = 0;
> > >>  	int errors = 0;
> > >>  
> > >> +	/*
> > >> +	 * Peek first request and see if we have a ->queue_rqs() hook. If we
> > >> +	 * do, we can dispatch the whole plug list in one go. We already know
> > >> +	 * at this point that all requests belong to the same queue, caller
> > >> +	 * must ensure that's the case.
> > >> +	 */
> > >> +	rq = rq_list_peek(&plug->mq_list);
> > >> +	if (rq->q->mq_ops->queue_rqs) {
> > >> +		rq->q->mq_ops->queue_rqs(&plug->mq_list);
> > >> +		if (rq_list_empty(plug->mq_list))
> > >> +			return;
> > >> +	}
> > >> +
> > > 
> > > Then BLK_MQ_F_TAG_QUEUE_SHARED isn't handled as before for multiple NVMe
> > > NS.
> > 
> > Can you expand? If we have multiple namespaces in the plug list, we have
> > multiple queues. There's no direct issue of the list if that's the case.
> > Or maybe I'm missing what you mean here?
> 
> If the plug list only has requests for one namespace, I think Ming is
> referring to the special accounting for BLK_MQ_F_TAG_QUEUE_SHARED in
> __blk_mq_get_driver_tag() that normally gets called before dispatching
> to the driver, but isn't getting called when using .queue_rqs().

Yeah, that is it. This is one common case: each task runs I/O on a
different namespace, but all NSs share/contend for the single set of host
tags, and BLK_MQ_F_TAG_QUEUE_SHARED is supposed to provide fair allocation
among all these NSs.


thanks, 
Ming



* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-17 20:41   ` Keith Busch
@ 2021-11-18  0:18     ` Ming Lei
  2021-11-18  2:02       ` Keith Busch
  0 siblings, 1 reply; 31+ messages in thread
From: Ming Lei @ 2021-11-18  0:18 UTC (permalink / raw)
  To: Keith Busch; +Cc: Jens Axboe, linux-block, hch

On Wed, Nov 17, 2021 at 12:41:01PM -0800, Keith Busch wrote:
> On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> > If we have a list of requests in our plug list, send it to the driver in
> > one go, if possible. The driver must set mq_ops->queue_rqs() to support
> > this, if not the usual one-by-one path is used.
> 
> It looks like we still need to sync with the request_queue quiesce flag.

I think this approach is good.

> Something like the following (untested) on top of this patch should do
> it:
> 
> ---
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 688ebf6a7a7b..447d0b77375d 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -263,6 +263,9 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
>  	unsigned int i;
>  	bool rcu = false;
>  
> +	if (q->tag_set->flags & BLK_MQ_F_BLOCKING)
> +		synchronize_srcu(q->srcu);
> +
>  	queue_for_each_hw_ctx(q, hctx, i) {
>  		if (hctx->flags & BLK_MQ_F_BLOCKING)
>  			synchronize_srcu(hctx->srcu);
> @@ -2201,6 +2204,25 @@ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued,
>  	*queued = 0;
>  }
>  
> +static void queue_lock(struct request_queue *q, int *srcu_idx)
> +	__acquires(q->srcu)
> +{
> +	if (!(q->tag_set->flags & BLK_MQ_F_BLOCKING)) {
> +		*srcu_idx = 0;
> +		rcu_read_lock();
> +	} else
> +		*srcu_idx = srcu_read_lock(q->srcu);
> +}
> +
> +static void queue_unlock(struct request_queue *q, int srcu_idx)
> +	__releases(q->srcu)
> +{
> +	if (!(q->tag_set->flags & BLK_MQ_F_BLOCKING))
> +		rcu_read_unlock();
> +	else
> +		srcu_read_unlock(q->srcu, srcu_idx);
> +}
> +
>  static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>  {
>  	struct blk_mq_hw_ctx *hctx = NULL;
> @@ -2216,7 +2238,14 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
>  	 */
>  	rq = rq_list_peek(&plug->mq_list);
>  	if (rq->q->mq_ops->queue_rqs) {
> -		rq->q->mq_ops->queue_rqs(&plug->mq_list);
> +		struct request_queue *q = rq->q;
> +		int srcu_idx;
> +
> +		queue_lock(q, &srcu_idx);
> +		if (!blk_queue_quiesced(q))
> +			q->mq_ops->queue_rqs(&plug->mq_list);
> +		queue_unlock(q, srcu_idx);
> +
>  		if (rq_list_empty(plug->mq_list))
>  			return;
>  	}
> @@ -3727,6 +3756,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>  	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
>  
>  	q->tag_set = set;
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		init_srcu_struct(q->srcu);
>  
>  	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
>  	if (set->nr_maps > HCTX_TYPE_POLL &&
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index bd4370baccca..ae7591dc9cbb 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -373,6 +373,8 @@ struct request_queue {
>  	 * devices that do not have multiple independent access ranges.
>  	 */
>  	struct blk_independent_access_ranges *ia_ranges;
> +
> +	struct srcu_struct	srcu[];

Basically it is the same as my previous post[1], but the above patch doesn't
handle request queue allocation/freeing correctly in the case of BLK_MQ_F_BLOCKING.

[1] https://lore.kernel.org/linux-block/20211103160018.3764976-1-ming.lei@redhat.com/


Thanks,
Ming



* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-18  0:18     ` Ming Lei
@ 2021-11-18  2:02       ` Keith Busch
  2021-11-18  2:14         ` Ming Lei
  0 siblings, 1 reply; 31+ messages in thread
From: Keith Busch @ 2021-11-18  2:02 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, hch

On Thu, Nov 18, 2021 at 08:18:29AM +0800, Ming Lei wrote:
> On Wed, Nov 17, 2021 at 12:41:01PM -0800, Keith Busch wrote:
> > On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> > > If we have a list of requests in our plug list, send it to the driver in
> > > one go, if possible. The driver must set mq_ops->queue_rqs() to support
> > > this, if not the usual one-by-one path is used.
> > 
> > It looks like we still need to sync with the request_queue quiesce flag.
> 
> Basically it is same with my previous post[1], but the above patch doesn't
> handle request queue allocation/freeing correctly in case of BLK_MQ_F_BLOCKING.

Thanks for the pointer. I also thought it may be just as well that blocking
dispatchers don't get to use the optimized rqlist queueing, which would
simplify quiesce to the normal rcu usage.
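A sketch of that simpler option, expressed against the hunk from patch 1
(illustrative only; gating on BLK_MQ_F_BLOCKING here is an assumption about
how it might be done): restrict ->queue_rqs() to non-blocking tag sets, so
that plain RCU is enough to synchronize against quiesce.

	rq = rq_list_peek(&plug->mq_list);
	if (rq->q->mq_ops->queue_rqs &&
	    !(rq->q->tag_set->flags & BLK_MQ_F_BLOCKING)) {
		rcu_read_lock();
		if (!blk_queue_quiesced(rq->q))
			rq->q->mq_ops->queue_rqs(&plug->mq_list);
		rcu_read_unlock();
		if (rq_list_empty(plug->mq_list))
			return;
	}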


* Re: [PATCH 1/4] block: add mq_ops->queue_rqs hook
  2021-11-18  2:02       ` Keith Busch
@ 2021-11-18  2:14         ` Ming Lei
  0 siblings, 0 replies; 31+ messages in thread
From: Ming Lei @ 2021-11-18  2:14 UTC (permalink / raw)
  To: Keith Busch; +Cc: Jens Axboe, linux-block, hch, Sagi Grimberg

On Wed, Nov 17, 2021 at 07:02:27PM -0700, Keith Busch wrote:
> On Thu, Nov 18, 2021 at 08:18:29AM +0800, Ming Lei wrote:
> > On Wed, Nov 17, 2021 at 12:41:01PM -0800, Keith Busch wrote:
> > > On Tue, Nov 16, 2021 at 08:38:04PM -0700, Jens Axboe wrote:
> > > > If we have a list of requests in our plug list, send it to the driver in
> > > > one go, if possible. The driver must set mq_ops->queue_rqs() to support
> > > > this, if not the usual one-by-one path is used.
> > > 
> > > It looks like we still need to sync with the request_queue quiesce flag.
> > 
> > Basically it is same with my previous post[1], but the above patch doesn't
> > handle request queue allocation/freeing correctly in case of BLK_MQ_F_BLOCKING.
> 
> Thanks for the pointer. I also thought it may be just as well that blocking
> dispatchers don't get to use the optimized rqlist queueing,

Yeah, that is fine, but nvme-tcp may not benefit from the ->queue_rqs() optimization.

> which would simplify quiesce to the normal rcu usage.

The normal dispatch code still needs to check the quiesced state for a
blocking queue, so srcu is still needed; I don't see an easier way than
srcu yet. Years ago, I tried to replace the srcu with a percpu refcount,
which is still more complicated than srcu, but the srcu_index can be
dropped.


Thanks,
Ming
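For context, the percpu-refcount idea would mirror the pattern blk-mq already
uses for q_usage_counter freezing, just applied to dispatch versus quiesce. A
rough sketch (illustrative only; dispatch_ref and quiesce_wq are hypothetical
request_queue fields, and a real version would have to decide what the
dispatch path does when the tryget fails):

/* release callback passed to percpu_ref_init() */
static void dispatch_ref_release(struct percpu_ref *ref)
{
	struct request_queue *q = container_of(ref, struct request_queue,
					       dispatch_ref);

	wake_up_all(&q->quiesce_wq);
}

static bool enter_dispatch(struct request_queue *q)
{
	/* fails once the ref has been killed, i.e. while quiesced */
	return percpu_ref_tryget_live(&q->dispatch_ref);
}

static void exit_dispatch(struct request_queue *q)
{
	percpu_ref_put(&q->dispatch_ref);
}

static void quiesce(struct request_queue *q)
{
	percpu_ref_kill(&q->dispatch_ref);
	wait_event(q->quiesce_wq, percpu_ref_is_zero(&q->dispatch_ref));
}

static void unquiesce(struct request_queue *q)
{
	percpu_ref_resurrect(&q->dispatch_ref);
}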



* Re: [PATCH 2/4] nvme: split command copy into a helper
  2021-11-17  3:38 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
  2021-11-17  6:15   ` Christoph Hellwig
@ 2021-11-18  7:54   ` Chaitanya Kulkarni
  1 sibling, 0 replies; 31+ messages in thread
From: Chaitanya Kulkarni @ 2021-11-18  7:54 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: hch

On 11/16/21 19:38, Jens Axboe wrote:
> External email: Use caution opening links or attachments
> 
> 
> We'll need it for batched submit as well.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---

Maybe use nvme_sq_add_cmd() or nvme_sq_copy_cmd() instead of
nvme_copy_cmd(); that makes it easier to grep for XXX_sq_XXX(). Either
way looks good :-

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>




* Re: [PATCH 3/4] nvme: separate command prep and issue
  2021-11-17  6:17   ` Christoph Hellwig
  2021-11-17 15:45     ` Jens Axboe
@ 2021-11-18  7:59     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 31+ messages in thread
From: Chaitanya Kulkarni @ 2021-11-18  7:59 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe; +Cc: linux-block

On 11/16/2021 10:17 PM, Christoph Hellwig wrote:
> On Tue, Nov 16, 2021 at 08:38:06PM -0700, Jens Axboe wrote:
>> +	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
>> +	if (ret == BLK_STS_OK) {
>> +		nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
>> +		return BLK_STS_OK;
>> +	}
>> +	return ret;
> 
> I'd prefer the traditional handle errors outside the straight path
> order here:
> 
> 	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
> 	if (ret)
> 		return ret;
> 	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
> 	return BLK_STS_OK;
> 

Perhaps consider adding unlikely() for the error condition above?

	ret = nvme_prep_rq(dev, ns, req, &iod->cmd);
  	if (unlikely(ret))
  		return ret;
  	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
  	return BLK_STS_OK;
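For reference, unlikely() is purely a branch-prediction and code-layout hint;
in the kernel it expands to __builtin_expect():

	#define unlikely(x)	__builtin_expect(!!(x), 0)

It does not change behaviour, it just keeps the error path out of the hot
submission path's straight-line code.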



* [PATCH 3/4] nvme: separate command prep and issue
  2021-12-16 16:38 [PATCHSET v5 0/4] Add support for list issue Jens Axboe
@ 2021-12-16 16:39 ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-12-16 16:39 UTC (permalink / raw)
  To: io-uring, linux-block, linux-nvme
  Cc: Jens Axboe, Hannes Reinecke, Christoph Hellwig

Add a nvme_prep_rq() helper to set up a command, and adapt nvme_queue_rq()
to use this helper.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 63 +++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9d2a36de228a..7062128c8204 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -903,55 +903,32 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
-			 const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
-	struct nvme_queue *nvmeq = hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-	struct request *req = bd->rq;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_command *cmnd = &iod->cmd;
 	blk_status_t ret;
 
 	iod->aborted = 0;
 	iod->npages = -1;
 	iod->nents = 0;
 
-	/*
-	 * We should not need to do this, but we're still using this to
-	 * ensure we can drain requests on a dying queue.
-	 */
-	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return BLK_STS_IOERR;
-
-	if (!nvme_check_ready(&dev->ctrl, req, true))
-		return nvme_fail_nonready_command(&dev->ctrl, req);
-
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
 
 	blk_mq_start_request(req);
-	spin_lock(&nvmeq->sq_lock);
-	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
-	nvme_write_sq_db(nvmeq, bd->last);
-	spin_unlock(&nvmeq->sq_lock);
 	return BLK_STS_OK;
 out_unmap_data:
 	nvme_unmap_data(dev, req);
@@ -960,6 +937,38 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
+		return nvme_fail_nonready_command(&dev->ctrl, req);
+
+	ret = nvme_prep_rq(dev, req);
+	if (unlikely(ret))
+		return ret;
+	spin_lock(&nvmeq->sq_lock);
+	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, bd->last);
+	spin_unlock(&nvmeq->sq_lock);
+	return BLK_STS_OK;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 3/4] nvme: separate command prep and issue
  2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
@ 2021-12-16 16:05 ` Jens Axboe
  0 siblings, 0 replies; 31+ messages in thread
From: Jens Axboe @ 2021-12-16 16:05 UTC (permalink / raw)
  To: io-uring, linux-block, linux-nvme
  Cc: Jens Axboe, Hannes Reinecke, Christoph Hellwig

Add an nvme_prep_rq() helper to set up a command, and adapt
nvme_queue_rq() to use it.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 57 ++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09ea21f75439..6be6b1ab4285 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -918,52 +918,32 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
-			 const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
-	struct nvme_queue *nvmeq = hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-	struct request *req = bd->rq;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_command *cmnd = &iod->cmd;
 	blk_status_t ret;
 
 	iod->aborted = 0;
 	iod->npages = -1;
 	iod->nents = 0;
 
-	/*
-	 * We should not need to do this, but we're still using this to
-	 * ensure we can drain requests on a dying queue.
-	 */
-	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return BLK_STS_IOERR;
-
-	if (!nvme_check_ready(&dev->ctrl, req, true))
-		return nvme_fail_nonready_command(&dev->ctrl, req);
-
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
 
 	blk_mq_start_request(req);
-	nvme_submit_cmd(nvmeq, cmnd, bd->last);
 	return BLK_STS_OK;
 out_unmap_data:
 	nvme_unmap_data(dev, req);
@@ -972,6 +952,35 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
+		return nvme_fail_nonready_command(&dev->ctrl, req);
+
+	ret = nvme_prep_rq(dev, req);
+	if (unlikely(ret))
+		return ret;
+	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
+	return BLK_STS_OK;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] nvme: separate command prep and issue
  2021-12-15 16:24 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
@ 2021-12-16  9:02   ` Christoph Hellwig
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2021-12-16  9:02 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-nvme, Hannes Reinecke

On Wed, Dec 15, 2021 at 09:24:20AM -0700, Jens Axboe wrote:
> Add an nvme_prep_rq() helper to set up a command, and adapt
> nvme_queue_rq() to use it.
> 
> Reviewed-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 3/4] nvme: separate command prep and issue
  2021-12-15 16:24 [PATCHSET v3 0/4] Add support for list issue Jens Axboe
@ 2021-12-15 16:24 ` Jens Axboe
  2021-12-16  9:02   ` Christoph Hellwig
  0 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2021-12-15 16:24 UTC (permalink / raw)
  To: io-uring, linux-nvme; +Cc: Jens Axboe, Hannes Reinecke

Add an nvme_prep_rq() helper to set up a command, and adapt
nvme_queue_rq() to use it.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 57 ++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09ea21f75439..6be6b1ab4285 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -918,52 +918,32 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
-			 const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
-	struct nvme_queue *nvmeq = hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-	struct request *req = bd->rq;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_command *cmnd = &iod->cmd;
 	blk_status_t ret;
 
 	iod->aborted = 0;
 	iod->npages = -1;
 	iod->nents = 0;
 
-	/*
-	 * We should not need to do this, but we're still using this to
-	 * ensure we can drain requests on a dying queue.
-	 */
-	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return BLK_STS_IOERR;
-
-	if (!nvme_check_ready(&dev->ctrl, req, true))
-		return nvme_fail_nonready_command(&dev->ctrl, req);
-
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
 
 	blk_mq_start_request(req);
-	nvme_submit_cmd(nvmeq, cmnd, bd->last);
 	return BLK_STS_OK;
 out_unmap_data:
 	nvme_unmap_data(dev, req);
@@ -972,6 +952,35 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
+		return nvme_fail_nonready_command(&dev->ctrl, req);
+
+	ret = nvme_prep_rq(dev, req);
+	if (unlikely(ret))
+		return ret;
+	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
+	return BLK_STS_OK;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] nvme: separate command prep and issue
  2021-12-03 21:45 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
@ 2021-12-04 10:45   ` Hannes Reinecke
  0 siblings, 0 replies; 31+ messages in thread
From: Hannes Reinecke @ 2021-12-04 10:45 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-nvme

On 12/3/21 10:45 PM, Jens Axboe wrote:
> Add an nvme_prep_rq() helper to set up a command, and adapt
> nvme_queue_rq() to use it.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>   drivers/nvme/host/pci.c | 57 ++++++++++++++++++++++++-----------------
>   1 file changed, 33 insertions(+), 24 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 3/4] nvme: separate command prep and issue
  2021-12-03 21:45 [PATCHSET v2 0/4] Add support for list issue Jens Axboe
@ 2021-12-03 21:45 ` Jens Axboe
  2021-12-04 10:45   ` Hannes Reinecke
  0 siblings, 1 reply; 31+ messages in thread
From: Jens Axboe @ 2021-12-03 21:45 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: Jens Axboe

Add an nvme_prep_rq() helper to set up a command, and adapt
nvme_queue_rq() to use it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 57 ++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09ea21f75439..6be6b1ab4285 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -918,52 +918,32 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
 	return BLK_STS_OK;
 }
 
-/*
- * NOTE: ns is NULL when called on the admin queue.
- */
-static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
-			 const struct blk_mq_queue_data *bd)
+static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 {
-	struct nvme_ns *ns = hctx->queue->queuedata;
-	struct nvme_queue *nvmeq = hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-	struct request *req = bd->rq;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_command *cmnd = &iod->cmd;
 	blk_status_t ret;
 
 	iod->aborted = 0;
 	iod->npages = -1;
 	iod->nents = 0;
 
-	/*
-	 * We should not need to do this, but we're still using this to
-	 * ensure we can drain requests on a dying queue.
-	 */
-	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
-		return BLK_STS_IOERR;
-
-	if (!nvme_check_ready(&dev->ctrl, req, true))
-		return nvme_fail_nonready_command(&dev->ctrl, req);
-
-	ret = nvme_setup_cmd(ns, req);
+	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
 		return ret;
 
 	if (blk_rq_nr_phys_segments(req)) {
-		ret = nvme_map_data(dev, req, cmnd);
+		ret = nvme_map_data(dev, req, &iod->cmd);
 		if (ret)
 			goto out_free_cmd;
 	}
 
 	if (blk_integrity_rq(req)) {
-		ret = nvme_map_metadata(dev, req, cmnd);
+		ret = nvme_map_metadata(dev, req, &iod->cmd);
 		if (ret)
 			goto out_unmap_data;
 	}
 
 	blk_mq_start_request(req);
-	nvme_submit_cmd(nvmeq, cmnd, bd->last);
 	return BLK_STS_OK;
 out_unmap_data:
 	nvme_unmap_data(dev, req);
@@ -972,6 +952,35 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
+		return nvme_fail_nonready_command(&dev->ctrl, req);
+
+	ret = nvme_prep_rq(dev, req);
+	if (unlikely(ret))
+		return ret;
+	nvme_submit_cmd(nvmeq, &iod->cmd, bd->last);
+	return BLK_STS_OK;
+}
+
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2021-12-16 16:39 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-17  3:38 [PATCHSET 0/4] Add support for list issue Jens Axboe
2021-11-17  3:38 ` [PATCH 1/4] block: add mq_ops->queue_rqs hook Jens Axboe
2021-11-17  6:25   ` Christoph Hellwig
2021-11-17 15:41     ` Jens Axboe
2021-11-17  8:20   ` Ming Lei
2021-11-17 15:43     ` Jens Axboe
2021-11-17 20:48       ` Keith Busch
2021-11-17 23:59         ` Ming Lei
2021-11-17 20:41   ` Keith Busch
2021-11-18  0:18     ` Ming Lei
2021-11-18  2:02       ` Keith Busch
2021-11-18  2:14         ` Ming Lei
2021-11-17  3:38 ` [PATCH 2/4] nvme: split command copy into a helper Jens Axboe
2021-11-17  6:15   ` Christoph Hellwig
2021-11-17 15:44     ` Jens Axboe
2021-11-18  7:54   ` Chaitanya Kulkarni
2021-11-17  3:38 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
2021-11-17  6:17   ` Christoph Hellwig
2021-11-17 15:45     ` Jens Axboe
2021-11-18  7:59     ` Chaitanya Kulkarni
2021-11-17  3:38 ` [PATCH 4/4] nvme: add support for mq_ops->queue_rqs() Jens Axboe
2021-11-17  8:39   ` Christoph Hellwig
2021-11-17 15:55     ` Jens Axboe
2021-11-17 15:58       ` Jens Axboe
2021-11-17 19:41   ` Keith Busch
2021-12-03 21:45 [PATCHSET v2 0/4] Add support for list issue Jens Axboe
2021-12-03 21:45 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
2021-12-04 10:45   ` Hannes Reinecke
2021-12-15 16:24 [PATCHSET v3 0/4] Add support for list issue Jens Axboe
2021-12-15 16:24 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
2021-12-16  9:02   ` Christoph Hellwig
2021-12-16 16:05 [PATCHSET v4 0/4] Add support for list issue Jens Axboe
2021-12-16 16:05 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
2021-12-16 16:38 [PATCHSET v5 0/4] Add support for list issue Jens Axboe
2021-12-16 16:39 ` [PATCH 3/4] nvme: separate command prep and issue Jens Axboe
