* [PATCH 0/3] nvme fabrics polling fixes
@ 2023-03-22  0:23 Keith Busch
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 28+ messages in thread
From: Keith Busch @ 2023-03-22  0:23 UTC (permalink / raw)
  To: linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

I couldn't test the existing tcp or rdma options, so I had to make a
loop poll option. The last patch fixes the polling queues when used with
fabrics.
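
For reference, a minimal way to exercise the loop poll option could look
something like this (assuming a loop port/subsystem is already configured
on the target side; the NQN and namespace device name are placeholders):

  nvme connect -t loop -n testnqn --nr-poll-queues=2
  fio --name=poll --filename=/dev/nvme1n1 --direct=1 --ioengine=io_uring \
      --hipri --rw=randread --bs=4k --iodepth=8 --runtime=10 --time_based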

Note, this depends on a patch I sent earlier today that I should have just
included in this series:

  https://lore.kernel.org/linux-block/20230321215001.2655451-1-kbusch@meta.com/T/#u

Keith Busch (3):
  nvme-fabrics: add queue setup helpers
  nvme: add polling options for loop target
  blk-mq: directly poll requests

 block/blk-mq.c              |  4 +-
 drivers/nvme/host/fabrics.c | 95 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/fabrics.h |  5 ++
 drivers/nvme/host/rdma.c    | 81 ++-----------------------------
 drivers/nvme/host/tcp.c     | 92 ++---------------------------------
 drivers/nvme/target/loop.c  | 63 ++++++++++++++++++++++--
 6 files changed, 168 insertions(+), 172 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
@ 2023-03-22  0:23 ` Keith Busch
  2023-03-22  1:46   ` Chaitanya Kulkarni
                     ` (3 more replies)
  2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 4 replies; 28+ messages in thread
From: Keith Busch @ 2023-03-22  0:23 UTC (permalink / raw)
  To: linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

tcp and rdma transports have lots of duplicate code setting up the
different queue mappings. Add common helpers.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/fabrics.c | 95 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/fabrics.h |  5 ++
 drivers/nvme/host/rdma.c    | 81 ++-----------------------------
 drivers/nvme/host/tcp.c     | 92 ++---------------------------------
 4 files changed, 109 insertions(+), 164 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bbaa04a0c502b..9dcb56ea1cc00 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -4,6 +4,7 @@
  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/blk-mq-rdma.h>
 #include <linux/init.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
@@ -957,6 +958,100 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	return ret;
 }
 
+unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts)
+{
+	unsigned int nr_io_queues;
+
+	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
+	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
+	nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
+
+	return nr_io_queues;
+}
+EXPORT_SYMBOL_GPL(nvme_nr_io_queues);
+
+void nvme_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
+			u32 io_queues[HCTX_MAX_TYPES])
+{
+	if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
+		/*
+		 * separate read/write queues
+		 * hand out dedicated default queues only after we have
+		 * sufficient read queues.
+		 */
+		io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
+		nr_io_queues -= io_queues[HCTX_TYPE_READ];
+		io_queues[HCTX_TYPE_DEFAULT] =
+			min(opts->nr_write_queues, nr_io_queues);
+		nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT];
+	} else {
+		/*
+		 * shared read/write queues
+		 * either no write queues were requested, or we don't have
+		 * sufficient queue count to have dedicated default queues.
+		 */
+		io_queues[HCTX_TYPE_DEFAULT] =
+			min(opts->nr_io_queues, nr_io_queues);
+		nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT];
+	}
+
+	if (opts->nr_poll_queues && nr_io_queues) {
+		/* map dedicated poll queues only if we have queues left */
+		io_queues[HCTX_TYPE_POLL] =
+			min(opts->nr_poll_queues, nr_io_queues);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_set_io_queues);
+
+void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
+		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES])
+{
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (opts->nr_write_queues && io_queues[HCTX_TYPE_READ]) {
+		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			io_queues[HCTX_TYPE_READ];
+		set->map[HCTX_TYPE_READ].queue_offset =
+			io_queues[HCTX_TYPE_DEFAULT];
+	} else {
+		/* shared read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+			io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+		set->map[HCTX_TYPE_READ].nr_queues =
+			io_queues[HCTX_TYPE_DEFAULT];
+		set->map[HCTX_TYPE_READ].queue_offset = 0;
+	}
+
+	if (dev) {
+		blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], dev, 0);
+		blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], dev, 0);
+	} else {
+		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+		blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+	}
+
+	if (opts->nr_poll_queues && io_queues[HCTX_TYPE_POLL]) {
+		/* map dedicated poll queues only if we have queues left */
+		set->map[HCTX_TYPE_POLL].nr_queues = io_queues[HCTX_TYPE_POLL];
+		set->map[HCTX_TYPE_POLL].queue_offset =
+			io_queues[HCTX_TYPE_DEFAULT] +
+			io_queues[HCTX_TYPE_READ];
+		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+	}
+
+	dev_info(ctrl->device,
+		"mapped %d/%d/%d default/read/poll queues.\n",
+		io_queues[HCTX_TYPE_DEFAULT],
+		io_queues[HCTX_TYPE_READ],
+		io_queues[HCTX_TYPE_POLL]);
+}
+EXPORT_SYMBOL_GPL(nvme_map_queues);
+
 static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts,
 		unsigned int required_opts)
 {
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index dcac3df8a5f76..bad3e21ffb8ba 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -215,5 +215,10 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
 bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
 		struct nvmf_ctrl_options *opts);
+void nvme_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
+			u32 io_queues[HCTX_MAX_TYPES]);
+unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts);
+void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
+		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES]);
 
 #endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index bbad26b82b56d..cbec566db2761 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -718,18 +718,10 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl,
 static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 {
 	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
-	struct ib_device *ibdev = ctrl->device->dev;
-	unsigned int nr_io_queues, nr_default_queues;
-	unsigned int nr_read_queues, nr_poll_queues;
+	unsigned int nr_io_queues;
 	int i, ret;
 
-	nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
-				min(opts->nr_io_queues, num_online_cpus()));
-	nr_default_queues =  min_t(unsigned int, ibdev->num_comp_vectors,
-				min(opts->nr_write_queues, num_online_cpus()));
-	nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
-	nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
-
+	nr_io_queues = nvme_nr_io_queues(opts);
 	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 	if (ret)
 		return ret;
@@ -744,34 +736,7 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 	dev_info(ctrl->ctrl.device,
 		"creating %d I/O queues.\n", nr_io_queues);
 
-	if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
-		/*
-		 * separate read/write queues
-		 * hand out dedicated default queues only after we have
-		 * sufficient read queues.
-		 */
-		ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
-			min(nr_default_queues, nr_io_queues);
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	} else {
-		/*
-		 * shared read/write queues
-		 * either no write queues were requested, or we don't have
-		 * sufficient queue count to have dedicated default queues.
-		 */
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
-			min(nr_read_queues, nr_io_queues);
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	}
-
-	if (opts->nr_poll_queues && nr_io_queues) {
-		/* map dedicated poll queues only if we have queues left */
-		ctrl->io_queues[HCTX_TYPE_POLL] =
-			min(nr_poll_queues, nr_io_queues);
-	}
-
+	nvme_set_io_queues(opts, nr_io_queues, ctrl->io_queues);
 	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 		ret = nvme_rdma_alloc_queue(ctrl, i,
 				ctrl->ctrl.sqsize + 1);
@@ -2143,46 +2108,8 @@ static void nvme_rdma_complete_rq(struct request *rq)
 static void nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
-	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
 
-	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
-		/* separate read/write queues */
-		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-		set->map[HCTX_TYPE_READ].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_READ];
-		set->map[HCTX_TYPE_READ].queue_offset =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	} else {
-		/* shared read/write queues */
-		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-		set->map[HCTX_TYPE_READ].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_READ].queue_offset = 0;
-	}
-	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
-			ctrl->device->dev, 0);
-	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
-			ctrl->device->dev, 0);
-
-	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
-		/* map dedicated poll queues only if we have queues left */
-		set->map[HCTX_TYPE_POLL].nr_queues =
-				ctrl->io_queues[HCTX_TYPE_POLL];
-		set->map[HCTX_TYPE_POLL].queue_offset =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
-			ctrl->io_queues[HCTX_TYPE_READ];
-		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
-	}
-
-	dev_info(ctrl->ctrl.device,
-		"mapped %d/%d/%d default/read/poll queues.\n",
-		ctrl->io_queues[HCTX_TYPE_DEFAULT],
-		ctrl->io_queues[HCTX_TYPE_READ],
-		ctrl->io_queues[HCTX_TYPE_POLL]);
+	nvme_map_queues(set, &ctrl->ctrl, ctrl->device->dev, ctrl->io_queues);
 }
 
 static const struct blk_mq_ops nvme_rdma_mq_ops = {
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 7723a49895244..96298f8794e1a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1781,58 +1781,12 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
-static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
-{
-	unsigned int nr_io_queues;
-
-	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
-	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
-	nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
-
-	return nr_io_queues;
-}
-
-static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
-		unsigned int nr_io_queues)
-{
-	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
-	struct nvmf_ctrl_options *opts = nctrl->opts;
-
-	if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
-		/*
-		 * separate read/write queues
-		 * hand out dedicated default queues only after we have
-		 * sufficient read queues.
-		 */
-		ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
-			min(opts->nr_write_queues, nr_io_queues);
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	} else {
-		/*
-		 * shared read/write queues
-		 * either no write queues were requested, or we don't have
-		 * sufficient queue count to have dedicated default queues.
-		 */
-		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
-			min(opts->nr_io_queues, nr_io_queues);
-		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	}
-
-	if (opts->nr_poll_queues && nr_io_queues) {
-		/* map dedicated poll queues only if we have queues left */
-		ctrl->io_queues[HCTX_TYPE_POLL] =
-			min(opts->nr_poll_queues, nr_io_queues);
-	}
-}
-
 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 {
 	unsigned int nr_io_queues;
 	int ret;
 
-	nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
+	nr_io_queues = nvme_nr_io_queues(ctrl->opts);
 	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
 	if (ret)
 		return ret;
@@ -1847,8 +1801,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 	dev_info(ctrl->device,
 		"creating %d I/O queues.\n", nr_io_queues);
 
-	nvme_tcp_set_io_queues(ctrl, nr_io_queues);
-
+	nvme_set_io_queues(ctrl->opts, nr_io_queues,
+			   to_tcp_ctrl(ctrl)->io_queues);
 	return __nvme_tcp_alloc_io_queues(ctrl);
 }
 
@@ -2428,44 +2382,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
 static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
-	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
-
-	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
-		/* separate read/write queues */
-		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-		set->map[HCTX_TYPE_READ].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_READ];
-		set->map[HCTX_TYPE_READ].queue_offset =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-	} else {
-		/* shared read/write queues */
-		set->map[HCTX_TYPE_DEFAULT].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-		set->map[HCTX_TYPE_READ].nr_queues =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT];
-		set->map[HCTX_TYPE_READ].queue_offset = 0;
-	}
-	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
-	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
-
-	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
-		/* map dedicated poll queues only if we have queues left */
-		set->map[HCTX_TYPE_POLL].nr_queues =
-				ctrl->io_queues[HCTX_TYPE_POLL];
-		set->map[HCTX_TYPE_POLL].queue_offset =
-			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
-			ctrl->io_queues[HCTX_TYPE_READ];
-		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
-	}
-
-	dev_info(ctrl->ctrl.device,
-		"mapped %d/%d/%d default/read/poll queues.\n",
-		ctrl->io_queues[HCTX_TYPE_DEFAULT],
-		ctrl->io_queues[HCTX_TYPE_READ],
-		ctrl->io_queues[HCTX_TYPE_POLL]);
+
+	nvme_map_queues(set, &ctrl->ctrl, NULL, ctrl->io_queues);
 }
 
 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
@ 2023-03-22  0:23 ` Keith Busch
  2023-03-22  1:47   ` Chaitanya Kulkarni
                     ` (2 more replies)
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 28+ messages in thread
From: Keith Busch @ 2023-03-22  0:23 UTC (permalink / raw)
  To: linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

This is mostly for testing purposes.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/target/loop.c | 63 +++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index f2d24b2d992f8..0587ead60b09e 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -22,6 +22,7 @@ struct nvme_loop_iod {
 	struct nvmet_req	req;
 	struct nvme_loop_queue	*queue;
 	struct work_struct	work;
+	struct work_struct	poll;
 	struct sg_table		sg_table;
 	struct scatterlist	first_sgl[];
 };
@@ -37,6 +38,7 @@ struct nvme_loop_ctrl {
 	struct nvme_ctrl	ctrl;
 
 	struct nvmet_port	*port;
+	u32			io_queues[HCTX_MAX_TYPES];
 };
 
 static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -76,7 +78,11 @@ static void nvme_loop_complete_rq(struct request *req)
 	struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
 
 	sg_free_table_chained(&iod->sg_table, NVME_INLINE_SG_CNT);
-	nvme_complete_rq(req);
+
+	if (req->mq_hctx->type != HCTX_TYPE_POLL || !in_interrupt())
+		nvme_complete_rq(req);
+	else
+		queue_work(nvmet_wq, &iod->poll);
 }
 
 static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
@@ -120,6 +126,15 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
 	}
 }
 
+static void nvme_loop_poll_work(struct work_struct *work)
+{
+	struct nvme_loop_iod *iod =
+		container_of(work, struct nvme_loop_iod, poll);
+	struct request *req = blk_mq_rq_from_pdu(iod);
+
+	nvme_complete_rq(req);
+}
+
 static void nvme_loop_execute_work(struct work_struct *work)
 {
 	struct nvme_loop_iod *iod =
@@ -170,6 +185,30 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_OK;
 }
 
+static bool nvme_loop_poll_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	struct nvme_loop_iod *iod;
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(hctx->tags, bitnr);
+	if (!rq)
+		return true;
+
+	iod = blk_mq_rq_to_pdu(rq);
+	flush_work(&iod->poll);
+	return true;
+}
+
+static int nvme_loop_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+	struct sbitmap_queue *btags = &tags->bitmap_tags;
+
+	sbitmap_for_each_set(&btags->sb, nvme_loop_poll_iter, hctx);
+	return 1;
+}
+
 static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
 {
 	struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg);
@@ -197,6 +236,7 @@ static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl,
 	iod->req.cqe = &iod->cqe;
 	iod->queue = &ctrl->queues[queue_idx];
 	INIT_WORK(&iod->work, nvme_loop_execute_work);
+	INIT_WORK(&iod->poll, nvme_loop_poll_work);
 	return 0;
 }
 
@@ -247,11 +287,20 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	return 0;
 }
 
+static void nvme_loop_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nvme_loop_ctrl *ctrl = to_loop_ctrl(set->driver_data);
+
+	nvme_map_queues(set, &ctrl->ctrl, NULL, ctrl->io_queues);
+}
+
 static const struct blk_mq_ops nvme_loop_mq_ops = {
 	.queue_rq	= nvme_loop_queue_rq,
 	.complete	= nvme_loop_complete_rq,
 	.init_request	= nvme_loop_init_request,
 	.init_hctx	= nvme_loop_init_hctx,
+	.map_queues	= nvme_loop_map_queues,
+	.poll		= nvme_loop_poll,
 };
 
 static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
@@ -305,7 +354,7 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
 	unsigned int nr_io_queues;
 	int ret, i;
 
-	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
+	nr_io_queues = nvme_nr_io_queues(opts);
 	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 	if (ret || !nr_io_queues)
 		return ret;
@@ -321,6 +370,7 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
 		ctrl->ctrl.queue_count++;
 	}
 
+	nvme_set_io_queues(opts, nr_io_queues, ctrl->io_queues);
 	return 0;
 
 out_destroy_queues:
@@ -494,7 +544,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
 		return ret;
 
 	ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
-			&nvme_loop_mq_ops, 1,
+			&nvme_loop_mq_ops, ctrl->ctrl.opts->nr_poll_queues ? 3 : 2,
 			sizeof(struct nvme_loop_iod) +
 			NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
 	if (ret)
@@ -534,6 +584,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 		struct nvmf_ctrl_options *opts)
 {
 	struct nvme_loop_ctrl *ctrl;
+	unsigned int nr_io_queues;
 	int ret;
 
 	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
@@ -559,7 +610,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
 	ctrl->ctrl.kato = opts->kato;
 	ctrl->port = nvme_loop_find_port(&ctrl->ctrl);
 
-	ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
+	nr_io_queues = nvme_nr_io_queues(ctrl->ctrl.opts);
+	ctrl->queues = kcalloc(nr_io_queues + 1, sizeof(*ctrl->queues),
 			GFP_KERNEL);
 	if (!ctrl->queues)
 		goto out_uninit_ctrl;
@@ -648,7 +700,8 @@ static struct nvmf_transport_ops nvme_loop_transport = {
 	.name		= "loop",
 	.module		= THIS_MODULE,
 	.create_ctrl	= nvme_loop_create_ctrl,
-	.allowed_opts	= NVMF_OPT_TRADDR,
+	.allowed_opts	= NVMF_OPT_TRADDR | NVMF_OPT_NR_WRITE_QUEUES |
+			  NVMF_OPT_NR_POLL_QUEUES,
 };
 
 static int __init nvme_loop_init_module(void)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
  2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
@ 2023-03-22  0:23 ` Keith Busch
  2023-03-22  7:36   ` Sagi Grimberg
                     ` (3 more replies)
  2023-03-22  7:31 ` [PATCH 0/3] nvme fabrics polling fixes Sagi Grimberg
  2023-03-22  8:48 ` Daniel Wagner
  4 siblings, 4 replies; 28+ messages in thread
From: Keith Busch @ 2023-03-22  0:23 UTC (permalink / raw)
  To: linux-block, axboe, linux-nvme, hch, sagi
  Cc: Keith Busch, Martin Belanger, Daniel Wagner

From: Keith Busch <kbusch@kernel.org>

Polling needs a bio with a valid bi_bdev, but neither of those is
guaranteed for polled driver requests. Make request based polling
directly use blk-mq's polling function instead.

When executing a request from a polled hctx, we know the request's
cookie, and that it's from a live multi-queue that supports polling, so
we can safely skip everything that bio_poll provides.

Link: http://lists.infradead.org/pipermail/linux-nvme/2023-March/038340.html
Reported-by: Martin Belanger <Martin.Belanger@dell.com>
Reported-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e30459df8151..932d2e95392e6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1334,8 +1334,6 @@ bool blk_rq_is_poll(struct request *rq)
 		return false;
 	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
 		return false;
-	if (WARN_ON_ONCE(!rq->bio))
-		return false;
 	return true;
 }
 EXPORT_SYMBOL_GPL(blk_rq_is_poll);
@@ -1343,7 +1341,7 @@ EXPORT_SYMBOL_GPL(blk_rq_is_poll);
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
 	do {
-		bio_poll(rq->bio, NULL, 0);
+		blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0);
 		cond_resched();
 	} while (!completion_done(wait));
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
@ 2023-03-22  1:46   ` Chaitanya Kulkarni
  2023-03-22  4:38   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 28+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-22  1:46 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

On 3/21/2023 5:23 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> tcp and rdma transports have lots of duplicate code setting up the
> different queue mappings. Add common helpers.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---

LGTM.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
@ 2023-03-22  1:47   ` Chaitanya Kulkarni
  2023-03-22  7:44   ` Sagi Grimberg
  2023-03-22  8:23   ` Christoph Hellwig
  2 siblings, 0 replies; 28+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-22  1:47 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

On 3/21/2023 5:23 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> This is mostly for testing purposes.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---

LGTM, it would be nice to have a blktests test for this,
as I think this is really useful.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
  2023-03-22  1:46   ` Chaitanya Kulkarni
@ 2023-03-22  4:38   ` kernel test robot
  2023-03-22  5:21     ` Chaitanya Kulkarni
  2023-03-22  7:35   ` Sagi Grimberg
  2023-03-22  9:25   ` kernel test robot
  3 siblings, 1 reply; 28+ messages in thread
From: kernel test robot @ 2023-03-22  4:38 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch, sagi
  Cc: oe-kbuild-all, Keith Busch

Hi Keith,

I love your patch! Perhaps something to improve:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.3-rc3 next-20230322]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/nvme-fabrics-add-queue-setup-helpers/20230322-082537
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20230322002350.4038048-2-kbusch%40meta.com
patch subject: [PATCH 1/3] nvme-fabrics: add queue setup helpers
config: arm-randconfig-r046-20230321 (https://download.01.org/0day-ci/archive/20230322/202303221244.KMWQNHnu-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/0acb3964621b925db18ebec1a6c57bc3c3446859
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Keith-Busch/nvme-fabrics-add-queue-setup-helpers/20230322-082537
        git checkout 0acb3964621b925db18ebec1a6c57bc3c3446859
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arm olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash drivers/nvme/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303221244.KMWQNHnu-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/nvme/host/core.c:26:
>> drivers/nvme/host/fabrics.h:222:29: warning: 'struct ib_device' declared inside parameter list will not be visible outside of this definition or declaration
     222 |                      struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES]);
         |                             ^~~~~~~~~
--
   In file included from drivers/nvme/host/core.c:26:
>> drivers/nvme/host/fabrics.h:222:29: warning: 'struct ib_device' declared inside parameter list will not be visible outside of this definition or declaration
     222 |                      struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES]);
         |                             ^~~~~~~~~
   In file included from drivers/nvme/host/trace.h:175,
                    from drivers/nvme/host/core.c:30:
   include/trace/define_trace.h:95:42: fatal error: ./trace.h: No such file or directory
      95 | #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
         |                                          ^
   compilation terminated.


vim +222 drivers/nvme/host/fabrics.h

   205	
   206	int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
   207	int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
   208	int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
   209	int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
   210	int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
   211	int nvmf_register_transport(struct nvmf_transport_ops *ops);
   212	void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
   213	void nvmf_free_options(struct nvmf_ctrl_options *opts);
   214	int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
   215	bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
   216	bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
   217			struct nvmf_ctrl_options *opts);
   218	void nvme_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
   219				u32 io_queues[HCTX_MAX_TYPES]);
   220	unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts);
   221	void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
 > 222			     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES]);
   223	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  4:38   ` kernel test robot
@ 2023-03-22  5:21     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 28+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-22  5:21 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch, sagi; +Cc: Keith Busch

On 3/21/23 21:38, kernel test robot wrote:
> Hi Keith,
>
> I love your patch! Perhaps something to improve:
>
> [auto build test WARNING on axboe-block/for-next]
> [also build test WARNING on linus/master v6.3-rc3 next-20230322]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
>

It would be great to get Tested-by tags from Martin and Daniel,
especially for the 3rd patch in the series, before we merge.

-ck



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/3] nvme fabrics polling fixes
  2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
                   ` (2 preceding siblings ...)
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
@ 2023-03-22  7:31 ` Sagi Grimberg
  2023-03-22  8:48 ` Daniel Wagner
  4 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  7:31 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch; +Cc: Keith Busch


> I couldn't test the existing tcp

It is very simple to do; for tcp you can just operate on the loopback
iface 127.0.0.1

See example:
$ ./target.sh 1 1
$ nvme connect-all -t tcp -a 127.0.0.1 -P 1

For rdma you can work on any interface you want, you just need
iproute with the rdma utility (which is available in recent iproute
packages).

Setup Software RoCE:
$ rdma link add rxe0 type rxe netdev <iface>
Setup Software iWarp:
$ rdma link add siw0 type siw netdev <iface>

Then the same as before...
$ ./target.sh 1 1
$ nvme connect-all -t rdma -a <iface-addr> -P 1


That is essentially what blktests does.

target.sh
--
#!/bin/bash

# args: <num_subsys> <nr_ns-per-subsystem> <nvme|file>
# examples:
# - ./target.sh 1 1       # uses null_blk
# - ./target.sh 1 1 nvme
# - ./target.sh 1 1 file

CFGFS=/sys/kernel/config/nvmet
NQN=testnqn

mount -t configfs configfs /sys/kernel/config > /dev/null 2>&1

rm -f $CFGFS/ports/*/subsystems/*
rm -f $CFGFS/subsystems/*/allowed_hosts/* > /dev/null 2>&1
rmdir $CFGFS/subsystems/*/namespaces/* > /dev/null 2>&1
rmdir $CFGFS/subsystems/* > /dev/null 2>&1
rmdir $CFGFS/ports/* > /dev/null 2>&1
rmdir $CFGFS/hosts/* > /dev/null 2>&1

modprobe -r nvmet_rdma nvmet-tcp > /dev/null 2>&1
modprobe -r ib_umad rdma_ucm ib_uverbs rdma_cm > /dev/null 2>&1

modprobe -r null_blk nvme-tcp nvme-rdma
modprobe -r nvme-loop
rm -f /tmp/nsf*

if [[ $1 == "0" ]]; then
         exit
fi

modprobe null_blk nr_devices=$2 #queue_mode=0 irqmode=0
modprobe -a nvmet_tcp nvmet_rdma

mkdir $CFGFS/ports/1
echo 0.0.0.0 > $CFGFS/ports/1/addr_traddr
echo 8009 > $CFGFS/ports/1/addr_trsvcid
echo "tcp" >  $CFGFS/ports/1/addr_trtype
echo "ipv4" >  $CFGFS/ports/1/addr_adrfam

mkdir $CFGFS/ports/2
echo 0.0.0.0 > $CFGFS/ports/2/addr_traddr
echo 4420 > $CFGFS/ports/2/addr_trsvcid
echo "rdma" >  $CFGFS/ports/2/addr_trtype
echo "ipv4" >  $CFGFS/ports/2/addr_adrfam

mkdir $CFGFS/ports/3
echo "loop" >  $CFGFS/ports/3/addr_trtype

for j in `seq $1`
do
         mkdir $CFGFS/subsystems/$NQN$j
         echo 1 > $CFGFS/subsystems/$NQN$j/attr_allow_any_host
         k=0
         for i in `seq $2`
         do
                 mkdir $CFGFS/subsystems/$NQN$j/namespaces/$i
                 if [[ "$3" == "nvme" ]]; then
                         echo -n "/dev/nvme0n1" > 
$CFGFS/subsystems/$NQN$j/namespaces/$i/device_path
                 elif [[ "$3" == "file" ]]; then
                         ns="/tmp/nsf$k"
                         touch $ns
                         truncate --size=2G $ns
                         echo -n "$ns" > 
$CFGFS/subsystems/$NQN$j/namespaces/$i/device_path
                 else
                         echo -n "/dev/nullb$k" > 
$CFGFS/subsystems/$NQN$j/namespaces/$i/device_path
                 fi
                 echo 1 > $CFGFS/subsystems/$NQN$j/namespaces/$i/enable
                 let "k+=1"
         done
         ln -s $CFGFS/subsystems/$NQN$j $CFGFS/ports/1/subsystems/
         ln -s $CFGFS/subsystems/$NQN$j $CFGFS/ports/2/subsystems/
         ln -s $CFGFS/subsystems/$NQN$j $CFGFS/ports/3/subsystems/
done
--

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
  2023-03-22  1:46   ` Chaitanya Kulkarni
  2023-03-22  4:38   ` kernel test robot
@ 2023-03-22  7:35   ` Sagi Grimberg
  2023-03-22  8:27     ` Christoph Hellwig
  2023-03-22  9:25   ` kernel test robot
  3 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  7:35 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch; +Cc: Keith Busch


> From: Keith Busch <kbusch@kernel.org>
> 
> tcp and rdma transports have lots of duplicate code setting up the
> different queue mappings. Add common helpers.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/fabrics.c | 95 +++++++++++++++++++++++++++++++++++++
>   drivers/nvme/host/fabrics.h |  5 ++
>   drivers/nvme/host/rdma.c    | 81 ++-----------------------------
>   drivers/nvme/host/tcp.c     | 92 ++---------------------------------
>   4 files changed, 109 insertions(+), 164 deletions(-)
> 
> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index bbaa04a0c502b..9dcb56ea1cc00 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -4,6 +4,7 @@
>    * Copyright (c) 2015-2016 HGST, a Western Digital Company.
>    */
>   #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +#include <linux/blk-mq-rdma.h>
>   #include <linux/init.h>
>   #include <linux/miscdevice.h>
>   #include <linux/module.h>
> @@ -957,6 +958,100 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>   	return ret;
>   }
>   
> +unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts)
> +{
> +	unsigned int nr_io_queues;
> +
> +	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
> +	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
> +	nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
> +
> +	return nr_io_queues;
> +}
> +EXPORT_SYMBOL_GPL(nvme_nr_io_queues);

Given that it is shared only with tcp/rdma, maybe rename it
to nvmf_ip_nr_io_queues.

> +
> +void nvme_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
> +			u32 io_queues[HCTX_MAX_TYPES])
> +{
> +	if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
> +		/*
> +		 * separate read/write queues
> +		 * hand out dedicated default queues only after we have
> +		 * sufficient read queues.
> +		 */
> +		io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
> +		nr_io_queues -= io_queues[HCTX_TYPE_READ];
> +		io_queues[HCTX_TYPE_DEFAULT] =
> +			min(opts->nr_write_queues, nr_io_queues);
> +		nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT];
> +	} else {
> +		/*
> +		 * shared read/write queues
> +		 * either no write queues were requested, or we don't have
> +		 * sufficient queue count to have dedicated default queues.
> +		 */
> +		io_queues[HCTX_TYPE_DEFAULT] =
> +			min(opts->nr_io_queues, nr_io_queues);
> +		nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT];
> +	}
> +
> +	if (opts->nr_poll_queues && nr_io_queues) {
> +		/* map dedicated poll queues only if we have queues left */
> +		io_queues[HCTX_TYPE_POLL] =
> +			min(opts->nr_poll_queues, nr_io_queues);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(nvme_set_io_queues);

nvmf_ip_set_io_queues. Unless you think that this can be shared with
pci/fc as well?

> +
> +void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
> +		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES])

Ugh, the ib_device input that may be null is bad...
I think that we can kill blk_mq_rdma_map_queues altogether
and unify the two.

But let's separate this and the patch before it from the patch
that fixes the regression. I'd like that one to go in asap,
unrelated to the others.

> +{
> +	struct nvmf_ctrl_options *opts = ctrl->opts;
> +
> +	if (opts->nr_write_queues && io_queues[HCTX_TYPE_READ]) {
> +		/* separate read/write queues */
> +		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> +			io_queues[HCTX_TYPE_DEFAULT];
> +		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> +		set->map[HCTX_TYPE_READ].nr_queues =
> +			io_queues[HCTX_TYPE_READ];
> +		set->map[HCTX_TYPE_READ].queue_offset =
> +			io_queues[HCTX_TYPE_DEFAULT];
> +	} else {
> +		/* shared read/write queues */
> +		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> +			io_queues[HCTX_TYPE_DEFAULT];
> +		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> +		set->map[HCTX_TYPE_READ].nr_queues =
> +			io_queues[HCTX_TYPE_DEFAULT];
> +		set->map[HCTX_TYPE_READ].queue_offset = 0;
> +	}
> +
> +	if (dev) {
> +		blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], dev, 0);
> +		blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], dev, 0);
> +	} else {
> +		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
> +		blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
> +	}
> +
> +	if (opts->nr_poll_queues && io_queues[HCTX_TYPE_POLL]) {
> +		/* map dedicated poll queues only if we have queues left */
> +		set->map[HCTX_TYPE_POLL].nr_queues = io_queues[HCTX_TYPE_POLL];
> +		set->map[HCTX_TYPE_POLL].queue_offset =
> +			io_queues[HCTX_TYPE_DEFAULT] +
> +			io_queues[HCTX_TYPE_READ];
> +		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
> +	}
> +
> +	dev_info(ctrl->device,
> +		"mapped %d/%d/%d default/read/poll queues.\n",
> +		io_queues[HCTX_TYPE_DEFAULT],
> +		io_queues[HCTX_TYPE_READ],
> +		io_queues[HCTX_TYPE_POLL]);
> +}
> +EXPORT_SYMBOL_GPL(nvme_map_queues);
> +
>   static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts,
>   		unsigned int required_opts)
>   {
> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
> index dcac3df8a5f76..bad3e21ffb8ba 100644
> --- a/drivers/nvme/host/fabrics.h
> +++ b/drivers/nvme/host/fabrics.h
> @@ -215,5 +215,10 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
>   bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
>   bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
>   		struct nvmf_ctrl_options *opts);
> +void nvme_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues,
> +			u32 io_queues[HCTX_MAX_TYPES]);
> +unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts);
> +void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
> +		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES]);
>   
>   #endif /* _NVME_FABRICS_H */
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index bbad26b82b56d..cbec566db2761 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -718,18 +718,10 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl,
>   static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
>   {
>   	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
> -	struct ib_device *ibdev = ctrl->device->dev;
> -	unsigned int nr_io_queues, nr_default_queues;
> -	unsigned int nr_read_queues, nr_poll_queues;
> +	unsigned int nr_io_queues;
>   	int i, ret;
>   
> -	nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
> -				min(opts->nr_io_queues, num_online_cpus()));
> -	nr_default_queues =  min_t(unsigned int, ibdev->num_comp_vectors,
> -				min(opts->nr_write_queues, num_online_cpus()));
> -	nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
> -	nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
> -
> +	nr_io_queues = nvme_nr_io_queues(opts);
>   	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
>   	if (ret)
>   		return ret;
> @@ -744,34 +736,7 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
>   	dev_info(ctrl->ctrl.device,
>   		"creating %d I/O queues.\n", nr_io_queues);
>   
> -	if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
> -		/*
> -		 * separate read/write queues
> -		 * hand out dedicated default queues only after we have
> -		 * sufficient read queues.
> -		 */
> -		ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
> -			min(nr_default_queues, nr_io_queues);
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	} else {
> -		/*
> -		 * shared read/write queues
> -		 * either no write queues were requested, or we don't have
> -		 * sufficient queue count to have dedicated default queues.
> -		 */
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
> -			min(nr_read_queues, nr_io_queues);
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	}
> -
> -	if (opts->nr_poll_queues && nr_io_queues) {
> -		/* map dedicated poll queues only if we have queues left */
> -		ctrl->io_queues[HCTX_TYPE_POLL] =
> -			min(nr_poll_queues, nr_io_queues);
> -	}
> -
> +	nvme_set_io_queues(opts, nr_io_queues, ctrl->io_queues);
>   	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
>   		ret = nvme_rdma_alloc_queue(ctrl, i,
>   				ctrl->ctrl.sqsize + 1);
> @@ -2143,46 +2108,8 @@ static void nvme_rdma_complete_rq(struct request *rq)
>   static void nvme_rdma_map_queues(struct blk_mq_tag_set *set)
>   {
>   	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
> -	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
>   
> -	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
> -		/* separate read/write queues */
> -		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> -		set->map[HCTX_TYPE_READ].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_READ];
> -		set->map[HCTX_TYPE_READ].queue_offset =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	} else {
> -		/* shared read/write queues */
> -		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> -		set->map[HCTX_TYPE_READ].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_READ].queue_offset = 0;
> -	}
> -	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
> -			ctrl->device->dev, 0);
> -	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
> -			ctrl->device->dev, 0);
> -
> -	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
> -		/* map dedicated poll queues only if we have queues left */
> -		set->map[HCTX_TYPE_POLL].nr_queues =
> -				ctrl->io_queues[HCTX_TYPE_POLL];
> -		set->map[HCTX_TYPE_POLL].queue_offset =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
> -			ctrl->io_queues[HCTX_TYPE_READ];
> -		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
> -	}
> -
> -	dev_info(ctrl->ctrl.device,
> -		"mapped %d/%d/%d default/read/poll queues.\n",
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT],
> -		ctrl->io_queues[HCTX_TYPE_READ],
> -		ctrl->io_queues[HCTX_TYPE_POLL]);
> +	nvme_map_queues(set, &ctrl->ctrl, ctrl->device->dev, ctrl->io_queues);
>   }
>   
>   static const struct blk_mq_ops nvme_rdma_mq_ops = {
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 7723a49895244..96298f8794e1a 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1781,58 +1781,12 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
>   	return ret;
>   }
>   
> -static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
> -{
> -	unsigned int nr_io_queues;
> -
> -	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
> -	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
> -	nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
> -
> -	return nr_io_queues;
> -}
> -
> -static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
> -		unsigned int nr_io_queues)
> -{
> -	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
> -	struct nvmf_ctrl_options *opts = nctrl->opts;
> -
> -	if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
> -		/*
> -		 * separate read/write queues
> -		 * hand out dedicated default queues only after we have
> -		 * sufficient read queues.
> -		 */
> -		ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
> -			min(opts->nr_write_queues, nr_io_queues);
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	} else {
> -		/*
> -		 * shared read/write queues
> -		 * either no write queues were requested, or we don't have
> -		 * sufficient queue count to have dedicated default queues.
> -		 */
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT] =
> -			min(opts->nr_io_queues, nr_io_queues);
> -		nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	}
> -
> -	if (opts->nr_poll_queues && nr_io_queues) {
> -		/* map dedicated poll queues only if we have queues left */
> -		ctrl->io_queues[HCTX_TYPE_POLL] =
> -			min(opts->nr_poll_queues, nr_io_queues);
> -	}
> -}
> -
>   static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
>   {
>   	unsigned int nr_io_queues;
>   	int ret;
>   
> -	nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
> +	nr_io_queues = nvme_nr_io_queues(ctrl->opts);
>   	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
>   	if (ret)
>   		return ret;
> @@ -1847,8 +1801,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
>   	dev_info(ctrl->device,
>   		"creating %d I/O queues.\n", nr_io_queues);
>   
> -	nvme_tcp_set_io_queues(ctrl, nr_io_queues);
> -
> +	nvme_set_io_queues(ctrl->opts, nr_io_queues,
> +			   to_tcp_ctrl(ctrl)->io_queues);
>   	return __nvme_tcp_alloc_io_queues(ctrl);
>   }
>   
> @@ -2428,44 +2382,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
>   static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
>   {
>   	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
> -	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
> -
> -	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
> -		/* separate read/write queues */
> -		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> -		set->map[HCTX_TYPE_READ].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_READ];
> -		set->map[HCTX_TYPE_READ].queue_offset =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -	} else {
> -		/* shared read/write queues */
> -		set->map[HCTX_TYPE_DEFAULT].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
> -		set->map[HCTX_TYPE_READ].nr_queues =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT];
> -		set->map[HCTX_TYPE_READ].queue_offset = 0;
> -	}
> -	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
> -	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
> -
> -	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
> -		/* map dedicated poll queues only if we have queues left */
> -		set->map[HCTX_TYPE_POLL].nr_queues =
> -				ctrl->io_queues[HCTX_TYPE_POLL];
> -		set->map[HCTX_TYPE_POLL].queue_offset =
> -			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
> -			ctrl->io_queues[HCTX_TYPE_READ];
> -		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
> -	}
> -
> -	dev_info(ctrl->ctrl.device,
> -		"mapped %d/%d/%d default/read/poll queues.\n",
> -		ctrl->io_queues[HCTX_TYPE_DEFAULT],
> -		ctrl->io_queues[HCTX_TYPE_READ],
> -		ctrl->io_queues[HCTX_TYPE_POLL]);
> +
> +	nvme_map_queues(set, &ctrl->ctrl, NULL, ctrl->io_queues);
>   }
>   
>   static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
@ 2023-03-22  7:36   ` Sagi Grimberg
  2023-03-22  8:23   ` Christoph Hellwig
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  7:36 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch
  Cc: Keith Busch, Martin Belanger, Daniel Wagner

Nice.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
  2023-03-22  1:47   ` Chaitanya Kulkarni
@ 2023-03-22  7:44   ` Sagi Grimberg
  2023-03-22  8:23   ` Christoph Hellwig
  2 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  7:44 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch; +Cc: Keith Busch



On 3/22/23 02:23, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> This is mostly for testing purposes.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/target/loop.c | 63 +++++++++++++++++++++++++++++++++++---
>   1 file changed, 58 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
> index f2d24b2d992f8..0587ead60b09e 100644
> --- a/drivers/nvme/target/loop.c
> +++ b/drivers/nvme/target/loop.c
> @@ -22,6 +22,7 @@ struct nvme_loop_iod {
>   	struct nvmet_req	req;
>   	struct nvme_loop_queue	*queue;
>   	struct work_struct	work;
> +	struct work_struct	poll;
>   	struct sg_table		sg_table;
>   	struct scatterlist	first_sgl[];
>   };
> @@ -37,6 +38,7 @@ struct nvme_loop_ctrl {
>   	struct nvme_ctrl	ctrl;
>   
>   	struct nvmet_port	*port;
> +	u32			io_queues[HCTX_MAX_TYPES];
>   };
>   
>   static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
> @@ -76,7 +78,11 @@ static void nvme_loop_complete_rq(struct request *req)
>   	struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
>   
>   	sg_free_table_chained(&iod->sg_table, NVME_INLINE_SG_CNT);
> -	nvme_complete_rq(req);
> +
> +	if (req->mq_hctx->type != HCTX_TYPE_POLL || !in_interrupt())
> +		nvme_complete_rq(req);
> +	else
> +		queue_work(nvmet_wq, &iod->poll);
>   }
>   
>   static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
> @@ -120,6 +126,15 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
>   	}
>   }
>   
> +static void nvme_loop_poll_work(struct work_struct *work)
> +{
> +	struct nvme_loop_iod *iod =
> +		container_of(work, struct nvme_loop_iod, poll);
> +	struct request *req = blk_mq_rq_from_pdu(iod);
> +
> +	nvme_complete_rq(req);
> +}
> +
>   static void nvme_loop_execute_work(struct work_struct *work)
>   {
>   	struct nvme_loop_iod *iod =
> @@ -170,6 +185,30 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
>   	return BLK_STS_OK;
>   }
>   
> +static bool nvme_loop_poll_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
> +{
> +	struct blk_mq_hw_ctx *hctx = data;
> +	struct nvme_loop_iod *iod;
> +	struct request *rq;
> +
> +	rq = blk_mq_tag_to_rq(hctx->tags, bitnr);
> +	if (!rq)
> +		return true;
> +
> +	iod = blk_mq_rq_to_pdu(rq);
> +	flush_work(&iod->poll);

If we want to go down this route, I would think that maybe
it'd be better to add a .poll to nvmet_req, like .execute, that can
actually be wired to bio_poll? For file it can be wired to fop->iopoll.
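
Something along these lines maybe (very rough sketch, the field names are
made up and the backend would need to remember the bio it submitted):

	struct nvmet_req {
		...
		void (*execute)(struct nvmet_req *req);
		int (*poll)(struct nvmet_req *req);	/* new, optional */
	};

	static int nvmet_bdev_poll(struct nvmet_req *req)
	{
		/* last_bio is hypothetical, stashed at submission time */
		return bio_poll(req->b.last_bio, NULL, 0);
	}

Then nvme_loop_poll() could call iod->req.poll() directly instead of
flushing a work item per tag.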

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
  2023-03-22  1:47   ` Chaitanya Kulkarni
  2023-03-22  7:44   ` Sagi Grimberg
@ 2023-03-22  8:23   ` Christoph Hellwig
  2023-03-22  8:46     ` Daniel Wagner
  2023-03-22 14:30     ` Keith Busch
  2 siblings, 2 replies; 28+ messages in thread
From: Christoph Hellwig @ 2023-03-22  8:23 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch

On Tue, Mar 21, 2023 at 05:23:49PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> This is mostly for testing purposes.

I have to admit I'd rather not merge this upstream.  Any good reason
why we'd absolutely want to have it?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
  2023-03-22  7:36   ` Sagi Grimberg
@ 2023-03-22  8:23   ` Christoph Hellwig
  2023-03-22  9:08     ` Sagi Grimberg
  2023-03-22  8:37   ` Daniel Wagner
  2023-03-31  7:57   ` Shinichiro Kawasaki
  3 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2023-03-22  8:23 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch,
	Martin Belanger, Daniel Wagner

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  7:35   ` Sagi Grimberg
@ 2023-03-22  8:27     ` Christoph Hellwig
  2023-03-22  9:07       ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2023-03-22  8:27 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Keith Busch, linux-block, axboe, linux-nvme, hch, Keith Busch

On Wed, Mar 22, 2023 at 09:35:46AM +0200, Sagi Grimberg wrote:
>>   +unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts)
>> +{
>> +	unsigned int nr_io_queues;
>> +
>> +	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
>> +	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
>> +	nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
>> +
>> +	return nr_io_queues;
>> +}
>> +EXPORT_SYMBOL_GPL(nvme_nr_io_queues);
>
> Given that it is shared only with tcp/rdma, maybe rename it
> to nvmf_ip_nr_io_queues.

Even if the other transports don't use it, nothing is really IP
specific here, so I don't think that's too useful.  But I'd use
the nvmf_ prefix like other functions in this file.

Just a personal choice, but I'd write this as:

	return min(opts->nr_io_queues, num_online_cpus()) +
		min(opts->nr_write_queues, num_online_cpus()) +
		min(opts->nr_poll_queues, num_online_cpus());

>> +EXPORT_SYMBOL_GPL(nvme_set_io_queues);
>
> nvmf_ip_set_io_queues. Unless you think that this can be shared with
> pci/fc as well?

Again I'd drop the _ip as nothing is IP specific.  FC might or might not
eventually use it; for PCI we don't have the opts structure anyway
(never mind that this sits in fabrics.c).

>> +void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
>> +		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES])
>
> Ugh, the ib_device input that may be null is bad...
> I think that we can kill blk_mq_rdma_map_queues altogether
> and unify the two.

Yes, I don't think anything touching an ib_device should be in
common code.
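
One possible shape for that, keeping the ib_device handling out of the
common helper (an illustrative sketch, not the code from this series):

	void nvmf_map_queues(struct blk_mq_tag_set *set,
			     u32 io_queues[HCTX_MAX_TYPES])
	{
		/* separate read/write queue handling elided for brevity */
		set->map[HCTX_TYPE_DEFAULT].nr_queues = io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);

		if (io_queues[HCTX_TYPE_POLL]) {
			set->map[HCTX_TYPE_POLL].nr_queues = io_queues[HCTX_TYPE_POLL];
			set->map[HCTX_TYPE_POLL].queue_offset =
				io_queues[HCTX_TYPE_DEFAULT] +
				io_queues[HCTX_TYPE_READ];
			blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
		}
	}

The rdma driver could then apply its ib_get_vector_affinity() based
mapping in its own ->map_queues callback and fall back to the common
helper when no affinity information is available.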

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
  2023-03-22  7:36   ` Sagi Grimberg
  2023-03-22  8:23   ` Christoph Hellwig
@ 2023-03-22  8:37   ` Daniel Wagner
  2023-03-22 18:16     ` Chaitanya Kulkarni
  2023-03-31  7:57   ` Shinichiro Kawasaki
  3 siblings, 1 reply; 28+ messages in thread
From: Daniel Wagner @ 2023-03-22  8:37 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch, Martin Belanger

On Tue, Mar 21, 2023 at 05:23:50PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Polling needs a bio with a valid bi_bdev, but neither of those is
> guaranteed for polled driver requests. Make request based polling
> directly use blk-mq's polling function.
> 
> When executing a request from a polled hctx, we know the request's
> cookie, and that it's from a live multi-queue that supports polling, so
> we can safely skip everything that bio_poll provides.
> 
> Link: http://lists.infradead.org/pipermail/linux-nvme/2023-March/038340.html
> Reported-by: Martin Belanger <Martin.Belanger@dell.com>
> Reported-by: Daniel Wagner <dwagner@suse.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
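
As a rough sketch of what "directly poll" means here, the wait loop can
poll the request's hctx until the completion fires (blk_rq_to_qc() below
is an assumed helper name, not quoted from the patch):

	static void blk_rq_poll_completion(struct request *rq,
					   struct completion *wait)
	{
		do {
			blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0);
			cond_resched();
		} while (!completion_done(wait));
	}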

I've tested the whole series and this patch alone with rdma and tcp.

Tested-by: Daniel Wagner <dwagner@suse.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  8:23   ` Christoph Hellwig
@ 2023-03-22  8:46     ` Daniel Wagner
  2023-03-22 13:52       ` Christoph Hellwig
  2023-03-22 14:30     ` Keith Busch
  1 sibling, 1 reply; 28+ messages in thread
From: Daniel Wagner @ 2023-03-22  8:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-block, axboe, linux-nvme, sagi, Keith Busch

On Wed, Mar 22, 2023 at 09:23:10AM +0100, Christoph Hellwig wrote:
> On Tue, Mar 21, 2023 at 05:23:49PM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > This is mostly for testing purposes.
> 
> I have to admit I'd rather not merge this upstream.  Any good reason
> why we'd absolutely want to have it?

The blktests test I have written for this problem fails for loop without
something like this. We can certainly teach blktests not to run a specific
test for loop, but currently the _require_nvme_trtype_is_fabrics check
includes loop.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/3] nvme fabrics polling fixes
  2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
                   ` (3 preceding siblings ...)
  2023-03-22  7:31 ` [PATCH 0/3] nvme fabrics polling fixes Sagi Grimberg
@ 2023-03-22  8:48 ` Daniel Wagner
  2023-03-22 13:24   ` Sagi Grimberg
  4 siblings, 1 reply; 28+ messages in thread
From: Daniel Wagner @ 2023-03-22  8:48 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch

On Tue, Mar 21, 2023 at 05:23:47PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> I couldn't test the existing tcp or rdma options, so I had to make a
> loop poll option. The last patch fixes the polling queues when used with
> fabrics.
> 
> Note, this depends on patch I sent earlier today that I should have just
> included in this series:
> 
>   https://lore.kernel.org/linux-block/20230321215001.2655451-1-kbusch@meta.com/T/#u

I've tested this series with

  https://github.com/igaw/blktests/tree/queue-counts

and while for rdma all is good I got a lockdep warning for tcp:


 ======================================================
 WARNING: possible circular locking dependency detected
 6.3.0-rc1+ #15 Tainted: G        W
 ------------------------------------------------------
 kworker/6:0/54 is trying to acquire lock:
 ffff888121d88030 ((work_completion)(&queue->io_work)){+.+.}-{0:0}, at: __flush_work+0xb9/0x170

 but task is already holding lock:
 ffff888100b0fd20 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x707/0xbc0

 which lock already depends on the new lock.


 the existing dependency chain (in reverse order) is:

 -> #2 ((work_completion)(&queue->release_work)){+.+.}-{0:0}:
        lock_acquire+0x13a/0x310
        process_one_work+0x728/0xbc0
        worker_thread+0x97a/0x1480
        kthread+0x228/0x2b0
        ret_from_fork+0x1f/0x30

 -> #1 ((wq_completion)nvmet-wq){+.+.}-{0:0}:
        lock_acquire+0x13a/0x310
        __flush_workqueue+0x185/0x14e0
        nvmet_tcp_install_queue+0x63/0x270 [nvmet_tcp]
        nvmet_install_queue+0x2b1/0x6a0 [nvmet]
        nvmet_execute_admin_connect+0x381/0x880 [nvmet]
        nvmet_tcp_io_work+0x15e8/0x8f60 [nvmet_tcp]
        process_one_work+0x756/0xbc0
        worker_thread+0x97a/0x1480
        kthread+0x228/0x2b0
        ret_from_fork+0x1f/0x30

 -> #0 ((work_completion)(&queue->io_work)){+.+.}-{0:0}:
        validate_chain+0x19f1/0x6d50
        __lock_acquire+0x122d/0x1e90
        lock_acquire+0x13a/0x310
        __flush_work+0xd5/0x170
        __cancel_work_timer+0x36b/0x470
        nvmet_tcp_release_queue_work+0x25c/0x1000 [nvmet_tcp]
        process_one_work+0x756/0xbc0
        worker_thread+0x97a/0x1480
        kthread+0x228/0x2b0
        ret_from_fork+0x1f/0x30

 other info that might help us debug this:

 Chain exists of:
   (work_completion)(&queue->io_work) --> (wq_completion)nvmet-wq --> (work_completion)(&queue->release_work)

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock((work_completion)(&queue->release_work));
                                lock((wq_completion)nvmet-wq);
                                lock((work_completion)(&queue->release_work));
   lock((work_completion)(&queue->io_work));

  *** DEADLOCK ***

 2 locks held by kworker/6:0/54:
  #0: ffff888109ff6d48 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x6c8/0xbc0
  #1: ffff888100b0fd20 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x707/0xbc0

 stack backtrace:
 CPU: 6 PID: 54 Comm: kworker/6:0 Tainted: G        W          6.3.0-rc1+ #15 f4d05de834b07d62567d33b70ec70fb0fa06f103
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
 Workqueue: nvmet-wq nvmet_tcp_release_queue_work [nvmet_tcp]
 Call Trace:
  <TASK>
  dump_stack_lvl+0x5a/0x80
  check_noncircular+0x2c8/0x390
  ? add_chain_block+0x5e0/0x5e0
  ? ret_from_fork+0x1f/0x30
  ? lockdep_lock+0xd3/0x260
  ? stack_trace_save+0x10a/0x1e0
  ? stack_trace_snprint+0x100/0x100
  ? check_noncircular+0x1a6/0x390
  validate_chain+0x19f1/0x6d50
  ? lockdep_unlock+0x9e/0x1f0
  ? validate_chain+0x15b2/0x6d50
  ? reacquire_held_locks+0x510/0x510
  ? reacquire_held_locks+0x510/0x510
  ? reacquire_held_locks+0x510/0x510
  ? add_lock_to_list+0xbf/0x2c0
  ? lockdep_unlock+0x9e/0x1f0
  ? validate_chain+0x15b2/0x6d50
  ? reacquire_held_locks+0x510/0x510
  ? reacquire_held_locks+0x510/0x510
  ? xfs_buf_find_lock+0xb0/0x430 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? reacquire_held_locks+0x510/0x510
  ? validate_chain+0x176/0x6d50
  ? trace_lock_acquired+0x7b/0x180
  ? lock_is_held_type+0x8b/0x110
  ? lock_is_held_type+0x8b/0x110
  ? rcu_read_lock_sched_held+0x34/0x70
  ? reacquire_held_locks+0x510/0x510
  ? xfs_buf_get_map+0xd72/0x11a0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? lock_is_held_type+0x8b/0x110
  ? rcu_read_lock_sched_held+0x34/0x70
  ? trace_xfs_buf_read+0x7c/0x180 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_buf_read_map+0x111/0x700 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? lock_is_held_type+0x8b/0x110
  ? lock_is_held_type+0x8b/0x110
  ? xfs_btree_read_buf_block+0x205/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? rcu_read_lock_sched_held+0x34/0x70
  ? trace_xfs_trans_read_buf+0x79/0x170 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_btree_read_buf_block+0x205/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_trans_read_buf_map+0x303/0x4f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? trace_xfs_trans_getsb+0x170/0x170 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_verify_fsbno+0x74/0x130 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_btree_ptr_to_daddr+0x19b/0x660 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_buf_set_ref+0x1d/0x50 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_btree_read_buf_block+0x233/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? mark_lock+0x8f/0x320
  ? xfs_btree_readahead+0x250/0x250 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_verify_fsbno+0x74/0x130 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_btree_ptr_to_daddr+0x19b/0x660 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? __module_address+0x86/0x1e0
  ? ret_from_fork+0x1f/0x30
  ? deref_stack_reg+0x17f/0x210
  ? ret_from_fork+0x1f/0x30
  ? unwind_next_frame+0x16b/0x2240
  ? ret_from_fork+0x1f/0x30
  ? stack_trace_save+0x1e0/0x1e0
  ? arch_stack_walk+0xb7/0xf0
  ? lock_is_held_type+0x8b/0x110
  ? find_busiest_group+0x104e/0x2480
  ? load_balance+0x2540/0x2540
  ? stack_trace_save+0x10a/0x1e0
  ? mark_lock+0x8f/0x320
  ? __lock_acquire+0x122d/0x1e90
  ? lock_is_held_type+0x8b/0x110
  ? rcu_lock_acquire+0x30/0x30
  ? xfs_buf_ioend+0x248/0x450 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? xfs_buf_ioend+0x248/0x450 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
  ? __module_address+0x86/0x1e0
  ? ret_from_fork+0x1f/0x30
  ? deref_stack_reg+0x17f/0x210
  ? ret_from_fork+0x1f/0x30
  ? unwind_next_frame+0x16b/0x2240
  ? stack_trace_save+0x10a/0x1e0
  ? deref_stack_reg+0x17f/0x210
  ? look_up_lock_class+0x65/0x130
  ? register_lock_class+0x5d/0x860
  ? mark_lock+0x8f/0x320
  __lock_acquire+0x122d/0x1e90
  lock_acquire+0x13a/0x310
  ? __flush_work+0xb9/0x170
  ? read_lock_is_recursive+0x10/0x10
  ? lock_is_held_type+0x8b/0x110
  ? rcu_lock_acquire+0x30/0x30
  __flush_work+0xd5/0x170
  ? __flush_work+0xb9/0x170
  ? flush_work+0x10/0x10
  ? lock_is_held_type+0x8b/0x110
  ? __lock_acquire+0x1e90/0x1e90
  ? do_raw_spin_unlock+0x112/0x890
  ? mark_lock+0x8f/0x320
  ? lockdep_hardirqs_on_prepare+0x2d5/0x610
  __cancel_work_timer+0x36b/0x470
  ? cancel_work_sync+0x10/0x10
  ? mark_lock+0x8f/0x320
  ? lockdep_hardirqs_on_prepare+0x2d5/0x610
  ? nvmet_tcp_release_queue_work+0x24d/0x1000 [nvmet_tcp f61749ac066e0812c28869697bc2623872f02bd4]
  ? datagram_poll+0x380/0x380
  nvmet_tcp_release_queue_work+0x25c/0x1000 [nvmet_tcp f61749ac066e0812c28869697bc2623872f02bd4]
  process_one_work+0x756/0xbc0
  ? rescuer_thread+0x13f0/0x13f0
  ? lock_acquired+0x2f2/0x930
  ? worker_thread+0xf55/0x1480
  worker_thread+0x97a/0x1480
  ? rcu_lock_release+0x20/0x20
  kthread+0x228/0x2b0
  ? rcu_lock_release+0x20/0x20
  ? kthread_blkcg+0xa0/0xa0
  ret_from_fork+0x1f/0x30
  </TASK>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  8:27     ` Christoph Hellwig
@ 2023-03-22  9:07       ` Sagi Grimberg
  0 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  9:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-block, axboe, linux-nvme, Keith Busch


>>>    +unsigned int nvme_nr_io_queues(struct nvmf_ctrl_options *opts)
>>> +{
>>> +	unsigned int nr_io_queues;
>>> +
>>> +	nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
>>> +	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
>>> +	nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
>>> +
>>> +	return nr_io_queues;
>>> +}
>>> +EXPORT_SYMBOL_GPL(nvme_nr_io_queues);
>>
>> Given that it is shared only with tcp/rdma, maybe rename it
>> to nvmf_ip_nr_io_queues.
> 
> Even if the other transports don't use it, nothing is really IP
> specific here, so I don't think that's too useful.  But I'd use
> the nvmf_ prefix like other functions in this file.

OK.

> 
> Just a personal choice, but I'd write this as:
> 
> 	return min(opts->nr_io_queues, num_online_cpus()) +
> 		min(opts->nr_write_queues, num_online_cpus()) +
> 		min(opts->nr_poll_queues, num_online_cpus());
> 
>>> +EXPORT_SYMBOL_GPL(nvme_set_io_queues);
>>
>> nvmf_ip_set_io_queues. Unless you think that this can be shared with
>> pci/fc as well?
> 
> Again I'd drop the _ip as nothing is IP specific.  FC might or might not
> eventually use it, for PCI we don't have the opts structure anyway
> (never mind this sits in fabrics.c).

OK

> 
>>> +void nvme_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl,
>>> +		     struct ib_device *dev, u32 io_queues[HCTX_MAX_TYPES])
>>
>> Ugh, the ib_device input that may be null is bad...
>> I think that we can kill blk_mq_rdma_map_queues altogether
>> and unify the two.
> 
> Yes, I don't think anything touching an ib_device should be in
> common code.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  8:23   ` Christoph Hellwig
@ 2023-03-22  9:08     ` Sagi Grimberg
  0 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22  9:08 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: linux-block, axboe, linux-nvme, Keith Busch, Martin Belanger,
	Daniel Wagner


> Looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

This should go in independently of the rest of the series.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/3] nvme-fabrics: add queue setup helpers
  2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
                     ` (2 preceding siblings ...)
  2023-03-22  7:35   ` Sagi Grimberg
@ 2023-03-22  9:25   ` kernel test robot
  3 siblings, 0 replies; 28+ messages in thread
From: kernel test robot @ 2023-03-22  9:25 UTC (permalink / raw)
  To: Keith Busch, linux-block, axboe, linux-nvme, hch, sagi
  Cc: oe-kbuild-all, Keith Busch

Hi Keith,

I love your patch! Yet something to improve:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on linus/master v6.3-rc3 next-20230322]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/nvme-fabrics-add-queue-setup-helpers/20230322-082537
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20230322002350.4038048-2-kbusch%40meta.com
patch subject: [PATCH 1/3] nvme-fabrics: add queue setup helpers
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20230322/202303221736.jlRVazVR-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/0acb3964621b925db18ebec1a6c57bc3c3446859
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Keith-Busch/nvme-fabrics-add-queue-setup-helpers/20230322-082537
        git checkout 0acb3964621b925db18ebec1a6c57bc3c3446859
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202303221736.jlRVazVR-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "blk_mq_rdma_map_queues" [drivers/nvme/host/nvme-fabrics.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/3] nvme fabrics polling fixes
  2023-03-22  8:48 ` Daniel Wagner
@ 2023-03-22 13:24   ` Sagi Grimberg
  0 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2023-03-22 13:24 UTC (permalink / raw)
  To: Daniel Wagner, Keith Busch
  Cc: linux-block, axboe, linux-nvme, hch, Keith Busch

>> From: Keith Busch <kbusch@kernel.org>
>>
>> I couldn't test the existing tcp or rdma options, so I had to make a
>> loop poll option. The last patch fixes the polling queues when used with
>> fabrics.
>>
>> Note, this depends on patch I sent earlier today that I should have just
>> included in this series:
>>
>>    https://lore.kernel.org/linux-block/20230321215001.2655451-1-kbusch@meta.com/T/#u
> 
> I've tested this series with
> 
>    https://github.com/igaw/blktests/tree/queue-counts
> 
> and while for rdma all is good I got a lockdep warning for tcp:
> 
> 
>   ======================================================
>   WARNING: possible circular locking dependency detected
>   6.3.0-rc1+ #15 Tainted: G        W
>   ------------------------------------------------------
>   kworker/6:0/54 is trying to acquire lock:
>   ffff888121d88030 ((work_completion)(&queue->io_work)){+.+.}-{0:0}, at: __flush_work+0xb9/0x170
> 
>   but task is already holding lock:
>   ffff888100b0fd20 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x707/0xbc0
> 
>   which lock already depends on the new lock.
> 
> 
>   the existing dependency chain (in reverse order) is:
> 
>   -> #2 ((work_completion)(&queue->release_work)){+.+.}-{0:0}:
>          lock_acquire+0x13a/0x310
>          process_one_work+0x728/0xbc0
>          worker_thread+0x97a/0x1480
>          kthread+0x228/0x2b0
>          ret_from_fork+0x1f/0x30
> 
>   -> #1 ((wq_completion)nvmet-wq){+.+.}-{0:0}:
>          lock_acquire+0x13a/0x310
>          __flush_workqueue+0x185/0x14e0
>          nvmet_tcp_install_queue+0x63/0x270 [nvmet_tcp]
>          nvmet_install_queue+0x2b1/0x6a0 [nvmet]
>          nvmet_execute_admin_connect+0x381/0x880 [nvmet]
>          nvmet_tcp_io_work+0x15e8/0x8f60 [nvmet_tcp]
>          process_one_work+0x756/0xbc0
>          worker_thread+0x97a/0x1480
>          kthread+0x228/0x2b0
>          ret_from_fork+0x1f/0x30
> 
>   -> #0 ((work_completion)(&queue->io_work)){+.+.}-{0:0}:
>          validate_chain+0x19f1/0x6d50
>          __lock_acquire+0x122d/0x1e90
>          lock_acquire+0x13a/0x310
>          __flush_work+0xd5/0x170
>          __cancel_work_timer+0x36b/0x470
>          nvmet_tcp_release_queue_work+0x25c/0x1000 [nvmet_tcp]
>          process_one_work+0x756/0xbc0
>          worker_thread+0x97a/0x1480
>          kthread+0x228/0x2b0
>          ret_from_fork+0x1f/0x30
> 
>   other info that might help us debug this:
> 
>   Chain exists of:
>     (work_completion)(&queue->io_work) --> (wq_completion)nvmet-wq --> (work_completion)(&queue->release_work)

This came up before.

It's because in nvmet_tcp_install_queue we flush the nvmet_wq, to let
queue releases finish before accepting new queues. Without it, there is
no back-pressure and a host can simply reconnect in a loop and exhaust
nvmet system memory...

I don't think it's a deadlock, because the queue that flushes the
nvmet_wq cannot itself be in release; only other queues can be releasing
at that point. So I can't see how it would deadlock, but I don't know
how to teach that to lockdep.

I'll try to see if there is a different way to do this, without annoying
lockdep.
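
For reference, the flow being described looks roughly like this (a sketch
reconstructed from the explanation above, not a verbatim copy of the
driver):

	static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
	{
		if (sq->qid == 0) {
			/* let pending queue teardowns finish before accepting
			 * a new admin queue, so a reconnect loop cannot
			 * exhaust target memory */
			flush_workqueue(nvmet_wq);
		}
		/* ... rest of the queue installation elided ... */
		return 0;
	}

The release work runs on that same nvmet_wq, which is why lockdep links
the two, even though the queue doing the flush is never the one being
released.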

> 
>    Possible unsafe locking scenario:
> 
>          CPU0                    CPU1
>          ----                    ----
>     lock((work_completion)(&queue->release_work));
>                                  lock((wq_completion)nvmet-wq);
>                                  lock((work_completion)(&queue->release_work));
>     lock((work_completion)(&queue->io_work));
> 
>    *** DEADLOCK ***
> 
>   2 locks held by kworker/6:0/54:
>    #0: ffff888109ff6d48 ((wq_completion)nvmet-wq){+.+.}-{0:0}, at: process_one_work+0x6c8/0xbc0
>    #1: ffff888100b0fd20 ((work_completion)(&queue->release_work)){+.+.}-{0:0}, at: process_one_work+0x707/0xbc0
> 
>   stack backtrace:
>   CPU: 6 PID: 54 Comm: kworker/6:0 Tainted: G        W          6.3.0-rc1+ #15 f4d05de834b07d62567d33b70ec70fb0fa06f103
>   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>   Workqueue: nvmet-wq nvmet_tcp_release_queue_work [nvmet_tcp]
>   Call Trace:
>    <TASK>
>    dump_stack_lvl+0x5a/0x80
>    check_noncircular+0x2c8/0x390
>    ? add_chain_block+0x5e0/0x5e0
>    ? ret_from_fork+0x1f/0x30
>    ? lockdep_lock+0xd3/0x260
>    ? stack_trace_save+0x10a/0x1e0
>    ? stack_trace_snprint+0x100/0x100
>    ? check_noncircular+0x1a6/0x390
>    validate_chain+0x19f1/0x6d50
>    ? lockdep_unlock+0x9e/0x1f0
>    ? validate_chain+0x15b2/0x6d50
>    ? reacquire_held_locks+0x510/0x510
>    ? reacquire_held_locks+0x510/0x510
>    ? reacquire_held_locks+0x510/0x510
>    ? add_lock_to_list+0xbf/0x2c0
>    ? lockdep_unlock+0x9e/0x1f0
>    ? validate_chain+0x15b2/0x6d50
>    ? reacquire_held_locks+0x510/0x510
>    ? reacquire_held_locks+0x510/0x510
>    ? xfs_buf_find_lock+0xb0/0x430 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? reacquire_held_locks+0x510/0x510
>    ? validate_chain+0x176/0x6d50
>    ? trace_lock_acquired+0x7b/0x180
>    ? lock_is_held_type+0x8b/0x110
>    ? lock_is_held_type+0x8b/0x110
>    ? rcu_read_lock_sched_held+0x34/0x70
>    ? reacquire_held_locks+0x510/0x510
>    ? xfs_buf_get_map+0xd72/0x11a0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? lock_is_held_type+0x8b/0x110
>    ? rcu_read_lock_sched_held+0x34/0x70
>    ? trace_xfs_buf_read+0x7c/0x180 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_buf_read_map+0x111/0x700 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? lock_is_held_type+0x8b/0x110
>    ? lock_is_held_type+0x8b/0x110
>    ? xfs_btree_read_buf_block+0x205/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? rcu_read_lock_sched_held+0x34/0x70
>    ? trace_xfs_trans_read_buf+0x79/0x170 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_btree_read_buf_block+0x205/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_trans_read_buf_map+0x303/0x4f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? trace_xfs_trans_getsb+0x170/0x170 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_verify_fsbno+0x74/0x130 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_btree_ptr_to_daddr+0x19b/0x660 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_buf_set_ref+0x1d/0x50 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_btree_read_buf_block+0x233/0x300 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? mark_lock+0x8f/0x320
>    ? xfs_btree_readahead+0x250/0x250 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_verify_fsbno+0x74/0x130 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_btree_ptr_to_daddr+0x19b/0x660 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_dio_write_end_io+0x32f/0x3f0 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? __module_address+0x86/0x1e0
>    ? ret_from_fork+0x1f/0x30
>    ? deref_stack_reg+0x17f/0x210
>    ? ret_from_fork+0x1f/0x30
>    ? unwind_next_frame+0x16b/0x2240
>    ? ret_from_fork+0x1f/0x30
>    ? stack_trace_save+0x1e0/0x1e0
>    ? arch_stack_walk+0xb7/0xf0
>    ? lock_is_held_type+0x8b/0x110
>    ? find_busiest_group+0x104e/0x2480
>    ? load_balance+0x2540/0x2540
>    ? stack_trace_save+0x10a/0x1e0
>    ? mark_lock+0x8f/0x320
>    ? __lock_acquire+0x122d/0x1e90
>    ? lock_is_held_type+0x8b/0x110
>    ? rcu_lock_acquire+0x30/0x30
>    ? xfs_buf_ioend+0x248/0x450 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? xfs_buf_ioend+0x248/0x450 [xfs e56ce85f3b18232dbd061be3c73dc29bed4ad37b]
>    ? __module_address+0x86/0x1e0
>    ? ret_from_fork+0x1f/0x30
>    ? deref_stack_reg+0x17f/0x210
>    ? ret_from_fork+0x1f/0x30
>    ? unwind_next_frame+0x16b/0x2240
>    ? stack_trace_save+0x10a/0x1e0
>    ? deref_stack_reg+0x17f/0x210
>    ? look_up_lock_class+0x65/0x130
>    ? register_lock_class+0x5d/0x860
>    ? mark_lock+0x8f/0x320
>    __lock_acquire+0x122d/0x1e90
>    lock_acquire+0x13a/0x310
>    ? __flush_work+0xb9/0x170
>    ? read_lock_is_recursive+0x10/0x10
>    ? lock_is_held_type+0x8b/0x110
>    ? rcu_lock_acquire+0x30/0x30
>    __flush_work+0xd5/0x170
>    ? __flush_work+0xb9/0x170
>    ? flush_work+0x10/0x10
>    ? lock_is_held_type+0x8b/0x110
>    ? __lock_acquire+0x1e90/0x1e90
>    ? do_raw_spin_unlock+0x112/0x890
>    ? mark_lock+0x8f/0x320
>    ? lockdep_hardirqs_on_prepare+0x2d5/0x610
>    __cancel_work_timer+0x36b/0x470
>    ? cancel_work_sync+0x10/0x10
>    ? mark_lock+0x8f/0x320
>    ? lockdep_hardirqs_on_prepare+0x2d5/0x610
>    ? nvmet_tcp_release_queue_work+0x24d/0x1000 [nvmet_tcp f61749ac066e0812c28869697bc2623872f02bd4]
>    ? datagram_poll+0x380/0x380
>    nvmet_tcp_release_queue_work+0x25c/0x1000 [nvmet_tcp f61749ac066e0812c28869697bc2623872f02bd4]
>    process_one_work+0x756/0xbc0
>    ? rescuer_thread+0x13f0/0x13f0
>    ? lock_acquired+0x2f2/0x930
>    ? worker_thread+0xf55/0x1480
>    worker_thread+0x97a/0x1480
>    ? rcu_lock_release+0x20/0x20
>    kthread+0x228/0x2b0
>    ? rcu_lock_release+0x20/0x20
>    ? kthread_blkcg+0xa0/0xa0
>    ret_from_fork+0x1f/0x30
>    </TASK>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  8:46     ` Daniel Wagner
@ 2023-03-22 13:52       ` Christoph Hellwig
  2023-03-22 14:06         ` Daniel Wagner
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Hellwig @ 2023-03-22 13:52 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Keith Busch, linux-block, axboe, linux-nvme,
	sagi, Keith Busch

On Wed, Mar 22, 2023 at 09:46:51AM +0100, Daniel Wagner wrote:
> The blktests test I have written for this problem fails for loop without
> something like this. We can certainly teach blktests not to run a specific
> test for loop, but currently the _require_nvme_trtype_is_fabrics check
> includes loop.

Who says that we could support polling on all current and future
fabrics transports?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22 13:52       ` Christoph Hellwig
@ 2023-03-22 14:06         ` Daniel Wagner
  2023-03-22 14:20           ` Christoph Hellwig
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Wagner @ 2023-03-22 14:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-block, axboe, linux-nvme, sagi, Keith Busch

On Wed, Mar 22, 2023 at 02:52:00PM +0100, Christoph Hellwig wrote:
> Who says that we could support polling on all current and future
> fabrics transports?

I just assumed this is a generic feature supposed to be present in all
transports. I'll update my new blktests test to run only for tcp or rdma.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22 14:06         ` Daniel Wagner
@ 2023-03-22 14:20           ` Christoph Hellwig
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2023-03-22 14:20 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Keith Busch, linux-block, axboe, linux-nvme,
	sagi, Keith Busch

On Wed, Mar 22, 2023 at 03:06:19PM +0100, Daniel Wagner wrote:
> On Wed, Mar 22, 2023 at 02:52:00PM +0100, Christoph Hellwig wrote:
> > Who says that we could support polling on all current and future
> > fabrics transports?
> 
> I just assumed this is a generic feature supposed to present in all transports.
> I'll update my new blktest test to run only tcp or rdma.

The best idea would be trial and error, that is, do a _notrun if trying
to create a connection with the options fails.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/3] nvme: add polling options for loop target
  2023-03-22  8:23   ` Christoph Hellwig
  2023-03-22  8:46     ` Daniel Wagner
@ 2023-03-22 14:30     ` Keith Busch
  1 sibling, 0 replies; 28+ messages in thread
From: Keith Busch @ 2023-03-22 14:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, axboe, linux-nvme, sagi

On Wed, Mar 22, 2023 at 09:23:10AM +0100, Christoph Hellwig wrote:
> On Tue, Mar 21, 2023 at 05:23:49PM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > This is mostly for testing purposes.
> 
> I have to admit I'd rather not merge this upstream.  Any good reason
> why we'd absolutely want to have it?

The only value is that it's the easiest fabric to exercise some of these
generic code paths, and it's how I validated the fix in patch 3. Otherwise
it has no other practical use case, so I don't mind dropping it.

Let's just go with patch 3 only from this series. I'll rework patch 1 atop
Sagi's rdma affinity removal since it's a nice cleanup.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  8:37   ` Daniel Wagner
@ 2023-03-22 18:16     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 28+ messages in thread
From: Chaitanya Kulkarni @ 2023-03-22 18:16 UTC (permalink / raw)
  To: Daniel Wagner, Keith Busch
  Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch, Martin Belanger

On 3/22/23 01:37, Daniel Wagner wrote:
> On Tue, Mar 21, 2023 at 05:23:50PM -0700, Keith Busch wrote:
>> From: Keith Busch <kbusch@kernel.org>
>>
>> Polling needs a bio with a valid bi_bdev, but neither of those is
>> guaranteed for polled driver requests. Make request based polling
>> directly use blk-mq's polling function.
>>
>> When executing a request from a polled hctx, we know the request's
>> cookie, and that it's from a live multi-queue that supports polling, so
>> we can safely skip everything that bio_poll provides.
>>
>> Link: http://lists.infradead.org/pipermail/linux-nvme/2023-March/038340.html
>> Reported-by: Martin Belanger <Martin.Belanger@dell.com>
>> Reported-by: Daniel Wagner <dwagner@suse.de>
>> Signed-off-by: Keith Busch <kbusch@kernel.org>
> I've tested the whole series and this patch alone with rdma and tcp.
>
> Tested-by: Daniel Wagner <dwagner@suse.de>
> Reviewed-by: Daniel Wagner <dwagner@suse.de>

Thanks Daniel, please add my tag also.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/3] blk-mq: directly poll requests
  2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
                     ` (2 preceding siblings ...)
  2023-03-22  8:37   ` Daniel Wagner
@ 2023-03-31  7:57   ` Shinichiro Kawasaki
  3 siblings, 0 replies; 28+ messages in thread
From: Shinichiro Kawasaki @ 2023-03-31  7:57 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, axboe, linux-nvme, hch, sagi, Keith Busch,
	Martin Belanger, Daniel Wagner

On Mar 21, 2023 / 17:23, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Polling needs a bio with a valid bi_bdev, but neither of those is
> guaranteed for polled driver requests. Make request based polling
> directly use blk-mq's polling function.
> 
> When executing a request from a polled hctx, we know the request's
> cookie, and that it's from a live multi-queue that supports polling, so
> we can safely skip everything that bio_poll provides.
> 
> Link: http://lists.infradead.org/pipermail/linux-nvme/2023-March/038340.html
> Reported-by: Martin Belanger <Martin.Belanger@dell.com>
> Reported-by: Daniel Wagner <dwagner@suse.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>

I also tested this patch with rdma and tcp transports using the blktests patches
by Daniel [1]. It fixes the failure, and I did not observe test hangs. Good.

Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

I suggest adding the "Cc: stable@kernel.org" tag as well. Without this fix,
the new blktests test case causes a test process hang or a test system hang
on stable kernel versions 6.1.0 and 6.2.0. I confirmed the fix avoids the
hangs on those stable kernel versions (the patch cannot be applied to kernel
v5.15.105 due to conflicts).

[1] https://lore.kernel.org/linux-block/20230329090202.8351-1-dwagner@suse.de/

-- 
Shin'ichiro Kawasaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-03-31  7:57 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-22  0:23 [PATCH 0/3] nvme fabrics polling fixes Keith Busch
2023-03-22  0:23 ` [PATCH 1/3] nvme-fabrics: add queue setup helpers Keith Busch
2023-03-22  1:46   ` Chaitanya Kulkarni
2023-03-22  4:38   ` kernel test robot
2023-03-22  5:21     ` Chaitanya Kulkarni
2023-03-22  7:35   ` Sagi Grimberg
2023-03-22  8:27     ` Christoph Hellwig
2023-03-22  9:07       ` Sagi Grimberg
2023-03-22  9:25   ` kernel test robot
2023-03-22  0:23 ` [PATCH 2/3] nvme: add polling options for loop target Keith Busch
2023-03-22  1:47   ` Chaitanya Kulkarni
2023-03-22  7:44   ` Sagi Grimberg
2023-03-22  8:23   ` Christoph Hellwig
2023-03-22  8:46     ` Daniel Wagner
2023-03-22 13:52       ` Christoph Hellwig
2023-03-22 14:06         ` Daniel Wagner
2023-03-22 14:20           ` Christoph Hellwig
2023-03-22 14:30     ` Keith Busch
2023-03-22  0:23 ` [PATCH 3/3] blk-mq: directly poll requests Keith Busch
2023-03-22  7:36   ` Sagi Grimberg
2023-03-22  8:23   ` Christoph Hellwig
2023-03-22  9:08     ` Sagi Grimberg
2023-03-22  8:37   ` Daniel Wagner
2023-03-22 18:16     ` Chaitanya Kulkarni
2023-03-31  7:57   ` Shinichiro Kawasaki
2023-03-22  7:31 ` [PATCH 0/3] nvme fabrics polling fixes Sagi Grimberg
2023-03-22  8:48 ` Daniel Wagner
2023-03-22 13:24   ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.