All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kanchan Joshi <joshi.k@samsung.com>
To: hch@lst.de, sagi@grimberg.me, kbusch@kernel.org, axboe@kernel.dk
Cc: io-uring@vger.kernel.org, linux-nvme@lists.infradead.org,
	linux-block@vger.kernel.org, asml.silence@gmail.com,
	joshiiitr@gmail.com, anuj20.g@samsung.com, gost.dev@samsung.com,
	Kanchan Joshi <joshi.k@samsung.com>
Subject: [PATCH for-next 4/4] nvme-multipath: add multipathing for uring-passthrough commands
Date: Mon, 11 Jul 2022 16:31:55 +0530	[thread overview]
Message-ID: <20220711110155.649153-5-joshi.k@samsung.com> (raw)
In-Reply-To: <20220711110155.649153-1-joshi.k@samsung.com>

From: Anuj Gupta <anuj20.g@samsung.com>

This heavily reuses the nvme-mpath infrastructure for the block path.
The block path operates on a bio, and uses that as the long-lived object
across requeue attempts. For the passthrough interface, io_uring_cmd
happens to be such an object, so add a requeue list for that.

If a path is not available, uring-cmds are added to that list and
resubmitted when a new path is discovered. The existing requeue handler
(nvme_requeue_work) is extended to do this resubmission.

For failed commands, failover is done directly (i.e. without first
queuing into list) by resubmitting individual command from task-work
based callback.

Suggested-by: Sagi Grimberg <sagi@grimberg.me>
Co-developed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/ioctl.c     | 141 ++++++++++++++++++++++++++++++++--
 drivers/nvme/host/multipath.c |  36 ++++++++-
 drivers/nvme/host/nvme.h      |  12 +++
 include/linux/io_uring.h      |   2 +
 4 files changed, 182 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index fc02eddd4977..281c21d3c67d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -340,12 +340,6 @@ struct nvme_uring_data {
 	__u32	timeout_ms;
 };
 
-static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
-		struct io_uring_cmd *ioucmd)
-{
-	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
-}
-
 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
@@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
 	pdu->meta_len = d.metadata_len;
 
+	if (issue_flags & IO_URING_F_MPATH) {
+		req->cmd_flags |= REQ_NVME_MPATH;
+		/*
+		 * we would need the buffer address (d.addr field) if we have
+		 * to retry the command. Store it by repurposing ioucmd->cmd
+		 */
+		ioucmd->cmd = (void *)d.addr;
+	}
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
 }
@@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	int srcu_idx = srcu_read_lock(&head->srcu);
 	struct nvme_ns *ns = nvme_find_path(head);
 	int ret = -EINVAL;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		ret = nvme_ns_uring_cmd(ns, ioucmd,
+				issue_flags | IO_URING_F_MPATH);
+	} else if (nvme_available_path(head)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct nvme_uring_cmd *payload = NULL;
+
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		/*
+		 * We may requeue two kinds of uring commands:
+		 * 1. command failed post submission. pdu->req will be non-null
+		 * for this
+		 * 2. command that could not be submitted because path was not
+		 * available. For this pdu->req is set to NULL.
+		 */
+		pdu->req = NULL;
+		/*
+		 * passthrough command will not be available during retry as it
+		 * is embedded in io_uring's SQE. Hence we allocate/copy here
+		 */
+		payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
+		if (!payload) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
+		ioucmd->cmd = payload;
 
-	if (ns)
-		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+		ret = -EIOCBQUEUED;
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+	}
+out_unlock:
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
+
+int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
+		struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
+	struct nvme_uring_data d;
+	struct nvme_command c;
+	struct request *req;
+	struct bio *obio = oreq->bio;
+	void *meta = NULL;
+
+	memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
+	d.metadata = (__u64)pdu->meta_buffer;
+	d.metadata_len = pdu->meta_len;
+	d.timeout_ms = oreq->timeout;
+	d.addr = (__u64)ioucmd->cmd;
+	if (obio) {
+		d.data_len = obio->bi_iter.bi_size;
+		blk_rq_unmap_user(obio);
+	} else {
+		d.data_len = 0;
+	}
+	blk_mq_free_request(oreq);
+	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
+			d.data_len, nvme_to_user_ptr(d.metadata),
+			d.metadata_len, 0, &meta, d.timeout_ms ?
+			msecs_to_jiffies(d.timeout_ms) : 0,
+			ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->end_io = nvme_uring_cmd_end_io;
+	req->end_io_data = ioucmd;
+	pdu->bio = req->bio;
+	pdu->meta = meta;
+	req->cmd_flags |= REQ_NVME_MPATH;
+	blk_execute_rq_nowait(req, false);
+	return -EIOCBQUEUED;
+}
+
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
+			cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
+		IO_URING_F_MPATH;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct request *oreq = pdu->req;
+		int ret;
+
+		if (oreq == NULL) {
+			/*
+			 * this was not submitted (to device) earlier. For this
+			 * ioucmd->cmd points to persistent memory. Free that
+			 * up post submission
+			 */
+			const void *cmd = ioucmd->cmd;
+
+			ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+			kfree(cmd);
+		} else {
+			/*
+			 * this was submitted (to device) earlier. Use old
+			 * request, bio (if it exists) and nvme-pdu to recreate
+			 * the command for the discovered path
+			 */
+			ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
+		}
+		if (ret != -EIOCBQUEUED)
+			io_uring_cmd_done(ioucmd, ret, 0);
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+		io_uring_cmd_done(ioucmd, -EINVAL, 0);
+	}
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f26640ccb955..fe5655d98c36 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -6,6 +6,7 @@
 #include <linux/backing-dev.h>
 #include <linux/moduleparam.h>
 #include <linux/vmalloc.h>
+#include <linux/io_uring.h>
 #include <trace/events/block.h>
 #include "nvme.h"
 
@@ -80,6 +81,17 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 			blk_freeze_queue_start(h->disk->queue);
 }
 
+static void nvme_ioucmd_failover_req(struct request *req, struct nvme_ns *ns)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* store the request, to reconstruct the command during retry */
+	pdu->req = req;
+	/* retry in original submitter context */
+	io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -99,6 +111,11 @@ void nvme_failover_req(struct request *req)
 		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
+	if (blk_rq_is_passthrough(req)) {
+		nvme_ioucmd_failover_req(req, ns);
+		return;
+	}
+
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	for (bio = req->bio; bio; bio = bio->bi_next) {
 		bio_set_dev(bio, ns->head->disk->part0);
@@ -314,7 +331,7 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
-static bool nvme_available_path(struct nvme_ns_head *head)
+bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
 
@@ -459,7 +476,9 @@ static void nvme_requeue_work(struct work_struct *work)
 	struct nvme_ns_head *head =
 		container_of(work, struct nvme_ns_head, requeue_work);
 	struct bio *bio, *next;
+	struct io_uring_cmd *ioucmd, *ioucmd_next;
 
+	/* process requeued bios*/
 	spin_lock_irq(&head->requeue_lock);
 	next = bio_list_get(&head->requeue_list);
 	spin_unlock_irq(&head->requeue_lock);
@@ -470,6 +489,21 @@ static void nvme_requeue_work(struct work_struct *work)
 
 		submit_bio_noacct(bio);
 	}
+
+	/* process requeued passthrough-commands */
+	spin_lock_irq(&head->requeue_ioucmd_lock);
+	ioucmd_next = ioucmd_list_get(&head->requeue_ioucmd_list);
+	spin_unlock_irq(&head->requeue_ioucmd_lock);
+
+	while ((ioucmd = ioucmd_next) != NULL) {
+		ioucmd_next = ioucmd->next;
+		ioucmd->next = NULL;
+		/*
+		 * Retry in original submitter context as we would be
+		 * processing the passthrough command again
+		 */
+		io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+	}
 }
 
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9d3ff6feda06..125b48e74e72 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -189,6 +190,12 @@ enum {
 	NVME_REQ_USERCMD		= (1 << 1),
 };
 
+static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
 static inline struct nvme_request *nvme_req(struct request *req)
 {
 	return blk_mq_rq_to_pdu(req);
@@ -442,6 +449,9 @@ struct nvme_ns_head {
 	struct work_struct	requeue_work;
 	struct mutex		lock;
 	unsigned long		flags;
+	/* for uring-passthru multipath handling */
+	struct ioucmd_list	requeue_ioucmd_list;
+	spinlock_t		requeue_ioucmd_lock;
 #define NVME_NSHEAD_DISK_LIVE	0
 	struct nvme_ns __rcu	*current_path[];
 #endif
@@ -830,6 +840,7 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 
@@ -844,6 +855,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 	return ctrl->ana_log_buf != NULL;
 }
 
+bool nvme_available_path(struct nvme_ns_head *head);
 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index d734599cbcd7..57f4dfc83316 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_SQE128		= 4,
 	IO_URING_F_CQE32		= 8,
 	IO_URING_F_IOPOLL		= 16,
+	/* to indicate that it is a MPATH req*/
+	IO_URING_F_MPATH		= 32,
 };
 
 struct io_uring_cmd {
-- 
2.25.1


  parent reply	other threads:[~2022-07-11 11:28 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20220711110753epcas5p4169b9e288d15ca35740dbb66a6f6983a@epcas5p4.samsung.com>
2022-07-11 11:01 ` [PATCH for-next 0/4] nvme-multipathing for uring-passthrough Kanchan Joshi
     [not found]   ` <CGME20220711110800epcas5p3d338dd486fd778c5ba5bfe93a91ec8bd@epcas5p3.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 1/4] io_uring, nvme: rename a function Kanchan Joshi
2022-07-14 13:55       ` Ming Lei
     [not found]   ` <CGME20220711110812epcas5p33aa90b23aa62fb11722aa8195754becf@epcas5p3.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 2/4] nvme: compact nvme_uring_cmd_pdu struct Kanchan Joshi
2022-07-12  6:32       ` Christoph Hellwig
     [not found]   ` <CGME20220711110824epcas5p22c8e945cb8c3c3ac46c8c2b5ab55db9b@epcas5p2.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 3/4] io_uring: grow a field in struct io_uring_cmd Kanchan Joshi
2022-07-11 17:00       ` Sagi Grimberg
2022-07-11 17:19         ` Jens Axboe
2022-07-11 17:18       ` Jens Axboe
2022-07-11 17:55         ` Sagi Grimberg
2022-07-11 18:22           ` Sagi Grimberg
2022-07-11 18:24             ` Jens Axboe
2022-07-11 18:58               ` Sagi Grimberg
2022-07-12 11:40               ` Kanchan Joshi
2022-07-14  3:40             ` Ming Lei
2022-07-14  8:19               ` Kanchan Joshi
2022-07-14 15:30                 ` Daniel Wagner
2022-07-15 11:07                   ` Kanchan Joshi
2022-07-18  9:03                     ` Daniel Wagner
     [not found]   ` <CGME20220711110827epcas5p3fd81f142f55ca3048abc38a9ef0d0089@epcas5p3.samsung.com>
2022-07-11 11:01     ` Kanchan Joshi [this message]
2022-07-11 13:51       ` [PATCH for-next 4/4] nvme-multipath: add multipathing for uring-passthrough commands Sagi Grimberg
2022-07-11 15:12         ` Stefan Metzmacher
2022-07-11 16:58           ` Sagi Grimberg
2022-07-11 18:54           ` Kanchan Joshi
2022-07-11 18:37         ` Kanchan Joshi
2022-07-11 19:56           ` Sagi Grimberg
2022-07-12  4:23             ` Kanchan Joshi
2022-07-12 21:26               ` Sagi Grimberg
2022-07-13  5:37                 ` Kanchan Joshi
2022-07-13  9:03                   ` Sagi Grimberg
2022-07-13 11:28                     ` Kanchan Joshi
2022-07-13 12:17                       ` Sagi Grimberg
2022-07-14 15:14                   ` Ming Lei
2022-07-14 23:05                     ` Kanchan Joshi
2022-07-15  1:35                       ` Ming Lei
2022-07-15  1:46                         ` Ming Lei
2022-07-15  4:24                           ` Kanchan Joshi
2022-07-12  6:52       ` Christoph Hellwig
2022-07-12 11:33         ` Kanchan Joshi
2022-07-12 20:13         ` Sagi Grimberg
2022-07-13  5:36           ` Christoph Hellwig
2022-07-13  8:04             ` Sagi Grimberg
2022-07-13 10:12               ` Christoph Hellwig
2022-07-13 11:00                 ` Sagi Grimberg
2022-07-13 11:28                   ` Christoph Hellwig
2022-07-13 12:16                     ` Sagi Grimberg
2022-07-13 11:49                   ` Hannes Reinecke
2022-07-13 12:43                     ` Sagi Grimberg
2022-07-13 13:30                       ` Hannes Reinecke
2022-07-13 13:41                         ` Sagi Grimberg
2022-07-13 14:07                           ` Hannes Reinecke
2022-07-13 15:59                             ` Sagi Grimberg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220711110155.649153-5-joshi.k@samsung.com \
    --to=joshi.k@samsung.com \
    --cc=anuj20.g@samsung.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=gost.dev@samsung.com \
    --cc=hch@lst.de \
    --cc=io-uring@vger.kernel.org \
    --cc=joshiiitr@gmail.com \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.