linux-nvme.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [RFC PATCH v3 0/3] Async nvme passthrough over io_uring
       [not found] <CGME20210316140229epcas5p23d68a4c9694bbf7759b5901115a4309b@epcas5p2.samsung.com>
@ 2021-03-16 14:01 ` Kanchan Joshi
       [not found]   ` <CGME20210316140233epcas5p372405e7cb302c61dba5e1094fa796513@epcas5p3.samsung.com>
                     ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-16 14:01 UTC (permalink / raw)
  To: axboe, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty,
	selvakuma.s1, Kanchan Joshi

This series adds async passthrough capability for nvme block-dev over
iouring_cmd. The patches are on top of Jens uring-cmd branch:
https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v3

Application is expected to allocate passthrough command structure, set
it up traditionally, and pass its address via "block_uring_cmd->addr".
On completion, CQE is posted with completion-status after any ioctl
specific buffer/field update.

Changes from v2:
1. Rebase against latest uring-cmd branch of Jens
2. Remove per-io nvme_command allocation
3. Disallow passthrough commands with non-zero command effects

Change from v1:
1. Rewire the work on top of Jens uring-cmd interface
2. Support only passthrough, and not other nvme ioctls

Kanchan Joshi (3):
  io_uring: add helper for uring_cmd completion in submitter-task
  nvme: keep nvme_command instead of pointer to it
  nvme: wire up support for async passthrough

 drivers/nvme/host/core.c     | 186 ++++++++++++++++++++++++++++++-----
 drivers/nvme/host/fabrics.c  |   4 +-
 drivers/nvme/host/lightnvm.c |  16 +--
 drivers/nvme/host/nvme.h     |   5 +-
 drivers/nvme/host/pci.c      |   1 +
 fs/io_uring.c                |  36 ++++++-
 include/linux/io_uring.h     |   8 ++
 7 files changed, 213 insertions(+), 43 deletions(-)

-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
       [not found]   ` <CGME20210316140233epcas5p372405e7cb302c61dba5e1094fa796513@epcas5p3.samsung.com>
@ 2021-03-16 14:01     ` Kanchan Joshi
  2021-03-16 15:43       ` Stefan Metzmacher
  2021-03-18  1:57       ` Jens Axboe
  0 siblings, 2 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-16 14:01 UTC (permalink / raw)
  To: axboe, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty,
	selvakuma.s1, Kanchan Joshi

Completion of a uring_cmd ioctl may involve referencing certain
ioctl-specific fields, requiring original subitter context.
Introduce 'uring_cmd_complete_in_task' that driver can use for this
purpose. The API facilitates task-work infra, while driver gets to
implement cmd-specific handling in a callback.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 fs/io_uring.c            | 36 ++++++++++++++++++++++++++++++++----
 include/linux/io_uring.h |  8 ++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 583f8fd735d8..ca459ea9cb83 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -772,9 +772,12 @@ struct io_kiocb {
 		/* use only after cleaning per-op data, see io_clean_op() */
 		struct io_completion	compl;
 	};
-
-	/* opcode allocated if it needs to store data for async defer */
-	void				*async_data;
+	union {
+		/* opcode allocated if it needs to store data for async defer */
+		void				*async_data;
+		/* used for uring-cmd, when driver needs to update in task */
+		void (*driver_cb)(struct io_uring_cmd *cmd);
+	};
 	u8				opcode;
 	/* polled IO has completed */
 	u8				iopoll_completed;
@@ -1716,7 +1719,7 @@ static void io_dismantle_req(struct io_kiocb *req)
 {
 	io_clean_op(req);
 
-	if (req->async_data)
+	if (io_op_defs[req->opcode].async_size && req->async_data)
 		kfree(req->async_data);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
@@ -2032,6 +2035,31 @@ static void io_req_task_submit(struct callback_head *cb)
 	__io_req_task_submit(req);
 }
 
+static void uring_cmd_work(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_uring_cmd *cmd = &req->uring_cmd;
+
+	req->driver_cb(cmd);
+}
+int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *))
+{
+	int ret;
+	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
+	req->driver_cb = driver_cb;
+	req->task_work.func = uring_cmd_work;
+	ret = io_req_task_work_add(req);
+	if (unlikely(ret)) {
+		req->result = -ECANCELED;
+		percpu_ref_get(&req->ctx->refs);
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(uring_cmd_complete_in_task);
+
 static void io_req_task_queue(struct io_kiocb *req)
 {
 	int ret;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index e0a31354eff1..559f41d0f19a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -41,6 +41,8 @@ struct io_uring_cmd {
 
 #if defined(CONFIG_IO_URING)
 void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret);
+int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *));
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_task_cancel(void);
 void __io_uring_files_cancel(struct files_struct *files);
@@ -65,6 +67,12 @@ static inline void io_uring_free(struct task_struct *tsk)
 static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret)
 {
 }
+
+int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *))
+{
+	return -1;
+}
 static inline struct sock *io_uring_get_socket(struct file *file)
 {
 	return NULL;
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it
       [not found]   ` <CGME20210316140236epcas5p4de087ee51a862402146fbbc621d4d4c6@epcas5p4.samsung.com>
@ 2021-03-16 14:01     ` Kanchan Joshi
  2021-03-16 17:16       ` Keith Busch
  0 siblings, 1 reply; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-16 14:01 UTC (permalink / raw)
  To: axboe, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty,
	selvakuma.s1, Kanchan Joshi

nvme_req structure originally contained a pointer to nvme_command.
Change nvme_req structure to keep the command itself.
This helps in avoiding hot-path memory-allocation for async-passthrough.

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 drivers/nvme/host/core.c     |  6 +++---
 drivers/nvme/host/fabrics.c  |  4 ++--
 drivers/nvme/host/lightnvm.c | 16 +++++-----------
 drivers/nvme/host/nvme.h     |  2 +-
 4 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e68a8c4ac5a6..46c1bb7a89f0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -596,7 +596,7 @@ static inline void nvme_init_request(struct request *req,
 
 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
 	nvme_clear_nvme_request(req);
-	nvme_req(req)->cmd = cmd;
+	nvme_req(req)->cmd = *cmd;
 }
 
 struct request *nvme_alloc_request(struct request_queue *q,
@@ -728,7 +728,7 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static void nvme_setup_passthrough(struct request *req,
 		struct nvme_command *cmd)
 {
-	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+	memcpy(cmd, &nvme_req(req)->cmd, sizeof(*cmd));
 	/* passthru commands should let the driver set the SGL flags */
 	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
 }
@@ -1128,7 +1128,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 
 void nvme_execute_passthru_rq(struct request *rq)
 {
-	struct nvme_command *cmd = nvme_req(rq)->cmd;
+	struct nvme_command *cmd = &nvme_req(rq)->cmd;
 	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
 	struct nvme_ns *ns = rq->q->queuedata;
 	struct gendisk *disk = ns ? ns->disk : NULL;
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5dfd806fc2d2..c374dcf6595e 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -578,8 +578,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
 	 */
 	switch (ctrl->state) {
 	case NVME_CTRL_CONNECTING:
-		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
-		    req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
+		if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(&req->cmd) &&
+		    req->cmd.fabrics.fctype == nvme_fabrics_type_connect)
 			return true;
 		break;
 	default:
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index b705988629f2..5f4d7f0f5d8d 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -640,7 +640,6 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
 	rqd->error = nvme_req(rq)->status;
 	nvm_end_io(rqd);
 
-	kfree(nvme_req(rq)->cmd);
 	blk_mq_free_request(rq);
 }
 
@@ -672,25 +671,21 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
 {
 	struct nvm_geo *geo = &dev->geo;
 	struct request_queue *q = dev->q;
-	struct nvme_nvm_command *cmd;
+	struct nvme_nvm_command cmd;
 	struct request *rq;
 	int ret;
 
-	cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
-	if (!cmd)
-		return -ENOMEM;
-
-	rq = nvme_nvm_alloc_request(q, rqd, cmd);
+	rq = nvme_nvm_alloc_request(q, rqd, &cmd);
 	if (IS_ERR(rq)) {
 		ret = PTR_ERR(rq);
-		goto err_free_cmd;
+		goto err_cmd;
 	}
 
 	if (buf) {
 		ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas,
 				GFP_KERNEL);
 		if (ret)
-			goto err_free_cmd;
+			goto err_cmd;
 	}
 
 	rq->end_io_data = rqd;
@@ -699,8 +694,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
 
 	return 0;
 
-err_free_cmd:
-	kfree(cmd);
+err_cmd:
 	return ret;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 07b34175c6ce..0254aa611dfa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -157,7 +157,7 @@ enum nvme_quirks {
  * this structure as the first member of their request-private data.
  */
 struct nvme_request {
-	struct nvme_command	*cmd;
+	struct nvme_command	cmd;
 	union nvme_result	result;
 	u8			retries;
 	u8			flags;
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
       [not found]   ` <CGME20210316140240epcas5p3e71bfe2afecd728c5af60056f21cc9b7@epcas5p3.samsung.com>
@ 2021-03-16 14:01     ` Kanchan Joshi
  2021-03-17  8:52       ` Christoph Hellwig
  2021-03-17 16:45       ` Keith Busch
  0 siblings, 2 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-16 14:01 UTC (permalink / raw)
  To: axboe, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty,
	selvakuma.s1, Kanchan Joshi

Introduce handler for mq_ops->uring_cmd(), implementing async
passthrough on block-device.

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
 drivers/nvme/host/core.c | 180 ++++++++++++++++++++++++++++++++++-----
 drivers/nvme/host/nvme.h |   3 +
 drivers/nvme/host/pci.c  |   1 +
 3 files changed, 162 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 46c1bb7a89f0..c4f0e54fe9a5 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1053,6 +1053,88 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 	return ERR_PTR(ret);
 }
 
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+	if (in_compat_syscall())
+		ptrval = (compat_uptr_t)ptrval;
+	return (void __user *)ptrval;
+}
+/*
+ * This is carved within the block_uring_cmd, to avoid dynamic allocation.
+ * Care should be taken not to grow this beyond what is available.
+ */
+struct uring_cmd_data {
+	union {
+		struct bio *bio;
+		u64 result; /* nvme cmd result */
+	};
+	void *meta; /* kernel-resident buffer */
+	int status; /* nvme cmd status */
+};
+
+inline u64 *ucmd_data_addr(struct io_uring_cmd *ioucmd)
+{
+	return &(((struct block_uring_cmd *)&ioucmd->pdu)->unused[0]);
+}
+
+void ioucmd_task_cb(struct io_uring_cmd *ioucmd)
+{
+	struct uring_cmd_data *ucd;
+	struct nvme_passthru_cmd __user *ptcmd;
+	struct block_uring_cmd *bcmd;
+
+	bcmd = (struct block_uring_cmd *) &ioucmd->pdu;
+	ptcmd = (void __user *) bcmd->addr;
+	ucd = (struct uring_cmd_data *) ucmd_data_addr(ioucmd);
+
+	/* handle meta update */
+	if (ucd->meta) {
+		void __user *umeta = nvme_to_user_ptr(ptcmd->metadata);
+
+		if (!ucd->status)
+			if (copy_to_user(umeta, ucd->meta, ptcmd->metadata_len))
+				ucd->status = -EFAULT;
+		kfree(ucd->meta);
+	}
+	/* handle result update */
+	if (put_user(ucd->result, (u32 __user *)&ptcmd->result))
+		ucd->status = -EFAULT;
+	io_uring_cmd_done(ioucmd, ucd->status);
+}
+
+void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+	struct io_uring_cmd *ioucmd;
+	struct uring_cmd_data *ucd;
+	struct bio *bio;
+	int ret;
+
+	ioucmd = req->end_io_data;
+	ucd = (struct uring_cmd_data *) ucmd_data_addr(ioucmd);
+	/* extract bio before reusing the same field for status */
+	bio = ucd->bio;
+
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		ucd->status = -EINTR;
+	else
+		ucd->status = nvme_req(req)->status;
+	ucd->result = le64_to_cpu(nvme_req(req)->result.u64);
+
+	/* this takes care of setting up task-work */
+	ret = uring_cmd_complete_in_task(ioucmd, ioucmd_task_cb);
+	if (ret < 0)
+		kfree(ucd->meta);
+
+	/* unmap pages, free bio, nvme command and request */
+	blk_rq_unmap_user(bio);
+	blk_mq_free_request(req);
+}
+
 static u32 nvme_known_admin_effects(u8 opcode)
 {
 	switch (opcode) {
@@ -1140,10 +1222,27 @@ void nvme_execute_passthru_rq(struct request *rq)
 }
 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
 
+static void nvme_setup_uring_cmd_data(struct request *rq,
+		struct io_uring_cmd *ioucmd, void *meta, bool write)
+{
+	struct uring_cmd_data *ucd;
+
+	ucd = (struct uring_cmd_data *) ucmd_data_addr(ioucmd);
+	/* to free bio on completion, as req->bio will be null at that time */
+	ucd->bio = rq->bio;
+	/* meta update is required only for read requests */
+	if (meta && !write)
+		ucd->meta = meta;
+	else
+		ucd->meta = NULL;
+	rq->end_io_data = ioucmd;
+}
+
 static int nvme_submit_user_cmd(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, u64 *result, unsigned timeout)
+		u32 meta_seed, u64 *result, unsigned int timeout,
+		struct io_uring_cmd *ioucmd)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -1179,6 +1278,20 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 			req->cmd_flags |= REQ_INTEGRITY;
 		}
 	}
+	if (ioucmd) { /* async handling */
+		u32 effects;
+
+		effects = nvme_command_effects(ns->ctrl, ns, cmd->common.opcode);
+		/* filter commands with non-zero effects, keep it simple for now*/
+		if (effects) {
+			ret = -EOPNOTSUPP;
+			goto out_unmap;
+		}
+		nvme_setup_uring_cmd_data(req, ioucmd, meta, write);
+		blk_execute_rq_nowait(ns ? ns->disk : NULL, req, 0,
+					nvme_end_async_pt);
+		return 0;
+	}
 
 	nvme_execute_passthru_rq(req);
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -1544,18 +1657,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
 	queue_work(nvme_wq, &ctrl->async_event_work);
 }
 
-/*
- * Convert integer values from ioctl structures to user pointers, silently
- * ignoring the upper bits in the compat case to match behaviour of 32-bit
- * kernels.
- */
-static void __user *nvme_to_user_ptr(uintptr_t ptrval)
-{
-	if (in_compat_syscall())
-		ptrval = (compat_uptr_t)ptrval;
-	return (void __user *)ptrval;
-}
-
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_user_io io;
@@ -1616,11 +1717,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 
 	return nvme_submit_user_cmd(ns->queue, &c,
 			nvme_to_user_ptr(io.addr), length,
-			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
+			NULL);
 }
 
 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-			struct nvme_passthru_cmd __user *ucmd)
+			struct nvme_passthru_cmd __user *ucmd,
+			struct io_uring_cmd *ioucmd)
 {
 	struct nvme_passthru_cmd cmd;
 	struct nvme_command c;
@@ -1654,9 +1757,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &result, timeout);
+			0, &result, timeout, ioucmd);
 
-	if (status >= 0) {
+	if (!ioucmd && status >= 0) {
 		if (put_user(result, &ucmd->result))
 			return -EFAULT;
 	}
@@ -1698,7 +1801,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
 			nvme_to_user_ptr(cmd.addr), cmd.data_len,
 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
-			0, &cmd.result, timeout);
+			0, &cmd.result, timeout, NULL);
 
 	if (status >= 0) {
 		if (put_user(cmd.result, &ucmd->result))
@@ -1760,7 +1863,7 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 
 	switch (cmd) {
 	case NVME_IOCTL_ADMIN_CMD:
-		ret = nvme_user_cmd(ctrl, NULL, argp);
+		ret = nvme_user_cmd(ctrl, NULL, argp, NULL);
 		break;
 	case NVME_IOCTL_ADMIN64_CMD:
 		ret = nvme_user_cmd64(ctrl, NULL, argp);
@@ -1799,7 +1902,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 		ret = ns->head->ns_id;
 		break;
 	case NVME_IOCTL_IO_CMD:
-		ret = nvme_user_cmd(ns->ctrl, ns, argp);
+		ret = nvme_user_cmd(ns->ctrl, ns, argp, NULL);
 		break;
 	case NVME_IOCTL_SUBMIT_IO:
 		ret = nvme_submit_io(ns, argp);
@@ -1818,6 +1921,39 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
 	return ret;
 }
 
+int nvme_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd,
+		enum io_uring_cmd_flags flags)
+{
+	struct nvme_ns_head *head = NULL;
+	struct block_device *bdev = I_BDEV(ioucmd->file->f_mapping->host);
+	struct block_uring_cmd *bcmd = (struct block_uring_cmd *)&ioucmd->pdu;
+	struct nvme_ns *ns;
+	int srcu_idx, ret;
+	void __user *argp = (void __user *) bcmd->addr;
+
+	BUILD_BUG_ON(sizeof(struct uring_cmd_data) >
+			sizeof(struct block_uring_cmd) -
+			offsetof(struct block_uring_cmd, unused));
+
+	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+	if (unlikely(!ns))
+		return -EWOULDBLOCK;
+
+	switch (bcmd->ioctl_cmd) {
+	case NVME_IOCTL_IO_CMD:
+		ret = nvme_user_cmd(ns->ctrl, ns, argp, ioucmd);
+		break;
+	default:
+		ret = -ENOTTY;
+	}
+
+	if (ret >= 0)
+		ret = -EIOCBQUEUED;
+	nvme_put_ns_from_disk(head, srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_uring_cmd);
+
 #ifdef CONFIG_COMPAT
 struct nvme_user_io32 {
 	__u8	opcode;
@@ -3309,7 +3445,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
 	kref_get(&ns->kref);
 	up_read(&ctrl->namespaces_rwsem);
 
-	ret = nvme_user_cmd(ctrl, ns, argp);
+	ret = nvme_user_cmd(ctrl, ns, argp, NULL);
 	nvme_put_ns(ns);
 	return ret;
 
@@ -3326,7 +3462,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 
 	switch (cmd) {
 	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_cmd(ctrl, NULL, argp);
+		return nvme_user_cmd(ctrl, NULL, argp, NULL);
 	case NVME_IOCTL_ADMIN64_CMD:
 		return nvme_user_cmd64(ctrl, NULL, argp);
 	case NVME_IOCTL_IO_CMD:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0254aa611dfa..f3daee4a4848 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@
 #include <linux/t10-pi.h>
 
 #include <trace/events/block.h>
+#include <linux/io_uring.h>
 
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
@@ -620,6 +621,8 @@ int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
 
 #define NVME_QID_ANY -1
+int nvme_uring_cmd(struct request_queue *q, struct io_uring_cmd *ucmd,
+		enum io_uring_cmd_flags flags);
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags);
 void nvme_cleanup_cmd(struct request *req);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7b6632c00ffd..6c84dc964259 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1629,6 +1629,7 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
+	.uring_cmd	= nvme_uring_cmd,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
  2021-03-16 14:01     ` [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task Kanchan Joshi
@ 2021-03-16 15:43       ` Stefan Metzmacher
  2021-03-18  1:57       ` Jens Axboe
  1 sibling, 0 replies; 24+ messages in thread
From: Stefan Metzmacher @ 2021-03-16 15:43 UTC (permalink / raw)
  To: Kanchan Joshi, axboe, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty, selvakuma.s1


Hi Kanchan,

> +static void uring_cmd_work(struct callback_head *cb)
> +{
> +	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> +	struct io_uring_cmd *cmd = &req->uring_cmd;
> +
> +	req->driver_cb(cmd);
> +}
> +int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> +			void (*driver_cb)(struct io_uring_cmd *))
> +{
> +	int ret;
> +	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
> +
> +	req->driver_cb = driver_cb;
> +	req->task_work.func = uring_cmd_work;
> +	ret = io_req_task_work_add(req);
> +	if (unlikely(ret)) {
> +		req->result = -ECANCELED;
> +		percpu_ref_get(&req->ctx->refs);
> +		io_req_task_work_add_fallback(req, io_req_task_cancel);
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL(uring_cmd_complete_in_task);

I think this should be have an io_ prefix:
io_uring_cmd_complete_in_task()

I'll let Jens comment if this is needed at all...

metze

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 0/3] Async nvme passthrough over io_uring
  2021-03-16 14:01 ` [RFC PATCH v3 0/3] Async nvme passthrough over io_uring Kanchan Joshi
                     ` (2 preceding siblings ...)
       [not found]   ` <CGME20210316140240epcas5p3e71bfe2afecd728c5af60056f21cc9b7@epcas5p3.samsung.com>
@ 2021-03-16 15:51   ` Jens Axboe
  2021-03-17  9:31     ` Kanchan Joshi
  3 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2021-03-16 15:51 UTC (permalink / raw)
  To: Kanchan Joshi, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

On 3/16/21 8:01 AM, Kanchan Joshi wrote:
> This series adds async passthrough capability for nvme block-dev over
> iouring_cmd. The patches are on top of Jens uring-cmd branch:
> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v3
> 
> Application is expected to allocate passthrough command structure, set
> it up traditionally, and pass its address via "block_uring_cmd->addr".
> On completion, CQE is posted with completion-status after any ioctl
> specific buffer/field update.

Do you have a test app? I'd be curious to try and add support for this
to t/io_uring from fio just to run some perf numbers.

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it
  2021-03-16 14:01     ` [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it Kanchan Joshi
@ 2021-03-16 17:16       ` Keith Busch
  2021-03-17  9:38         ` Kanchan Joshi
  0 siblings, 1 reply; 24+ messages in thread
From: Keith Busch @ 2021-03-16 17:16 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, hch, chaitanya.kulkarni, io-uring, linux-nvme, anuj20.g,
	javier.gonz, nj.shetty, selvakuma.s1

On Tue, Mar 16, 2021 at 07:31:25PM +0530, Kanchan Joshi wrote:
> nvme_req structure originally contained a pointer to nvme_command.
> Change nvme_req structure to keep the command itself.
> This helps in avoiding hot-path memory-allocation for async-passthrough.

I have a slightly different take on how to handle pre-allocated
passthrough commands. Every transport except PCI already preallocates a
'struct nvme_command' within the pdu, so allocating another one looks
redundant. Also, it does consume quite a bit of memory for something
that is used only for the passthrough case.

I think we can solve both concerns by always using the PDU nvme_command
rather than have the transport drivers provide it. I just sent the patch
here if you can take a look. It tested fine on PCI and loop (haven't
tested any other transports).

 http://lists.infradead.org/pipermail/linux-nvme/2021-March/023711.html

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-16 14:01     ` [RFC PATCH v3 3/3] nvme: wire up support for async passthrough Kanchan Joshi
@ 2021-03-17  8:52       ` Christoph Hellwig
  2021-03-17 16:49         ` Jens Axboe
  2021-03-18  5:54         ` Kanchan Joshi
  2021-03-17 16:45       ` Keith Busch
  1 sibling, 2 replies; 24+ messages in thread
From: Christoph Hellwig @ 2021-03-17  8:52 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, hch, kbusch, chaitanya.kulkarni, io-uring, linux-nvme,
	anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

> +/*
> + * This is carved within the block_uring_cmd, to avoid dynamic allocation.
> + * Care should be taken not to grow this beyond what is available.
> + */
> +struct uring_cmd_data {
> +	union {
> +		struct bio *bio;
> +		u64 result; /* nvme cmd result */
> +	};
> +	void *meta; /* kernel-resident buffer */
> +	int status; /* nvme cmd status */
> +};
> +
> +inline u64 *ucmd_data_addr(struct io_uring_cmd *ioucmd)
> +{
> +	return &(((struct block_uring_cmd *)&ioucmd->pdu)->unused[0]);
> +}

The whole typing is a mess, but this mostly goes back to the series
you're basing this on.  Jens, can you send out the series so that
we can do a proper review?

IMHO struct io_uring_cmd needs to stay private in io-uring.c, and
the method needs to get the file and the untyped payload in form
of a void * separately.  and block_uring_cmd should be private to
the example ioctl, not exposed to drivers implementing their own
schemes.

> +void ioucmd_task_cb(struct io_uring_cmd *ioucmd)

This should be mark static and have a much more descriptive name
including a nvme_ prefix.

> +	/* handle meta update */
> +	if (ucd->meta) {
> +		void __user *umeta = nvme_to_user_ptr(ptcmd->metadata);
> +
> +		if (!ucd->status)
> +			if (copy_to_user(umeta, ucd->meta, ptcmd->metadata_len))
> +				ucd->status = -EFAULT;
> +		kfree(ucd->meta);
> +	}
> +	/* handle result update */
> +	if (put_user(ucd->result, (u32 __user *)&ptcmd->result))

The comments aren't very useful, and the cast here is a warning sign.
Why do you need it?

> +		ucd->status = -EFAULT;
> +	io_uring_cmd_done(ioucmd, ucd->status);

Shouldn't the io-uring core take care of this io_uring_cmd_done
call?

> +void nvme_end_async_pt(struct request *req, blk_status_t err)

static?

> +{
> +	struct io_uring_cmd *ioucmd;
> +	struct uring_cmd_data *ucd;
> +	struct bio *bio;
> +	int ret;
> +
> +	ioucmd = req->end_io_data;
> +	ucd = (struct uring_cmd_data *) ucmd_data_addr(ioucmd);
> +	/* extract bio before reusing the same field for status */
> +	bio = ucd->bio;
> +
> +	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> +		ucd->status = -EINTR;
> +	else
> +		ucd->status = nvme_req(req)->status;
> +	ucd->result = le64_to_cpu(nvme_req(req)->result.u64);
> +
> +	/* this takes care of setting up task-work */
> +	ret = uring_cmd_complete_in_task(ioucmd, ioucmd_task_cb);
> +	if (ret < 0)
> +		kfree(ucd->meta);
> +
> +	/* unmap pages, free bio, nvme command and request */
> +	blk_rq_unmap_user(bio);
> +	blk_mq_free_request(req);

How can we free the request here if the data is only copied out in
a task_work?

>  static int nvme_submit_user_cmd(struct request_queue *q,
>  		struct nvme_command *cmd, void __user *ubuffer,
>  		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
> -		u32 meta_seed, u64 *result, unsigned timeout)
> +		u32 meta_seed, u64 *result, unsigned int timeout,
> +		struct io_uring_cmd *ioucmd)
>  {
>  	bool write = nvme_is_write(cmd);
>  	struct nvme_ns *ns = q->queuedata;
> @@ -1179,6 +1278,20 @@ static int nvme_submit_user_cmd(struct request_queue *q,
>  			req->cmd_flags |= REQ_INTEGRITY;
>  		}
>  	}
> +	if (ioucmd) { /* async handling */

nvme_submit_user_cmd already is a mess.  Please split this out into
a separate function.  Maybe the logic to map the user buffers can be
split into a little shared helper.

> +int nvme_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd,
> +		enum io_uring_cmd_flags flags)

Another comment on the original infrastructure:  this really needs to
be a block_device_operations method taking a struct block_device instead
of being tied into blk-mq.

> +EXPORT_SYMBOL_GPL(nvme_uring_cmd);

I don't think this shoud be exported.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 0/3] Async nvme passthrough over io_uring
  2021-03-16 15:51   ` [RFC PATCH v3 0/3] Async nvme passthrough over io_uring Jens Axboe
@ 2021-03-17  9:31     ` Kanchan Joshi
  2021-03-18  1:58       ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-17  9:31 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch,
	Chaitanya Kulkarni, io-uring, linux-nvme, anuj20.g,
	Javier Gonzalez, Nitesh Shetty, Selvakumar S

On Tue, Mar 16, 2021 at 9:31 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 3/16/21 8:01 AM, Kanchan Joshi wrote:
> > This series adds async passthrough capability for nvme block-dev over
> > iouring_cmd. The patches are on top of Jens uring-cmd branch:
> > https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v3
> >
> > Application is expected to allocate passthrough command structure, set
> > it up traditionally, and pass its address via "block_uring_cmd->addr".
> > On completion, CQE is posted with completion-status after any ioctl
> > specific buffer/field update.
>
> Do you have a test app? I'd be curious to try and add support for this
> to t/io_uring from fio just to run some perf numbers.

Yes Jens. Need to do a couple of things to make it public, will post it today.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it
  2021-03-16 17:16       ` Keith Busch
@ 2021-03-17  9:38         ` Kanchan Joshi
  2021-03-17 14:17           ` Keith Busch
  0 siblings, 1 reply; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-17  9:38 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Jens Axboe, Christoph Hellwig, Chaitanya Kulkarni,
	io-uring, linux-nvme, anuj20.g, Javier Gonzalez, Nitesh Shetty,
	Selvakumar S

On Tue, Mar 16, 2021 at 10:53 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Tue, Mar 16, 2021 at 07:31:25PM +0530, Kanchan Joshi wrote:
> > nvme_req structure originally contained a pointer to nvme_command.
> > Change nvme_req structure to keep the command itself.
> > This helps in avoiding hot-path memory-allocation for async-passthrough.
>
> I have a slightly different take on how to handle pre-allocated
> passthrough commands. Every transport except PCI already preallocates a
> 'struct nvme_command' within the pdu, so allocating another one looks
> redundant. Also, it does consume quite a bit of memory for something
> that is used only for the passthrough case.
>
> I think we can solve both concerns by always using the PDU nvme_command
> rather than have the transport drivers provide it. I just sent the patch
> here if you can take a look. It tested fine on PCI and loop (haven't
> tested any other transports).
>
>  http://lists.infradead.org/pipermail/linux-nvme/2021-March/023711.html

Sounds fine, thanks for the patch, looking at it.
Which kernel you used for these. 'Patch 2' doesn't  apply cleanly.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it
  2021-03-17  9:38         ` Kanchan Joshi
@ 2021-03-17 14:17           ` Keith Busch
  0 siblings, 0 replies; 24+ messages in thread
From: Keith Busch @ 2021-03-17 14:17 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Kanchan Joshi, Jens Axboe, Christoph Hellwig, Chaitanya Kulkarni,
	io-uring, linux-nvme, anuj20.g, Javier Gonzalez, Nitesh Shetty,
	Selvakumar S

On Wed, Mar 17, 2021 at 03:08:09PM +0530, Kanchan Joshi wrote:
> On Tue, Mar 16, 2021 at 10:53 PM Keith Busch <kbusch@kernel.org> wrote:
> >
> > On Tue, Mar 16, 2021 at 07:31:25PM +0530, Kanchan Joshi wrote:
> > > nvme_req structure originally contained a pointer to nvme_command.
> > > Change nvme_req structure to keep the command itself.
> > > This helps in avoiding hot-path memory-allocation for async-passthrough.
> >
> > I have a slightly different take on how to handle pre-allocated
> > passthrough commands. Every transport except PCI already preallocates a
> > 'struct nvme_command' within the pdu, so allocating another one looks
> > redundant. Also, it does consume quite a bit of memory for something
> > that is used only for the passthrough case.
> >
> > I think we can solve both concerns by always using the PDU nvme_command
> > rather than have the transport drivers provide it. I just sent the patch
> > here if you can take a look. It tested fine on PCI and loop (haven't
> > tested any other transports).
> >
> >  http://lists.infradead.org/pipermail/linux-nvme/2021-March/023711.html
> 
> Sounds fine, thanks for the patch, looking at it.
> Which kernel you used for these. 'Patch 2' doesn't  apply cleanly.

I used nvme-5.13 as my starting point.

 http://git.infradead.org/nvme.git/shortlog/refs/heads/nvme-5.13

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-16 14:01     ` [RFC PATCH v3 3/3] nvme: wire up support for async passthrough Kanchan Joshi
  2021-03-17  8:52       ` Christoph Hellwig
@ 2021-03-17 16:45       ` Keith Busch
  2021-03-17 17:02         ` Christoph Hellwig
  1 sibling, 1 reply; 24+ messages in thread
From: Keith Busch @ 2021-03-17 16:45 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, hch, chaitanya.kulkarni, io-uring, linux-nvme, anuj20.g,
	javier.gonz, nj.shetty, selvakuma.s1

On Tue, Mar 16, 2021 at 07:31:26PM +0530, Kanchan Joshi wrote:
> @@ -1179,6 +1278,20 @@ static int nvme_submit_user_cmd(struct request_queue *q,
>  			req->cmd_flags |= REQ_INTEGRITY;
>  		}
>  	}
> +	if (ioucmd) { /* async handling */
> +		u32 effects;
> +
> +		effects = nvme_command_effects(ns->ctrl, ns, cmd->common.opcode);
> +		/* filter commands with non-zero effects, keep it simple for now*/

You shouldn't need to be concerned with this. You've wired up the ioucmd
only to the NVME_IOCTL_IO_CMD, and nvme_command_effects() can only
return 0 for that.

It would be worth adding support for NVME_IOCTL_IO_CMD64 too, though,
and that doesn't change the effects handling either.

> +		if (effects) {
> +			ret = -EOPNOTSUPP;
> +			goto out_unmap;
> +		}
> +		nvme_setup_uring_cmd_data(req, ioucmd, meta, write);
> +		blk_execute_rq_nowait(ns ? ns->disk : NULL, req, 0,
> +					nvme_end_async_pt);
> +		return 0;
> +	}

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17  8:52       ` Christoph Hellwig
@ 2021-03-17 16:49         ` Jens Axboe
  2021-03-17 16:59           ` Christoph Hellwig
  2021-03-18  5:54         ` Kanchan Joshi
  1 sibling, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2021-03-17 16:49 UTC (permalink / raw)
  To: Christoph Hellwig, Kanchan Joshi
  Cc: kbusch, chaitanya.kulkarni, io-uring, linux-nvme, anuj20.g,
	javier.gonz, nj.shetty, selvakuma.s1

On 3/17/21 2:52 AM, Christoph Hellwig wrote:
>> +/*
>> + * This is carved within the block_uring_cmd, to avoid dynamic allocation.
>> + * Care should be taken not to grow this beyond what is available.
>> + */
>> +struct uring_cmd_data {
>> +	union {
>> +		struct bio *bio;
>> +		u64 result; /* nvme cmd result */
>> +	};
>> +	void *meta; /* kernel-resident buffer */
>> +	int status; /* nvme cmd status */
>> +};
>> +
>> +inline u64 *ucmd_data_addr(struct io_uring_cmd *ioucmd)
>> +{
>> +	return &(((struct block_uring_cmd *)&ioucmd->pdu)->unused[0]);
>> +}
> 
> The whole typing is a mess, but this mostly goes back to the series
> you're basing this on.  Jens, can you send out the series so that
> we can do a proper review?

I will post it soon, only reason I haven't reposted is that I'm not that
happy with how the sqe split is done (and that it's done in the first
place). But I'll probably just post the current version for comments,
and hopefully we can get it to where it needs to be soon.

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17 16:49         ` Jens Axboe
@ 2021-03-17 16:59           ` Christoph Hellwig
  2021-03-17 17:21             ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2021-03-17 16:59 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Kanchan Joshi, kbusch, chaitanya.kulkarni,
	io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty,
	selvakuma.s1

On Wed, Mar 17, 2021 at 10:49:28AM -0600, Jens Axboe wrote:
> I will post it soon, only reason I haven't reposted is that I'm not that
> happy with how the sqe split is done (and that it's done in the first
> place). But I'll probably just post the current version for comments,
> and hopefully we can get it to where it needs to be soon.

Yes, I don't like that at all either.  I almost wonder if we should
use an entirely different format after opcode and flags, although
I suspect fd would be nice to have in the same spot as well.

On a related note: I think it really should have a generic cmd
dispatching mechanism like ioctls have, preferably even enforcing
the _IO* mechanism.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17 16:45       ` Keith Busch
@ 2021-03-17 17:02         ` Christoph Hellwig
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2021-03-17 17:02 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, axboe, hch, chaitanya.kulkarni, io-uring,
	linux-nvme, anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

On Wed, Mar 17, 2021 at 09:45:50AM -0700, Keith Busch wrote:
> > +	if (ioucmd) { /* async handling */
> > +		u32 effects;
> > +
> > +		effects = nvme_command_effects(ns->ctrl, ns, cmd->common.opcode);
> > +		/* filter commands with non-zero effects, keep it simple for now*/
> 
> You shouldn't need to be concerned with this. You've wired up the ioucmd
> only to the NVME_IOCTL_IO_CMD, and nvme_command_effects() can only
> return 0 for that.
> 
> It would be worth adding support for NVME_IOCTL_IO_CMD64 too, though,
> and that doesn't change the effects handling either.

There is no good reason to support the non-64 structure in new code.
And we really should support the admin command submission (on the char
device node) as well.

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17 16:59           ` Christoph Hellwig
@ 2021-03-17 17:21             ` Jens Axboe
  2021-03-17 18:59               ` Jens Axboe
  0 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2021-03-17 17:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, kbusch, chaitanya.kulkarni, io-uring, linux-nvme,
	anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

On 3/17/21 10:59 AM, Christoph Hellwig wrote:
> On Wed, Mar 17, 2021 at 10:49:28AM -0600, Jens Axboe wrote:
>> I will post it soon, only reason I haven't reposted is that I'm not that
>> happy with how the sqe split is done (and that it's done in the first
>> place). But I'll probably just post the current version for comments,
>> and hopefully we can get it to where it needs to be soon.
> 
> Yes, I don't like that at all either.  I almost wonder if we should
> use an entirely different format after opcode and flags, although
> I suspect fd would be nice to have in the same spot as well.

Exactly - trying to think of how best to do this. It's somewhat a shame
that I didn't place user_data right after fd, or even at the end of the
struct. But oh well.

One idea would be to have io_uring_sqe_hdr and have that be
op/flags/prio/fd as we should have those for anything, and just embed
that at the top of both io_uring_sqe (our general command), and
io_uring_whatever which is what the passthrough stuff would use.

Not sure, I need to dabble in the code a bit and see how we can make it
the cleanest.

> On a related note: I think it really should have a generic cmd
> dispatching mechanism like ioctls have, preferably even enforcing
> the _IO* mechanism.

Yes, we could certainly do that. I don't want to turn it into a
free-for-all and the wild west of passthrough, some notion of coherent
definitions would be prudent.

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17 17:21             ` Jens Axboe
@ 2021-03-17 18:59               ` Jens Axboe
  0 siblings, 0 replies; 24+ messages in thread
From: Jens Axboe @ 2021-03-17 18:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, kbusch, chaitanya.kulkarni, io-uring, linux-nvme,
	anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

On 3/17/21 11:21 AM, Jens Axboe wrote:
> On 3/17/21 10:59 AM, Christoph Hellwig wrote:
>> On Wed, Mar 17, 2021 at 10:49:28AM -0600, Jens Axboe wrote:
>>> I will post it soon, only reason I haven't reposted is that I'm not that
>>> happy with how the sqe split is done (and that it's done in the first
>>> place). But I'll probably just post the current version for comments,
>>> and hopefully we can get it to where it needs to be soon.
>>
>> Yes, I don't like that at all either.  I almost wonder if we should
>> use an entirely different format after opcode and flags, although
>> I suspect fd would be nice to have in the same spot as well.
> 
> Exactly - trying to think of how best to do this. It's somewhat a shame
> that I didn't place user_data right after fd, or even at the end of the
> struct. But oh well.
> 
> One idea would be to have io_uring_sqe_hdr and have that be
> op/flags/prio/fd as we should have those for anything, and just embed
> that at the top of both io_uring_sqe (our general command), and
> io_uring_whatever which is what the passthrough stuff would use.
> 
> Not sure, I need to dabble in the code a bit and see how we can make it
> the cleanest.

How about something like the top two patches here to kick it off, then
build on top?

https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v4

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
  2021-03-16 14:01     ` [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task Kanchan Joshi
  2021-03-16 15:43       ` Stefan Metzmacher
@ 2021-03-18  1:57       ` Jens Axboe
  2021-03-18  5:25         ` Kanchan Joshi
  1 sibling, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2021-03-18  1:57 UTC (permalink / raw)
  To: Kanchan Joshi, hch, kbusch, chaitanya.kulkarni
  Cc: io-uring, linux-nvme, anuj20.g, javier.gonz, nj.shetty, selvakuma.s1

On 3/16/21 8:01 AM, Kanchan Joshi wrote:
> Completion of a uring_cmd ioctl may involve referencing certain
> ioctl-specific fields, requiring original subitter context.
> Introduce 'uring_cmd_complete_in_task' that driver can use for this
> purpose. The API facilitates task-work infra, while driver gets to
> implement cmd-specific handling in a callback.
> 
> Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> ---
>  fs/io_uring.c            | 36 ++++++++++++++++++++++++++++++++----
>  include/linux/io_uring.h |  8 ++++++++
>  2 files changed, 40 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 583f8fd735d8..ca459ea9cb83 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -772,9 +772,12 @@ struct io_kiocb {
>  		/* use only after cleaning per-op data, see io_clean_op() */
>  		struct io_completion	compl;
>  	};
> -
> -	/* opcode allocated if it needs to store data for async defer */
> -	void				*async_data;
> +	union {
> +		/* opcode allocated if it needs to store data for async defer */
> +		void				*async_data;
> +		/* used for uring-cmd, when driver needs to update in task */
> +		void (*driver_cb)(struct io_uring_cmd *cmd);
> +	};

I don't like this at all, it's very possible that we'd need async
data for passthrough commands as well in certain cases. And what it
gets to that point, we'll have no other recourse than to un-unionize
this and pay the cost. It also means we end up with:

> @@ -1716,7 +1719,7 @@ static void io_dismantle_req(struct io_kiocb *req)
>  {
>  	io_clean_op(req);
>  
> -	if (req->async_data)
> +	if (io_op_defs[req->opcode].async_size && req->async_data)
>  		kfree(req->async_data);
>  	if (req->file)
>  		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));

which are also very fragile.

We already have the task work, just have the driver init and/or call a
helper to get it run from task context with the callback it desires?

If you look at this:

> @@ -2032,6 +2035,31 @@ static void io_req_task_submit(struct callback_head *cb)
>  	__io_req_task_submit(req);
>  }
>  
> +static void uring_cmd_work(struct callback_head *cb)
> +{
> +	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> +	struct io_uring_cmd *cmd = &req->uring_cmd;
> +
> +	req->driver_cb(cmd);
> +}
> +int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> +			void (*driver_cb)(struct io_uring_cmd *))
> +{
> +	int ret;
> +	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
> +
> +	req->driver_cb = driver_cb;
> +	req->task_work.func = uring_cmd_work;
> +	ret = io_req_task_work_add(req);
> +	if (unlikely(ret)) {
> +		req->result = -ECANCELED;
> +		percpu_ref_get(&req->ctx->refs);
> +		io_req_task_work_add_fallback(req, io_req_task_cancel);
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL(uring_cmd_complete_in_task);

Then you're basically jumping through hoops to get that callback.
Why don't you just have:

io_uring_schedule_task(struct io_uring_cmd *cmd, task_work_func_t cb)
{
	struct io_kiocb *req = container_of(cmd, struct io_kiocb, uring_cmd);
	int ret;

	req->task_work.func = cb;
	ret = io_req_task_work_add(req);
	if (unlikely(ret)) {
		req->result = -ECANCELED;
		io_req_task_work_add_fallback(req, io_req_task_cancel);
	}
	return ret;
}

?

Also, please export any symbol with _GPL. I don't want non-GPL drivers
using this infrastructure.

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 0/3] Async nvme passthrough over io_uring
  2021-03-17  9:31     ` Kanchan Joshi
@ 2021-03-18  1:58       ` Jens Axboe
  2021-03-18  7:47         ` Kanchan Joshi
  0 siblings, 1 reply; 24+ messages in thread
From: Jens Axboe @ 2021-03-18  1:58 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch,
	Chaitanya Kulkarni, io-uring, linux-nvme, anuj20.g,
	Javier Gonzalez, Nitesh Shetty, Selvakumar S

On 3/17/21 3:31 AM, Kanchan Joshi wrote:
> On Tue, Mar 16, 2021 at 9:31 PM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 3/16/21 8:01 AM, Kanchan Joshi wrote:
>>> This series adds async passthrough capability for nvme block-dev over
>>> iouring_cmd. The patches are on top of Jens uring-cmd branch:
>>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v3
>>>
>>> Application is expected to allocate passthrough command structure, set
>>> it up traditionally, and pass its address via "block_uring_cmd->addr".
>>> On completion, CQE is posted with completion-status after any ioctl
>>> specific buffer/field update.
>>
>> Do you have a test app? I'd be curious to try and add support for this
>> to t/io_uring from fio just to run some perf numbers.
> 
> Yes Jens. Need to do a couple of things to make it public, will post it today.

Sounds good! I commented on 1/3, I think it can be simplified and
cleaned up quite a bit, which is great. Then let's base it on top of v4
that I posted, let me know if you run into any issues with that.

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
  2021-03-18  1:57       ` Jens Axboe
@ 2021-03-18  5:25         ` Kanchan Joshi
  2021-03-18  5:48           ` Christoph Hellwig
  0 siblings, 1 reply; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-18  5:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch,
	Chaitanya Kulkarni, io-uring, linux-nvme, anuj20.g,
	Javier Gonzalez, Nitesh Shetty, Selvakumar S

On Thu, Mar 18, 2021 at 7:31 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 3/16/21 8:01 AM, Kanchan Joshi wrote:
> > Completion of a uring_cmd ioctl may involve referencing certain
> > ioctl-specific fields, requiring original subitter context.
> > Introduce 'uring_cmd_complete_in_task' that driver can use for this
> > purpose. The API facilitates task-work infra, while driver gets to
> > implement cmd-specific handling in a callback.
> >
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> > ---
> >  fs/io_uring.c            | 36 ++++++++++++++++++++++++++++++++----
> >  include/linux/io_uring.h |  8 ++++++++
> >  2 files changed, 40 insertions(+), 4 deletions(-)
> >
> > diff --git a/fs/io_uring.c b/fs/io_uring.c
> > index 583f8fd735d8..ca459ea9cb83 100644
> > --- a/fs/io_uring.c
> > +++ b/fs/io_uring.c
> > @@ -772,9 +772,12 @@ struct io_kiocb {
> >               /* use only after cleaning per-op data, see io_clean_op() */
> >               struct io_completion    compl;
> >       };
> > -
> > -     /* opcode allocated if it needs to store data for async defer */
> > -     void                            *async_data;
> > +     union {
> > +             /* opcode allocated if it needs to store data for async defer */
> > +             void                            *async_data;
> > +             /* used for uring-cmd, when driver needs to update in task */
> > +             void (*driver_cb)(struct io_uring_cmd *cmd);
> > +     };
>
> I don't like this at all, it's very possible that we'd need async
> data for passthrough commands as well in certain cases. And what it
> gets to that point, we'll have no other recourse than to un-unionize
> this and pay the cost. It also means we end up with:
>
> > @@ -1716,7 +1719,7 @@ static void io_dismantle_req(struct io_kiocb *req)
> >  {
> >       io_clean_op(req);
> >
> > -     if (req->async_data)
> > +     if (io_op_defs[req->opcode].async_size && req->async_data)
> >               kfree(req->async_data);
> >       if (req->file)
> >               io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
>
> which are also very fragile.

I did not want to have it this way....but faced troubles with the more
natural way of doing this. Please see below.

> We already have the task work, just have the driver init and/or call a
> helper to get it run from task context with the callback it desires?
>
> If you look at this:
>
> > @@ -2032,6 +2035,31 @@ static void io_req_task_submit(struct callback_head *cb)
> >       __io_req_task_submit(req);
> >  }
> >
> > +static void uring_cmd_work(struct callback_head *cb)
> > +{
> > +     struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> > +     struct io_uring_cmd *cmd = &req->uring_cmd;
> > +
> > +     req->driver_cb(cmd);
> > +}
> > +int uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> > +                     void (*driver_cb)(struct io_uring_cmd *))
> > +{
> > +     int ret;
> > +     struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
> > +
> > +     req->driver_cb = driver_cb;
> > +     req->task_work.func = uring_cmd_work;
> > +     ret = io_req_task_work_add(req);
> > +     if (unlikely(ret)) {
> > +             req->result = -ECANCELED;
> > +             percpu_ref_get(&req->ctx->refs);
> > +             io_req_task_work_add_fallback(req, io_req_task_cancel);
> > +     }
> > +     return ret;
> > +}
> > +EXPORT_SYMBOL(uring_cmd_complete_in_task);
>
> Then you're basically jumping through hoops to get that callback.
> Why don't you just have:
>
> io_uring_schedule_task(struct io_uring_cmd *cmd, task_work_func_t cb)
> {
>         struct io_kiocb *req = container_of(cmd, struct io_kiocb, uring_cmd);
>         int ret;
>
>         req->task_work.func = cb;
>         ret = io_req_task_work_add(req);
>         if (unlikely(ret)) {
>                 req->result = -ECANCELED;
>                 io_req_task_work_add_fallback(req, io_req_task_cancel);
>         }
>         return ret;
> }
>
> ?
I started with that, but the problem was implementing the driver callback .
The callbacks receive only one argument which is "struct callback_head
*", and the driver needs to extract "io_uring_cmd *" out of it.
This part -
+static void uring_cmd_work(struct callback_head *cb)
+{
+     struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+     struct io_uring_cmd *cmd = &req->uring_cmd;

If the callback has to move to the driver (nvme), the driver needs
visibility to "struct io_kiocb" which is uring-local.
Do you see a better way to handle this?
I also thought about keeping the driver_cb inside the unused part of
uring_cmd (instead of union with req->async_data), but it had two
problems - 1. uring needs to peek inside driver-part of uring_cmd to
invoke this callback
2. losing precious space  (I am using that space to avoid per-command
dynamic-allocation in driver)

> Also, please export any symbol with _GPL. I don't want non-GPL drivers
> using this infrastructure.

Got it.


-- 
Kanchan

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
  2021-03-18  5:25         ` Kanchan Joshi
@ 2021-03-18  5:48           ` Christoph Hellwig
  2021-03-18  6:14             ` Kanchan Joshi
  0 siblings, 1 reply; 24+ messages in thread
From: Christoph Hellwig @ 2021-03-18  5:48 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Jens Axboe, Kanchan Joshi, Christoph Hellwig, Keith Busch,
	Chaitanya Kulkarni, io-uring, linux-nvme, anuj20.g,
	Javier Gonzalez, Nitesh Shetty, Selvakumar S

On Thu, Mar 18, 2021 at 10:55:55AM +0530, Kanchan Joshi wrote:
> I started with that, but the problem was implementing the driver callback .
> The callbacks receive only one argument which is "struct callback_head
> *", and the driver needs to extract "io_uring_cmd *" out of it.
> This part -
> +static void uring_cmd_work(struct callback_head *cb)
> +{
> +     struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> +     struct io_uring_cmd *cmd = &req->uring_cmd;
> 
> If the callback has to move to the driver (nvme), the driver needs
> visibility to "struct io_kiocb" which is uring-local.
> Do you see a better way to handle this?

Can't you just add a helper in io_uring.c that does something like this:

struct io_uring_cmd *callback_to_io_uring_cmd(struct callback_head *cb)
{
	return &container_of(cb, struct io_kiocb, task_work)->uring_cmd;
}
EXPORT_SYMBOL_GPL(callback_to_io_uring_cmd);

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 3/3] nvme: wire up support for async passthrough
  2021-03-17  8:52       ` Christoph Hellwig
  2021-03-17 16:49         ` Jens Axboe
@ 2021-03-18  5:54         ` Kanchan Joshi
  1 sibling, 0 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-18  5:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, Jens Axboe, Keith Busch, Chaitanya Kulkarni,
	io-uring, linux-nvme, anuj20.g, Javier Gonzalez, Nitesh Shetty,
	Selvakumar S

On Wed, Mar 17, 2021 at 2:24 PM Christoph Hellwig <hch@lst.de> wrote:
>
> > +/*
> > + * This is carved within the block_uring_cmd, to avoid dynamic allocation.
> > + * Care should be taken not to grow this beyond what is available.
> > + */
> > +struct uring_cmd_data {
> > +     union {
> > +             struct bio *bio;
> > +             u64 result; /* nvme cmd result */
> > +     };
> > +     void *meta; /* kernel-resident buffer */
> > +     int status; /* nvme cmd status */
> > +};
> > +
> > +inline u64 *ucmd_data_addr(struct io_uring_cmd *ioucmd)
> > +{
> > +     return &(((struct block_uring_cmd *)&ioucmd->pdu)->unused[0]);
> > +}
>
> The whole typing is a mess, but this mostly goes back to the series
> you're basing this on.  Jens, can you send out the series so that
> we can do a proper review?
>
> IMHO struct io_uring_cmd needs to stay private in io-uring.c, and
> the method needs to get the file and the untyped payload in form
> of a void * separately.  and block_uring_cmd should be private to
> the example ioctl, not exposed to drivers implementing their own
> schemes.
>
> > +void ioucmd_task_cb(struct io_uring_cmd *ioucmd)
>
> This should be mark static and have a much more descriptive name
> including a nvme_ prefix.

Yes. Will change.

> > +     /* handle meta update */
> > +     if (ucd->meta) {
> > +             void __user *umeta = nvme_to_user_ptr(ptcmd->metadata);
> > +
> > +             if (!ucd->status)
> > +                     if (copy_to_user(umeta, ucd->meta, ptcmd->metadata_len))
> > +                             ucd->status = -EFAULT;
> > +             kfree(ucd->meta);
> > +     }
> > +     /* handle result update */
> > +     if (put_user(ucd->result, (u32 __user *)&ptcmd->result))
>
> The comments aren't very useful, and the cast here is a warning sign.
> Why do you need it?

Will do away with cast and comments.

> > +             ucd->status = -EFAULT;
> > +     io_uring_cmd_done(ioucmd, ucd->status);
>
> Shouldn't the io-uring core take care of this io_uring_cmd_done
> call?

At some point we (driver) need to tell the io_uring that command is
over, and return the status to it so that uring can update CQE.
This call "io_uring_cmd_done" does just that.

> > +void nvme_end_async_pt(struct request *req, blk_status_t err)
>
> static?

Indeed. Will change.

> > +{
> > +     struct io_uring_cmd *ioucmd;
> > +     struct uring_cmd_data *ucd;
> > +     struct bio *bio;
> > +     int ret;
> > +
> > +     ioucmd = req->end_io_data;
> > +     ucd = (struct uring_cmd_data *) ucmd_data_addr(ioucmd);
> > +     /* extract bio before reusing the same field for status */
> > +     bio = ucd->bio;
> > +
> > +     if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> > +             ucd->status = -EINTR;
> > +     else
> > +             ucd->status = nvme_req(req)->status;
> > +     ucd->result = le64_to_cpu(nvme_req(req)->result.u64);
> > +
> > +     /* this takes care of setting up task-work */
> > +     ret = uring_cmd_complete_in_task(ioucmd, ioucmd_task_cb);
> > +     if (ret < 0)
> > +             kfree(ucd->meta);
> > +
> > +     /* unmap pages, free bio, nvme command and request */
> > +     blk_rq_unmap_user(bio);
> > +     blk_mq_free_request(req);
>
> How can we free the request here if the data is only copied out in
> a task_work?

Things that we want to use in task_work (command status and result)
are alive in "ucd" (which is carved inside uring_cmd itself, and will
not be reclaimed until we tell io_uring that command is over).
The meta buffer is separate, and it is also alive via ucd->meta. It
will be freed only in task-work.
bio/request/pages cleanup do not have to wait till task-work.

> >  static int nvme_submit_user_cmd(struct request_queue *q,
> >               struct nvme_command *cmd, void __user *ubuffer,
> >               unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
> > -             u32 meta_seed, u64 *result, unsigned timeout)
> > +             u32 meta_seed, u64 *result, unsigned int timeout,
> > +             struct io_uring_cmd *ioucmd)
> >  {
> >       bool write = nvme_is_write(cmd);
> >       struct nvme_ns *ns = q->queuedata;
> > @@ -1179,6 +1278,20 @@ static int nvme_submit_user_cmd(struct request_queue *q,
> >                       req->cmd_flags |= REQ_INTEGRITY;
> >               }
> >       }
> > +     if (ioucmd) { /* async handling */
>
> nvme_submit_user_cmd already is a mess.  Please split this out into
> a separate function.  Maybe the logic to map the user buffers can be
> split into a little shared helper.

Ok. I will look at refactoring the way you mentioned.

> > +int nvme_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd,
> > +             enum io_uring_cmd_flags flags)
>
> Another comment on the original infrastructure:  this really needs to
> be a block_device_operations method taking a struct block_device instead
> of being tied into blk-mq.
>
> > +EXPORT_SYMBOL_GPL(nvme_uring_cmd);
>
> I don't think this shoud be exported.

It is needed to populate the callback in PCI transport. Not right?


Thanks for the detailed review.
-- 
Kanchan

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task
  2021-03-18  5:48           ` Christoph Hellwig
@ 2021-03-18  6:14             ` Kanchan Joshi
  0 siblings, 0 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-18  6:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Kanchan Joshi, Keith Busch, Chaitanya Kulkarni,
	io-uring, linux-nvme, anuj20.g, Javier Gonzalez, Nitesh Shetty,
	Selvakumar S

On Thu, Mar 18, 2021 at 11:18 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Thu, Mar 18, 2021 at 10:55:55AM +0530, Kanchan Joshi wrote:
> > I started with that, but the problem was implementing the driver callback .
> > The callbacks receive only one argument which is "struct callback_head
> > *", and the driver needs to extract "io_uring_cmd *" out of it.
> > This part -
> > +static void uring_cmd_work(struct callback_head *cb)
> > +{
> > +     struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
> > +     struct io_uring_cmd *cmd = &req->uring_cmd;
> >
> > If the callback has to move to the driver (nvme), the driver needs
> > visibility to "struct io_kiocb" which is uring-local.
> > Do you see a better way to handle this?
>
> Can't you just add a helper in io_uring.c that does something like this:
>
> struct io_uring_cmd *callback_to_io_uring_cmd(struct callback_head *cb)
> {
>         return &container_of(cb, struct io_kiocb, task_work)->uring_cmd;
> }
> EXPORT_SYMBOL_GPL(callback_to_io_uring_cmd);

That solves it, thanks.


--
Kanchan

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH v3 0/3] Async nvme passthrough over io_uring
  2021-03-18  1:58       ` Jens Axboe
@ 2021-03-18  7:47         ` Kanchan Joshi
  0 siblings, 0 replies; 24+ messages in thread
From: Kanchan Joshi @ 2021-03-18  7:47 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Kanchan Joshi, Christoph Hellwig, Keith Busch,
	Chaitanya Kulkarni, io-uring, linux-nvme, anuj20.g,
	Javier Gonzalez, Nitesh Shetty, Selvakumar S

On Thu, Mar 18, 2021 at 7:28 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 3/17/21 3:31 AM, Kanchan Joshi wrote:
> > On Tue, Mar 16, 2021 at 9:31 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>
> >> On 3/16/21 8:01 AM, Kanchan Joshi wrote:
> >>> This series adds async passthrough capability for nvme block-dev over
> >>> iouring_cmd. The patches are on top of Jens uring-cmd branch:
> >>> https://git.kernel.dk/cgit/linux-block/log/?h=io_uring-fops.v3
> >>>
> >>> Application is expected to allocate passthrough command structure, set
> >>> it up traditionally, and pass its address via "block_uring_cmd->addr".
> >>> On completion, CQE is posted with completion-status after any ioctl
> >>> specific buffer/field update.
> >>
> >> Do you have a test app? I'd be curious to try and add support for this
> >> to t/io_uring from fio just to run some perf numbers.
> >
> > Yes Jens. Need to do a couple of things to make it public, will post it today.

Please see if this is accessible to you -
https://github.com/joshkan/fio/commit/6c18653bc87015213a18c23d56518d4daf21b191

I run it on nvme device with the extra option "-uring_cmd=1". And pit
passthru read/write against regular uring read/write.
While write perf looks fine, I notice higher-qd reads going tad-bit
low which is puzzling.
But I need to test more to see if this is about my test-env (including
the added test-code) itself.

It will be great if you could, at some point in future, have a look at
this test or spin off what you already had in mind.

> Sounds good! I commented on 1/3, I think it can be simplified and
> cleaned up quite a bit, which is great. Then let's base it on top of v4
> that I posted, let me know if you run into any issues with that.

Yes, will move to V4,  thanks!

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2021-03-18  7:48 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CGME20210316140229epcas5p23d68a4c9694bbf7759b5901115a4309b@epcas5p2.samsung.com>
2021-03-16 14:01 ` [RFC PATCH v3 0/3] Async nvme passthrough over io_uring Kanchan Joshi
     [not found]   ` <CGME20210316140233epcas5p372405e7cb302c61dba5e1094fa796513@epcas5p3.samsung.com>
2021-03-16 14:01     ` [RFC PATCH v3 1/3] io_uring: add helper for uring_cmd completion in submitter-task Kanchan Joshi
2021-03-16 15:43       ` Stefan Metzmacher
2021-03-18  1:57       ` Jens Axboe
2021-03-18  5:25         ` Kanchan Joshi
2021-03-18  5:48           ` Christoph Hellwig
2021-03-18  6:14             ` Kanchan Joshi
     [not found]   ` <CGME20210316140236epcas5p4de087ee51a862402146fbbc621d4d4c6@epcas5p4.samsung.com>
2021-03-16 14:01     ` [RFC PATCH v3 2/3] nvme: keep nvme_command instead of pointer to it Kanchan Joshi
2021-03-16 17:16       ` Keith Busch
2021-03-17  9:38         ` Kanchan Joshi
2021-03-17 14:17           ` Keith Busch
     [not found]   ` <CGME20210316140240epcas5p3e71bfe2afecd728c5af60056f21cc9b7@epcas5p3.samsung.com>
2021-03-16 14:01     ` [RFC PATCH v3 3/3] nvme: wire up support for async passthrough Kanchan Joshi
2021-03-17  8:52       ` Christoph Hellwig
2021-03-17 16:49         ` Jens Axboe
2021-03-17 16:59           ` Christoph Hellwig
2021-03-17 17:21             ` Jens Axboe
2021-03-17 18:59               ` Jens Axboe
2021-03-18  5:54         ` Kanchan Joshi
2021-03-17 16:45       ` Keith Busch
2021-03-17 17:02         ` Christoph Hellwig
2021-03-16 15:51   ` [RFC PATCH v3 0/3] Async nvme passthrough over io_uring Jens Axboe
2021-03-17  9:31     ` Kanchan Joshi
2021-03-18  1:58       ` Jens Axboe
2021-03-18  7:47         ` Kanchan Joshi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).