* [PATCH v3 0/5] io_uring passthrough for nvme
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev, Kanchan Joshi

From: Kanchan Joshi <joshi.k@samsung.com>

NOTE: Sending on behalf of Kanchan as he is having some trouble sending
emails.

This iteration is against the io_uring-big-sqe branch (linux-block),
on top of a739b2354 ("io_uring: enable CQE32").

fio testing branch:
https://github.com/joshkan/fio/tree/big-cqe-pt.v3

Changes since v2:
- Rewire uring-cmd infrastructure on top of new big CQE
- Prep patch (refactored) and feedback from Christoph
- Add new opcode and structure in nvme for uring-cmd
- Enable vectored-io

Changes since v1:
https://lore.kernel.org/linux-nvme/20220401110310.611869-1-joshi.k@samsung.com/
- Trim down by removing patches for polling, fixed-buffer and bio-cache
- Add big CQE and move uring-cmd to use that
- Remove indirect (pointer) submission

v1:
https://lore.kernel.org/linux-nvme/20220308152105.309618-1-joshi.k@samsung.com/
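
For a feel for the final interface, here is a minimal, untested
userspace sketch (assumes liburing built with SQE128/CQE32 support from
the matching branch; NVME_URING_CMD_IO and struct nvme_uring_cmd come
from patch 4, and the device/LBA values are illustrative):

#include <liburing.h>
#include <linux/nvme_ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	struct io_uring_params p = {
		.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32,
	};
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	void *buf;
	int fd = open("/dev/ng0n1", O_RDWR);	/* per-ns char device */

	posix_memalign(&buf, 4096, 4096);
	io_uring_queue_init_params(8, &ring, &p);

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, 2 * sizeof(*sqe));	/* SQE128: SQEs are 128b */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;

	/* the command payload lives inside the big SQE, at sqe->cmd */
	cmd = (struct nvme_uring_cmd *)&sqe->cmd;
	cmd->opcode = 0x02;			/* NVMe read */
	cmd->nsid = 1;
	cmd->addr = (__u64)(uintptr_t)buf;
	cmd->data_len = 4096;
	cmd->cdw12 = 4096 / 512 - 1;		/* 0's based LBA count */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	/* status in cqe->res, extra NVMe result via the big CQE */
	printf("res %d result 0x%llx\n", cqe->res,
	       (unsigned long long)cqe->big_cqe[0]);
	io_uring_cqe_seen(&ring, cqe);
	return 0;
}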

Anuj Gupta (1):
  nvme: add vectored-io support for uring-cmd

Christoph Hellwig (1):
  nvme: refactor nvme_submit_user_cmd()

Jens Axboe (2):
  fs,io_uring: add infrastructure for uring-cmd
  block: wire-up support for passthrough plugging

Kanchan Joshi (1):
  nvme: wire-up uring-cmd support for io-passthru on char-device.

 block/blk-mq.c                  |  90 +++++++-------
 drivers/nvme/host/core.c        |   1 +
 drivers/nvme/host/ioctl.c       | 212 +++++++++++++++++++++++++++++++-
 drivers/nvme/host/multipath.c   |   1 +
 drivers/nvme/host/nvme.h        |   3 +
 fs/io_uring.c                   |  81 ++++++++++--
 include/linux/fs.h              |   2 +
 include/linux/io_uring.h        |  29 +++++
 include/uapi/linux/io_uring.h   |   8 +-
 include/uapi/linux/nvme_ioctl.h |  29 +++++
 10 files changed, 399 insertions(+), 57 deletions(-)

-- 
2.25.1


* [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev, Kanchan Joshi

From: Jens Axboe <axboe@kernel.dk>

file_operations->uring_cmd is a file private handler, similar to ioctls
but hopefully a lot more sane and useful.

IORING_OP_URING_CMD is a file private kind of request. io_uring doesn't
know what is in this command type, it's for the provider of ->uring_cmd()
to deal with. This operation can be issued only on a ring that is set
up with both the IORING_SETUP_SQE128 and IORING_SETUP_CQE32 flags.
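
To illustrate the provider-side contract, a hypothetical sketch follows
(the real nvme handler arrives in patch 4; all foo_* names are made up):

static void foo_uring_cmd_task_cb(struct io_uring_cmd *ioucmd)
{
	/* task context: safe to touch user memory, then post the CQE */
	io_uring_cmd_done(ioucmd, 0, 0);
}

static void foo_hw_irq_complete(struct foo_dev *dev)
{
	/* irq context: punt the final completion work to task context */
	io_uring_cmd_complete_in_task(dev->pending, foo_uring_cmd_task_cb);
}

static void foo_uring_cmd(struct io_uring_cmd *ioucmd)
{
	const struct foo_cmd *cmd = ioucmd->cmd;	/* big-SQE payload */

	switch (ioucmd->cmd_op) {
	case FOO_CMD_QUERY:
		/* synchronous: complete inline, res2 lands in the big CQE */
		io_uring_cmd_done(ioucmd, foo_query(cmd), 0);
		break;
	case FOO_CMD_IO:
		foo_queue_to_hw(cmd, ioucmd);	/* completes via irq above */
		break;
	default:
		io_uring_cmd_done(ioucmd, -ENOTTY, 0);
	}
}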

Co-developed-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 81 ++++++++++++++++++++++++++++++++---
 include/linux/fs.h            |  2 +
 include/linux/io_uring.h      | 29 +++++++++++++
 include/uapi/linux/io_uring.h |  8 +++-
 4 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c7e3f7e74d92..b774e6eac538 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -202,13 +202,6 @@ struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
-enum io_uring_cmd_flags {
-	IO_URING_F_COMPLETE_DEFER	= 1,
-	IO_URING_F_UNLOCKED		= 2,
-	/* int's last bit, sign checks are usually faster than a bit test */
-	IO_URING_F_NONBLOCK		= INT_MIN,
-};
-
 struct io_mapped_ubuf {
 	u64		ubuf;
 	u64		ubuf_end;
@@ -969,6 +962,7 @@ struct io_kiocb {
 		struct io_xattr		xattr;
 		struct io_socket	sock;
 		struct io_nop		nop;
+		struct io_uring_cmd	uring_cmd;
 	};
 
 	u8				opcode;
@@ -1254,6 +1248,10 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_SOCKET] = {
 		.audit_skip		= 1,
 	},
+	[IORING_OP_URING_CMD] = {
+		.needs_file		= 1,
+		.plug			= 1,
+	},
 };
 
 /* requests with any of those set should undergo io_disarm_next() */
@@ -1393,6 +1391,8 @@ const char *io_uring_get_opcode(u8 opcode)
 		return "GETXATTR";
 	case IORING_OP_SOCKET:
 		return "SOCKET";
+	case IORING_OP_URING_CMD:
+		return "URING_CMD";
 	case IORING_OP_LAST:
 		return "INVALID";
 	}
@@ -4907,6 +4907,66 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 	return 0;
 }
 
+static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
+{
+	req->uring_cmd.driver_cb(&req->uring_cmd);
+}
+
+void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *))
+{
+	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
+	req->uring_cmd.driver_cb = driver_cb;
+	req->io_task_work.func = io_uring_cmd_work;
+	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
+
+/*
+ * Called by consumers of io_uring_cmd, if they originally returned
+ * -EIOCBQUEUED upon receiving the command.
+ */
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
+{
+	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+
+	if (ret < 0)
+		req_set_fail(req);
+	__io_req_complete32(req, 0, ret, 0, res2, 0);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_done);
+
+static int io_uring_cmd_prep(struct io_kiocb *req,
+			     const struct io_uring_sqe *sqe)
+{
+	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+
+	if (req->ctx->flags & IORING_SETUP_IOPOLL)
+		return -EOPNOTSUPP;
+	/* do not support uring-cmd without big SQE/CQE */
+	if (!(req->ctx->flags & IORING_SETUP_SQE128))
+		return -EOPNOTSUPP;
+	if (!(req->ctx->flags & IORING_SETUP_CQE32))
+		return -EOPNOTSUPP;
+	ioucmd->cmd = (void *) &sqe->cmd;
+	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
+	ioucmd->flags = 0;
+	return 0;
+}
+
+static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct file *file = req->file;
+	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+
+	if (!req->file->f_op->uring_cmd)
+		return -EOPNOTSUPP;
+	ioucmd->flags |= issue_flags;
+	file->f_op->uring_cmd(ioucmd);
+	return 0;
+}
+
 static int io_shutdown_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
@@ -7764,6 +7824,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_getxattr_prep(req, sqe);
 	case IORING_OP_SOCKET:
 		return io_socket_prep(req, sqe);
+	case IORING_OP_URING_CMD:
+		return io_uring_cmd_prep(req, sqe);
 	}
 
 	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -8085,6 +8147,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	case IORING_OP_SOCKET:
 		ret = io_socket(req, issue_flags);
 		break;
+	case IORING_OP_URING_CMD:
+		ret = io_uring_cmd(req, issue_flags);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -12688,6 +12753,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
 
+	BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
+
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
 				SLAB_ACCOUNT);
 	return 0;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bbde95387a23..6b64c07efcf4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1953,6 +1953,7 @@ struct dir_context {
 #define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)
 
 struct iov_iter;
+struct io_uring_cmd;
 
 struct file_operations {
 	struct module *owner;
@@ -1995,6 +1996,7 @@ struct file_operations {
 				   struct file *file_out, loff_t pos_out,
 				   loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
+	void (*uring_cmd)(struct io_uring_cmd *ioucmd);
 } __randomize_layout;
 
 struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 24651c229ed2..a4ff4696cbea 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -5,7 +5,28 @@
 #include <linux/sched.h>
 #include <linux/xarray.h>
 
+enum io_uring_cmd_flags {
+	IO_URING_F_COMPLETE_DEFER	= 1,
+	IO_URING_F_UNLOCKED		= 2,
+	/* int's last bit, sign checks are usually faster than a bit test */
+	IO_URING_F_NONBLOCK		= INT_MIN,
+};
+
+struct io_uring_cmd {
+	struct file     *file;
+	void            *cmd;
+	/* for irq-completion - if driver requires doing stuff in task-context*/
+	void (*driver_cb)(struct io_uring_cmd *cmd);
+	u32             flags;
+	u32             cmd_op;
+	u32		unused;
+	u8		pdu[28]; /* available inline for free use */
+};
+
 #if defined(CONFIG_IO_URING)
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
+void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *));
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
@@ -30,6 +51,14 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 #else
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+		ssize_t ret2)
+{
+}
+static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
+			void (*driver_cb)(struct io_uring_cmd *))
+{
+}
 static inline struct sock *io_uring_get_socket(struct file *file)
 {
 	return NULL;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 881e508767f8..c081511119bf 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -22,10 +22,12 @@ struct io_uring_sqe {
 	union {
 		__u64	off;	/* offset into file */
 		__u64	addr2;
+		__u32	cmd_op;
 	};
 	union {
 		__u64	addr;	/* pointer to buffer or iovecs */
 		__u64	splice_off_in;
+		__u16	cmd_len;
 	};
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
@@ -61,7 +63,10 @@ struct io_uring_sqe {
 		__s32	splice_fd_in;
 		__u32	file_index;
 	};
-	__u64	addr3;
+	union {
+		__u64	addr3;
+		__u64	cmd;
+	};
 	__u64	__pad2[1];
 
 	/*
@@ -160,6 +165,7 @@ enum io_uring_op {
 	IORING_OP_FGETXATTR,
 	IORING_OP_GETXATTR,
 	IORING_OP_SOCKET,
+	IORING_OP_URING_CMD,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
-- 
2.25.1


* [PATCH v3 2/5] block: wire-up support for passthrough plugging
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev

From: Jens Axboe <axboe@kernel.dk>

Add support for plugging in the passthrough path. When plugging is
enabled, requests are added to a plug instead of being dispatched to the
driver immediately. When the plug is finished, the whole batch gets
dispatched via ->queue_rqs, which turns out to be more efficient than
the previous behaviour of dispatching via ->queue_rq, one request at a
time.
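
As an illustrative sketch only (in this series the plug comes from
io_uring's submission loop; reqs/done_cb are placeholders):

struct blk_plug plug;
int i;

blk_start_plug(&plug);
for (i = 0; i < nr; i++)
	/* queued on the plug instead of dispatched to the driver now */
	blk_execute_rq_nowait(reqs[i], false, done_cb);
blk_finish_plug(&plug);	/* whole batch dispatched via ->queue_rqs */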

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 90 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 43 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 84d749511f55..e432e7e8fe20 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2340,6 +2340,40 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
+/*
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * queues. This is important for md arrays to benefit from merging
+ * requests.
+ */
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
+{
+	if (plug->multiple_queues)
+		return BLK_MAX_REQUEST_COUNT * 2;
+	return BLK_MAX_REQUEST_COUNT;
+}
+
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+{
+	struct request *last = rq_list_peek(&plug->mq_list);
+
+	if (!plug->rq_count) {
+		trace_block_plug(rq->q);
+	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
+		   (!blk_queue_nomerges(rq->q) &&
+		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
+		blk_mq_flush_plug_list(plug, false);
+		trace_block_plug(rq->q);
+	}
+
+	if (!plug->multiple_queues && last && last->q != rq->q)
+		plug->multiple_queues = true;
+	if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+		plug->has_elevator = true;
+	rq->rq_next = NULL;
+	rq_list_add(&plug->mq_list, rq);
+	plug->rq_count++;
+}
+
 /**
  * blk_mq_request_bypass_insert - Insert a request at dispatch list.
  * @rq: Pointer to request to be inserted.
@@ -2353,16 +2387,20 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
 				  bool run_queue)
 {
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	struct blk_plug *plug = current->plug;
 
-	spin_lock(&hctx->lock);
-	if (at_head)
-		list_add(&rq->queuelist, &hctx->dispatch);
-	else
-		list_add_tail(&rq->queuelist, &hctx->dispatch);
-	spin_unlock(&hctx->lock);
-
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, false);
+	if (plug) {
+		blk_add_rq_to_plug(plug, rq);
+	} else {
+		spin_lock(&hctx->lock);
+		if (at_head)
+			list_add(&rq->queuelist, &hctx->dispatch);
+		else
+			list_add_tail(&rq->queuelist, &hctx->dispatch);
+		spin_unlock(&hctx->lock);
+		if (run_queue)
+			blk_mq_run_hw_queue(hctx, false);
+	}
 }
 
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
@@ -2676,40 +2714,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		hctx->queue->mq_ops->commit_rqs(hctx);
 }
 
-/*
- * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
- * queues. This is important for md arrays to benefit from merging
- * requests.
- */
-static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
-{
-	if (plug->multiple_queues)
-		return BLK_MAX_REQUEST_COUNT * 2;
-	return BLK_MAX_REQUEST_COUNT;
-}
-
-static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
-{
-	struct request *last = rq_list_peek(&plug->mq_list);
-
-	if (!plug->rq_count) {
-		trace_block_plug(rq->q);
-	} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
-		   (!blk_queue_nomerges(rq->q) &&
-		    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
-		blk_mq_flush_plug_list(plug, false);
-		trace_block_plug(rq->q);
-	}
-
-	if (!plug->multiple_queues && last && last->q != rq->q)
-		plug->multiple_queues = true;
-	if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
-		plug->has_elevator = true;
-	rq->rq_next = NULL;
-	rq_list_add(&plug->mq_list, rq);
-	plug->rq_count++;
-}
-
 static bool blk_mq_attempt_bio_merge(struct request_queue *q,
 				     struct bio *bio, unsigned int nr_segs)
 {
-- 
2.25.1


* [PATCH v3 3/5] nvme: refactor nvme_submit_user_cmd()
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev

From: Christoph Hellwig <hch@lst.de>

Divide the work into two helpers, namely nvme_alloc_user_request and
nvme_execute_user_rq. This is a prep patch that will help wire up
uring-cmd support in nvme.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/ioctl.c | 47 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 554566371ffa..3531de8073a6 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -19,6 +19,13 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 	return (void __user *)ptrval;
 }
 
+static inline void *nvme_meta_from_bio(struct bio *bio)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+
+	return bip ? bvec_virt(bip->bip_vec) : NULL;
+}
+
 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 		unsigned len, u32 seed, bool write)
 {
@@ -53,10 +60,10 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 	return ERR_PTR(ret);
 }
 
-static int nvme_submit_user_cmd(struct request_queue *q,
+static struct request *nvme_alloc_user_request(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, u64 *result, unsigned timeout, bool vec)
+		u32 meta_seed, unsigned timeout, bool vec)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -68,7 +75,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 
 	req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0);
 	if (IS_ERR(req))
-		return PTR_ERR(req);
+		return req;
 	nvme_init_request(req, cmd);
 
 	if (timeout)
@@ -108,7 +115,26 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		}
 	}
 
+	return req;
+
+out_unmap:
+	if (bio)
+		blk_rq_unmap_user(bio);
+out:
+	blk_mq_free_request(req);
+	return ERR_PTR(ret);
+}
+
+static int nvme_execute_user_rq(struct request *req, void __user *meta_buffer,
+		unsigned meta_len, u64 *result)
+{
+	struct bio *bio = req->bio;
+	bool write = bio_op(bio) == REQ_OP_DRV_OUT;
+	int ret;
+	void *meta = nvme_meta_from_bio(bio);
+
 	ret = nvme_execute_passthru_rq(req);
+
 	if (result)
 		*result = le64_to_cpu(nvme_req(req)->result.u64);
 	if (meta && !ret && !write) {
@@ -116,14 +142,25 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 			ret = -EFAULT;
 	}
 	kfree(meta);
- out_unmap:
 	if (bio)
 		blk_rq_unmap_user(bio);
- out:
 	blk_mq_free_request(req);
 	return ret;
 }
 
+static int nvme_submit_user_cmd(struct request_queue *q,
+		struct nvme_command *cmd, void __user *ubuffer,
+		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
+		u32 meta_seed, u64 *result, unsigned timeout, bool vec)
+{
+	struct request *req;
+
+	req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer,
+			meta_len, meta_seed, timeout, vec);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	return nvme_execute_user_rq(req, meta_buffer, meta_len, result);
+}
 
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
-- 
2.25.1


* [PATCH v3 4/5] nvme: wire-up uring-cmd support for io-passthru on char-device.
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev, Kanchan Joshi, Anuj Gupta

From: Kanchan Joshi <joshi.k@samsung.com>

Introduce a handler for fops->uring_cmd(), implementing async passthru
on the char device (/dev/ngX). The handler supports the newly introduced
operation NVME_URING_CMD_IO. This operates on a new structure,
nvme_uring_cmd, which is similar to struct nvme_passthru_cmd64 but
without the embedded 8b result field. That field is not needed since
uring-cmd allows returning an additional result to user space via the
big CQE.
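
On the completion side the result surfaces in the big CQE. A rough,
untested userspace sketch of reaping it (big_cqe[] per the CQE32
support this series builds on):

struct io_uring_cqe *cqe;

io_uring_wait_cqe(&ring, &cqe);
if (cqe->res < 0)
	fprintf(stderr, "passthru failed: %s\n", strerror(-cqe->res));
else
	/* the NVMe result that the dropped 8b field used to carry */
	printf("nvme result: 0x%llx\n",
	       (unsigned long long)cqe->big_cqe[0]);
io_uring_cqe_seen(&ring, cqe);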

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/core.c        |   1 +
 drivers/nvme/host/ioctl.c       | 166 +++++++++++++++++++++++++++++++-
 drivers/nvme/host/multipath.c   |   1 +
 drivers/nvme/host/nvme.h        |   3 +
 include/uapi/linux/nvme_ioctl.h |  25 +++++
 5 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e1846d04817f..682df98db341 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3699,6 +3699,7 @@ static const struct file_operations nvme_ns_chr_fops = {
 	.release	= nvme_ns_chr_release,
 	.unlocked_ioctl	= nvme_ns_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.uring_cmd	= nvme_ns_chr_uring_cmd,
 };
 
 static int nvme_add_ns_cdev(struct nvme_ns *ns)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 3531de8073a6..e428d692375a 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -26,6 +26,80 @@ static inline void *nvme_meta_from_bio(struct bio *bio)
 	return bip ? bvec_virt(bip->bip_vec) : NULL;
 }
 
+/*
+ * This overlays struct io_uring_cmd pdu.
+ * Expect build errors if this grows larger than that.
+ */
+struct nvme_uring_cmd_pdu {
+	union {
+		struct bio *bio;
+		struct request *req;
+	};
+	void *meta; /* kernel-resident buffer */
+	void __user *meta_buffer;
+	u32 meta_len;
+} __packed;
+
+static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	struct request *req = pdu->req;
+	struct bio *bio = req->bio;
+	bool write = req_op(req) == REQ_OP_DRV_OUT;
+	int status;
+	u64 result;
+
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		status = -EINTR;
+	else
+		status = nvme_req(req)->status;
+
+	result = le64_to_cpu(nvme_req(req)->result.u64);
+	blk_mq_free_request(req);
+	if (bio)
+		blk_rq_unmap_user(bio);
+
+	if (pdu->meta && !status && !write) {
+		if (copy_to_user(pdu->meta_buffer, pdu->meta, pdu->meta_len))
+			status = -EFAULT;
+	}
+	kfree(pdu->meta);
+	io_uring_cmd_done(ioucmd, status, result);
+}
+
+static void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	/* extract bio before reusing the same field for request */
+	struct bio *bio = pdu->bio;
+
+	pdu->req = req;
+	req->bio = bio;
+	/* this takes care of moving rest of completion-work to task context */
+	io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
+}
+
+static void nvme_setup_uring_cmd_data(struct request *rq,
+		struct io_uring_cmd *ioucmd, void __user *meta_buffer,
+		u32 meta_len)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* to free bio on completion, as req->bio will be null at that time */
+	pdu->bio = rq->bio;
+	pdu->meta = nvme_meta_from_bio(rq->bio);
+	pdu->meta_buffer = meta_buffer;
+	pdu->meta_len = meta_len;
+	rq->end_io_data = ioucmd;
+}
+
 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 		unsigned len, u32 seed, bool write)
 {
@@ -63,7 +137,8 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
 static struct request *nvme_alloc_user_request(struct request_queue *q,
 		struct nvme_command *cmd, void __user *ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, unsigned timeout, bool vec)
+		u32 meta_seed, unsigned timeout, bool vec, unsigned int rq_flags,
+		blk_mq_req_flags_t blk_flags)
 {
 	bool write = nvme_is_write(cmd);
 	struct nvme_ns *ns = q->queuedata;
@@ -73,7 +148,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 	void *meta = NULL;
 	int ret;
 
-	req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0);
+	req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
 	if (IS_ERR(req))
 		return req;
 	nvme_init_request(req, cmd);
@@ -156,7 +231,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	struct request *req;
 
 	req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer,
-			meta_len, meta_seed, timeout, vec);
+			meta_len, meta_seed, timeout, vec, 0, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 	return nvme_execute_user_rq(req, meta_buffer, meta_len, result);
@@ -333,6 +408,55 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return status;
 }
 
+static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+		struct io_uring_cmd *ioucmd)
+{
+	struct nvme_uring_cmd *cmd =
+		(struct nvme_uring_cmd *)ioucmd->cmd;
+	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
+	struct nvme_command c;
+	struct request *req;
+	unsigned int rq_flags = 0;
+	blk_mq_req_flags_t blk_flags = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (cmd->flags)
+		return -EINVAL;
+	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd->nsid))
+		return -EINVAL;
+
+	if (ioucmd->flags & IO_URING_F_NONBLOCK) {
+		rq_flags = REQ_NOWAIT;
+		blk_flags = BLK_MQ_REQ_NOWAIT;
+	}
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd->opcode;
+	c.common.flags = cmd->flags;
+	c.common.nsid = cpu_to_le32(cmd->nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd->cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd->cdw3);
+	c.common.cdw10 = cpu_to_le32(cmd->cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd->cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd->cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd->cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd->cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd->cdw15);
+
+	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(cmd->addr),
+			cmd->data_len, nvme_to_user_ptr(cmd->metadata),
+			cmd->metadata_len, 0, cmd->timeout_ms ?
+			msecs_to_jiffies(cmd->timeout_ms) : 0, 0, rq_flags,
+			blk_flags);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	nvme_setup_uring_cmd_data(req, ioucmd, nvme_to_user_ptr(cmd->metadata),
+			cmd->metadata_len);
+	blk_execute_rq_nowait(req, 0, nvme_end_async_pt);
+	return -EIOCBQUEUED;
+}
+
 static bool is_ctrl_ioctl(unsigned int cmd)
 {
 	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
@@ -424,6 +548,31 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
+static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
+
+	switch (ioucmd->cmd_op) {
+	case NVME_URING_CMD_IO:
+		ret = nvme_uring_cmd_io(ns->ctrl, ns, ioucmd);
+		break;
+	default:
+		ret = -ENOTTY;
+	}
+
+	if (ret != -EIOCBQUEUED)
+		io_uring_cmd_done(ioucmd, ret, 0);
+}
+
+void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
+			struct nvme_ns, cdev);
+	nvme_ns_uring_cmd(ns, ioucmd);
+}
+
 #ifdef CONFIG_NVME_MULTIPATH
 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
 		void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@@ -490,6 +639,17 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
+void nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+
+	if (ns)
+		nvme_ns_uring_cmd(ns, ioucmd);
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d464fdf978fb..d3e2440d8abb 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
 	.release	= nvme_ns_head_chr_release,
 	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
 	.compat_ioctl	= compat_ptr_ioctl,
+	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
 };
 
 static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a2b53ca63335..57c9adb97ce9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -782,6 +783,8 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
+void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd);
+void nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 
 extern const struct attribute_group *nvme_ns_id_attr_groups[];
diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index b2e43185e3b5..04e458c649ab 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 {
 	__u64	result;
 };
 
+/* same as struct nvme_passthru_cmd64, minus the 8b result field */
+struct nvme_uring_cmd {
+	__u8	opcode;
+	__u8	flags;
+	__u16	rsvd1;
+	__u32	nsid;
+	__u32	cdw2;
+	__u32	cdw3;
+	__u64	metadata;
+	__u64	addr;
+	__u32	metadata_len;
+	__u32	data_len;
+	__u32	cdw10;
+	__u32	cdw11;
+	__u32	cdw12;
+	__u32	cdw13;
+	__u32	cdw14;
+	__u32	cdw15;
+	__u32	timeout_ms;
+	__u32   rsvd2;
+};
+
 #define nvme_admin_cmd nvme_passthru_cmd
 
 #define NVME_IOCTL_ID		_IO('N', 0x40)
@@ -83,4 +105,7 @@ struct nvme_passthru_cmd64 {
 #define NVME_IOCTL_IO64_CMD	_IOWR('N', 0x48, struct nvme_passthru_cmd64)
 #define NVME_IOCTL_IO64_CMD_VEC	_IOWR('N', 0x49, struct nvme_passthru_cmd64)
 
+/* io_uring async commands: */
+#define NVME_URING_CMD_IO	_IOWR('N', 0x80, struct nvme_uring_cmd)
+
 #endif /* _UAPI_LINUX_NVME_IOCTL_H */
-- 
2.25.1


* [PATCH v3 5/5] nvme: add vectored-io support for uring-cmd
From: Pankaj Raghav @ 2022-05-03 18:48 UTC
  To: axboe, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev, Anuj Gupta, Kanchan Joshi

From: Anuj Gupta <anuj20.g@samsung.com>

Wire up support for async passthru that takes an array of buffers
(using iovec), exposed via a new op, NVME_URING_CMD_IO_VEC. The same
'struct nvme_uring_cmd' is used, with the following mapping (sketched
below):

1. cmd.addr as the base address of the user iovec array
2. cmd.vec_cnt as the count of iovec array elements
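
A minimal, untested userspace sketch (SQE setup as for
NVME_URING_CMD_IO; buf0/buf1 are placeholders):

struct iovec iov[2] = {
	{ .iov_base = buf0, .iov_len = 4096 },
	{ .iov_base = buf1, .iov_len = 4096 },
};
struct nvme_uring_cmd *cmd = (struct nvme_uring_cmd *)&sqe->cmd;

sqe->cmd_op = NVME_URING_CMD_IO_VEC;
cmd->addr = (__u64)(uintptr_t)iov;	/* base of the user iovec array */
cmd->vec_cnt = 2;			/* number of iovec elements */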

Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
 drivers/nvme/host/ioctl.c       | 9 ++++++---
 include/uapi/linux/nvme_ioctl.h | 6 +++++-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index e428d692375a..ffc695aa5ba3 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -409,7 +409,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 }
 
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-		struct io_uring_cmd *ioucmd)
+		struct io_uring_cmd *ioucmd, bool vec)
 {
 	struct nvme_uring_cmd *cmd =
 		(struct nvme_uring_cmd *)ioucmd->cmd;
@@ -446,7 +446,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(cmd->addr),
 			cmd->data_len, nvme_to_user_ptr(cmd->metadata),
 			cmd->metadata_len, 0, cmd->timeout_ms ?
-			msecs_to_jiffies(cmd->timeout_ms) : 0, 0, rq_flags,
+			msecs_to_jiffies(cmd->timeout_ms) : 0, vec, rq_flags,
 			blk_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -556,7 +556,10 @@ static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd)
 
 	switch (ioucmd->cmd_op) {
 	case NVME_URING_CMD_IO:
-		ret = nvme_uring_cmd_io(ns->ctrl, ns, ioucmd);
+		ret = nvme_uring_cmd_io(ns->ctrl, ns, ioucmd, false);
+		break;
+	case NVME_URING_CMD_IO_VEC:
+		ret = nvme_uring_cmd_io(ns->ctrl, ns, ioucmd, true);
 		break;
 	default:
 		ret = -ENOTTY;
diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h
index 04e458c649ab..938e0a0bf46f 100644
--- a/include/uapi/linux/nvme_ioctl.h
+++ b/include/uapi/linux/nvme_ioctl.h
@@ -81,7 +81,10 @@ struct nvme_uring_cmd {
 	__u64	metadata;
 	__u64	addr;
 	__u32	metadata_len;
-	__u32	data_len;
+	union {
+		__u32	data_len; /* for non-vectored io */
+		__u32	vec_cnt; /* for vectored io */
+	};
 	__u32	cdw10;
 	__u32	cdw11;
 	__u32	cdw12;
@@ -107,5 +110,6 @@ struct nvme_uring_cmd {
 
 /* io_uring async commands: */
 #define NVME_URING_CMD_IO	_IOWR('N', 0x80, struct nvme_uring_cmd)
+#define NVME_URING_CMD_IO_VEC	_IOWR('N', 0x81, struct nvme_uring_cmd)
 
 #endif /* _UAPI_LINUX_NVME_IOCTL_H */
-- 
2.25.1


* Re: [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Christoph Hellwig @ 2022-05-03 20:52 UTC
  To: Pankaj Raghav
  Cc: axboe, hch, io-uring, linux-nvme, asml.silence, ming.lei, mcgrof,
	shr, joshiiitr, gost.dev, Kanchan Joshi

This mostly looks good, but a few comments and an (untested) patch
to fix these complaints below:

> +struct io_uring_cmd {
> +	struct file     *file;
> +	void            *cmd;
> +	/* for irq-completion - if driver requires doing stuff in task-context*/
> +	void (*driver_cb)(struct io_uring_cmd *cmd);

I'd rename this task_work_cb or similar, as driver is not very
meaningful.

> +	u32             flags;
> +	u32             cmd_op;
> +	u32		unused;
> +	u8		pdu[28]; /* available inline for free use */

The unused field here is not only unused, but also messes up the
alignment, so that any pointer in pdu will not be properly aligned
on 64-bit platforms and thus might cause crashes on some platforms.

For now we can just remove it, but I really want to get rid of the pdu
thing that causes these problems, but I will not hold the series off for
that and plan to look into that later.

Also a lot of the fields here were using spaces instead of tabs for
indentation.

>  	union {
>  		__u64	addr;	/* pointer to buffer or iovecs */
>  		__u64	splice_off_in;
> +		__u16	cmd_len;
>  	};

This field isn't actually used and can be removed.

>  	__u32	len;		/* buffer size or number of iovecs */
>  	union {
> @@ -61,7 +63,10 @@ struct io_uring_sqe {
>  		__s32	splice_fd_in;
>  		__u32	file_index;
>  	};
> -	__u64	addr3;
> +	union {
> +		__u64	addr3;
> +		__u64	cmd;
> +	};
>  	__u64	__pad2[1];

The way the tail of the structure is handled here is a mess.  I
think something like a union of two structs for the small and large
case would be much better, so that cmd can be the actual array for
the data, which can be used directly and without the nasty cast (that
also cast away the constness):

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b774e6eac5384..fe24d606ca306 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4544,7 +4544,7 @@ static int io_getxattr_prep(struct io_kiocb *req,
 	if (ret)
 		return ret;
 
-	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+	path = u64_to_user_ptr(READ_ONCE(sqe->small.addr3));
 
 	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
 	if (IS_ERR(ix->filename)) {
@@ -4645,7 +4645,7 @@ static int io_setxattr_prep(struct io_kiocb *req,
 	if (ret)
 		return ret;
 
-	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+	path = u64_to_user_ptr(READ_ONCE(sqe->small.addr3));
 
 	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
 	if (IS_ERR(ix->filename)) {
@@ -4909,15 +4909,15 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 
 static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
 {
-	req->uring_cmd.driver_cb(&req->uring_cmd);
+	req->uring_cmd.task_work_cb(&req->uring_cmd);
 }
 
 void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*driver_cb)(struct io_uring_cmd *))
+			void (*task_work_cb)(struct io_uring_cmd *))
 {
 	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
 
-	req->uring_cmd.driver_cb = driver_cb;
+	req->uring_cmd.task_work_cb = task_work_cb;
 	req->io_task_work.func = io_uring_cmd_work;
 	io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
 }
@@ -4949,7 +4949,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
 		return -EOPNOTSUPP;
 	if (!(req->ctx->flags & IORING_SETUP_CQE32))
 		return -EOPNOTSUPP;
-	ioucmd->cmd = (void *) &sqe->cmd;
+	ioucmd->cmd = sqe->big.cmd;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	ioucmd->flags = 0;
 	return 0;
@@ -12735,7 +12735,7 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
-	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
+	BUILD_BUG_SQE_ELEM(48, __u64,  small.addr3);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
 		     sizeof(struct io_uring_rsrc_update));
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index a4ff4696cbead..7274884859910 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -13,20 +13,19 @@ enum io_uring_cmd_flags {
 };
 
 struct io_uring_cmd {
-	struct file     *file;
-	void            *cmd;
-	/* for irq-completion - if driver requires doing stuff in task-context*/
-	void (*driver_cb)(struct io_uring_cmd *cmd);
-	u32             flags;
-	u32             cmd_op;
-	u32		unused;
+	struct file	*file;
+	const u8	*cmd;
+	/* callback to defer completions to task context */
+	void (*task_work_cb)(struct io_uring_cmd *cmd);
+	u32		flags;
+	u32		cmd_op;
 	u8		pdu[28]; /* available inline for free use */
 };
 
 #if defined(CONFIG_IO_URING)
 void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
 void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*driver_cb)(struct io_uring_cmd *));
+			void (*task_work_cb)(struct io_uring_cmd *));
 struct sock *io_uring_get_socket(struct file *file);
 void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
@@ -56,7 +55,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
 {
 }
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
-			void (*driver_cb)(struct io_uring_cmd *))
+			void (*task_work_cb)(struct io_uring_cmd *))
 {
 }
 static inline struct sock *io_uring_get_socket(struct file *file)
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 630982b3c34c5..b1f64c2412935 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -539,8 +539,8 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->buf_index	= sqe->buf_index;
 		__entry->personality	= sqe->personality;
 		__entry->file_index	= sqe->file_index;
-		__entry->pad1		= sqe->__pad2[0];
-		__entry->addr3		= sqe->addr3;
+		__entry->addr3		= sqe->small.addr3;
+		__entry->pad1		= sqe->small.__pad2[0];
 		__entry->error		= error;
 	),
 
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index c081511119bfa..bd4221184f594 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,7 +27,6 @@ struct io_uring_sqe {
 	union {
 		__u64	addr;	/* pointer to buffer or iovecs */
 		__u64	splice_off_in;
-		__u16	cmd_len;
 	};
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
@@ -64,16 +63,19 @@ struct io_uring_sqe {
 		__u32	file_index;
 	};
 	union {
-		__u64	addr3;
-		__u64	cmd;
+		struct {
+			__u64	addr3;
+			__u64	__pad2[1];
+		} small;
+
+		/*
+		 * If the ring is initialized with IORING_SETUP_SQE128, then
+		 * this field is used for 80 bytes of arbitrary command data.
+		 */
+		struct {
+			__u8	cmd[0];
+		} big;
 	};
-	__u64	__pad2[1];
-
-	/*
-	 * If the ring is initialized with IORING_SETUP_SQE128, then this field
-	 * contains 64-bytes of padding, doubling the size of the SQE.
-	 */
-	__u64	__big_sqe_pad[0];
 };
 
 enum {

* Re: [PATCH v3 2/5] block: wire-up support for passthrough plugging
From: Christoph Hellwig @ 2022-05-03 20:53 UTC
  To: Pankaj Raghav
  Cc: axboe, hch, io-uring, linux-nvme, asml.silence, ming.lei, mcgrof,
	shr, joshiiitr, gost.dev

> +	if (plug) {
> +		blk_add_rq_to_plug(plug, rq);
> +	} else {

I'd just do an early return after the blk_add_rq_to_plug call and leave
the rest of the code untouched and unindented.

Otherwise this looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCH v3 4/5] nvme: wire-up uring-cmd support for io-passthru on char-device.
From: Christoph Hellwig @ 2022-05-03 20:55 UTC
  To: Pankaj Raghav
  Cc: axboe, hch, io-uring, linux-nvme, asml.silence, ming.lei, mcgrof,
	shr, joshiiitr, gost.dev, Kanchan Joshi, Anuj Gupta

On Tue, May 03, 2022 at 08:48:30PM +0200, Pankaj Raghav wrote:
> operation NVME_URING_CMD_IO. This operates on a new structure,
> nvme_uring_cmd, which is similar to struct nvme_passthru_cmd64 but
> without the embedded 8b result field. That field is not needed since
> uring-cmd allows returning an additional result to user space via the
> big CQE.

So let's have a discussion for everyone on whether to reuse the existing
struct or not.

Pros for reusing:

 - any application that is passing around a nvme_passthru_cmd64 doesn't
   need to do special marshalling

Cons:

 - these fields are pointless

I'm fine going either way, we just need to think about the implications.

Otherwise this looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCH v3 5/5] nvme: add vectored-io support for uring-cmd
From: Christoph Hellwig @ 2022-05-03 20:56 UTC
  To: Pankaj Raghav
  Cc: axboe, hch, io-uring, linux-nvme, asml.silence, ming.lei, mcgrof,
	shr, joshiiitr, gost.dev, Anuj Gupta, Kanchan Joshi

> +	union {
> +		__u32	data_len; /* for non-vectored io */
> +		__u32	vec_cnt; /* for vectored io */
> +	};

Nothing ever uses vec_cnt, so I don't think there is any point in adding
this union.

The rest looks good to me:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Jens Axboe @ 2022-05-03 21:03 UTC
  To: Pankaj Raghav, hch
  Cc: io-uring, linux-nvme, asml.silence, ming.lei, mcgrof, shr,
	joshiiitr, gost.dev, Kanchan Joshi

On 5/3/22 12:48 PM, Pankaj Raghav wrote:
> From: Jens Axboe <axboe@kernel.dk>
> 
> file_operations->uring_cmd is a file private handler, similar to ioctls
> but hopefully a lot more sane and useful.
> 
> IORING_OP_URING_CMD is a file private kind of request. io_uring doesn't
> know what is in this command type, it's for the provider of ->uring_cmd()
> to deal with. This operation can be issued only on a ring that is set
> up with both the IORING_SETUP_SQE128 and IORING_SETUP_CQE32 flags.

A few minor comments below.

> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index c7e3f7e74d92..b774e6eac538 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> +static int io_uring_cmd_prep(struct io_kiocb *req,
> +			     const struct io_uring_sqe *sqe)
> +{
> +	struct io_uring_cmd *ioucmd = &req->uring_cmd;
> +
> +	if (req->ctx->flags & IORING_SETUP_IOPOLL)
> +		return -EOPNOTSUPP;
> +	/* do not support uring-cmd without big SQE/CQE */
> +	if (!(req->ctx->flags & IORING_SETUP_SQE128))
> +		return -EOPNOTSUPP;
> +	if (!(req->ctx->flags & IORING_SETUP_CQE32))
> +		return -EOPNOTSUPP;
> +	ioucmd->cmd = (void *) &sqe->cmd;
> +	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
> +	ioucmd->flags = 0;
> +	return 0;
> +}

I'd define

	struct io_ring_ctx *ctx = req->ctx;

here. And you should read 'rw_flags' and return -EINVAL if it's set, so
we can be backwards compatible if flags are added later. Probably read
e.g. sqe->ioprio as well, as that isn't directly applicable (it'd just
be set in the command directly), and return -EINVAL if that is set. Ala:

static int io_uring_cmd_prep(struct io_kiocb *req,
			     const struct io_uring_sqe *sqe)
{
	struct io_uring_cmd *ioucmd = &req->uring_cmd;
	struct io_ring_ctx *ctx = req->ctx;

	if (ctx->flags & IORING_SETUP_IOPOLL)
		return -EOPNOTSUPP;
	/* do not support uring-cmd without big SQE/CQE */
	if (!(ctx->flags & IORING_SETUP_SQE128))
		return -EOPNOTSUPP;
	if (!(ctx->flags & IORING_SETUP_CQE32))
		return -EOPNOTSUPP;
	if (sqe->ioprio || sqe->rw_flags)
		return -EINVAL;

	ioucmd->cmd = (void *) &sqe->cmd;
	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
	ioucmd->flags = 0;
	return 0;
}

> +static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
> +{
> +	struct file *file = req->file;
> +	struct io_uring_cmd *ioucmd = &req->uring_cmd;
> +
> +	if (!req->file->f_op->uring_cmd)
> +		return -EOPNOTSUPP;
> +	ioucmd->flags |= issue_flags;
> +	file->f_op->uring_cmd(ioucmd);
> +	return 0;
> +}

We should pass in issue_flags here; it's a property of the call path,
not the command itself:

static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
{
	struct file *file = req->file;
	struct io_uring_cmd *ioucmd = &req->uring_cmd;

	if (!req->file->f_op->uring_cmd)
		return -EOPNOTSUPP;
	file->f_op->uring_cmd(ioucmd, issue_flags);
	return 0;
}

and then get rid of ioucmd->flags; that can get re-added as a copy of
sqe->rw_flags when we need it.

Agree with Christoph on the things he brought up. If possible, please
just respin this patch with the suggested changes and we can queue it
up.

-- 
Jens Axboe


* Re: [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Jens Axboe @ 2022-05-04 12:09 UTC
  To: Kanchan Joshi, Christoph Hellwig
  Cc: Pankaj Raghav, io-uring, linux-nvme, Pavel Begunkov, Ming Lei,
	Luis Chamberlain, Stefan Roesch, gost.dev, Kanchan Joshi

On 5/4/22 9:12 AM, Kanchan Joshi wrote:
>> @@ -64,16 +63,19 @@ struct io_uring_sqe {
>>                 __u32   file_index;
>>         };
>>         union {
>> -               __u64   addr3;
>> -               __u64   cmd;
>> +               struct {
>> +                       __u64   addr3;
>> +                       __u64   __pad2[1];
>> +               } small;
> 
> Wondering if this can cause any issue for existing users of addr3 on
> the userspace side, since they would need to access this field as
> small.addr3.
> Jens - is the xattr infra already frozen?

It's not, as it's not upstream yet. But I don't think we need to change
it, just make the two structs unnamed instead. That also avoids awkward
small/big prefixes.

-- 
Jens Axboe


* Re: [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Kanchan Joshi @ 2022-05-04 15:12 UTC
  To: Christoph Hellwig
  Cc: Pankaj Raghav, Jens Axboe, io-uring, linux-nvme, Pavel Begunkov,
	Ming Lei, Luis Chamberlain, Stefan Roesch, gost.dev,
	Kanchan Joshi

On Tue, May 3, 2022 at 1:52 PM Christoph Hellwig <hch@lst.de> wrote:
>
> This mostly looks good, but a few comments and an (untested) patch
> to fix these complaints below:
>
> > +struct io_uring_cmd {
> > +     struct file     *file;
> > +     void            *cmd;
> > +     /* for irq-completion - if driver requires doing stuff in task-context*/
> > +     void (*driver_cb)(struct io_uring_cmd *cmd);
>
> I'd rename this task_work_cb or similar, as driver is not very
> meaningful.
>
> > +     u32             flags;
> > +     u32             cmd_op;
> > +     u32             unused;
> > +     u8              pdu[28]; /* available inline for free use */
>
> The unused field here is not only unused, but also messes up the
> alignment, so that any pointer in pdu will not be properly aligned
> on 64-bit platforms and thus might cause crashes on some platforms.

Indeed, thanks. Since we are going to remove that, we can give that
space to pdu for now, i.e. pdu[32]. This will help in removing the
"packed" attribute in the nvme pdu (which is 32 bytes without packing).

[...]

> @@ -64,16 +63,19 @@ struct io_uring_sqe {
>                 __u32   file_index;
>         };
>         union {
> -               __u64   addr3;
> -               __u64   cmd;
> +               struct {
> +                       __u64   addr3;
> +                       __u64   __pad2[1];
> +               } small;

Wondering if this can cause any issue for existing users of addr3 on
the userspace side, since they would need to access this field as
small.addr3.
Jens - is the xattr infra already frozen?

-- 
Joshi

* Re: [PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
From: Kanchan Joshi @ 2022-05-04 15:48 UTC
  To: Christoph Hellwig
  Cc: Pankaj Raghav, Jens Axboe, io-uring, linux-nvme, Pavel Begunkov,
	Ming Lei, Luis Chamberlain, Stefan Roesch, gost.dev,
	Kanchan Joshi

> > @@ -27,7 +27,6 @@ struct io_uring_sqe {
> >         union {
> >                 __u64   addr;   /* pointer to buffer or iovecs */
> >                 __u64   splice_off_in;
> > -               __u16   cmd_len;
> >         };
> >         __u32   len;            /* buffer size or number of iovecs */
> >         union {
> > @@ -64,16 +63,19 @@ struct io_uring_sqe {
> >                 __u32   file_index;
> >         };
> >         union {
> > -               __u64   addr3;
> > -               __u64   cmd;
> > +               struct {
> > +                       __u64   addr3;
> > +                       __u64   __pad2[1];
> > +               } small;
>
> Wondering if this can cause any issue for existing users of addr3 on
> the userspace side, since they would need to access this field as
> small.addr3.
> Jens - is the xattr infra already frozen?

If this breaks anything, we can avoid that by not touching addr3 at
all, and instead keep "__u64 __pad2[1]" and "__u8 cmd[0]" in the union.
That means 72 bytes of space, but it is enough for the new nvme struct
(nvme_uring_cmd), which is also 72 bytes.
Does this sound fine?
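
Concretely, that would make the SQE tail look roughly like this
(untested sketch of the layout proposed above):

	__u64	addr3;
	union {
		__u64	__pad2[1];
		/*
		 * With IORING_SETUP_SQE128, 72 bytes of command data,
		 * starting here and spilling into the second half of
		 * the SQE.
		 */
		__u8	cmd[0];
	};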
