All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] block integrity: direclty map user space addresses
@ 2023-10-18 15:18 Keith Busch
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-18 15:18 UTC (permalink / raw)
  To: linux-block, linux-nvme, io-uring
  Cc: axboe, hch, joshi.k, martin.petersen, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Handling passthrough metadata ("integrity") today introduces overhead
and complications that we can avoid if we just map user space addresses
directly. This patch series implements that.

Keith Busch (4):
  block: bio-integrity: add support for user buffers
  nvme: use bio_integrity_map_user
  iouring: remove IORING_URING_CMD_POLLED
  io_uring: remove uring_cmd cookie

 block/bio-integrity.c         |  67 +++++++++++++
 drivers/nvme/host/ioctl.c     | 174 ++++++----------------------------
 include/linux/bio.h           |   8 ++
 include/linux/io_uring.h      |   8 +-
 include/uapi/linux/io_uring.h |   2 -
 io_uring/uring_cmd.c          |   1 -
 6 files changed, 104 insertions(+), 156 deletions(-)

-- 
2.34.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
@ 2023-10-18 15:18 ` Keith Busch
  2023-10-19  5:39   ` Christoph Hellwig
                     ` (3 more replies)
  2023-10-18 15:18 ` [PATCH 2/4] nvme: use bio_integrity_map_user Keith Busch
                   ` (3 subsequent siblings)
  4 siblings, 4 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-18 15:18 UTC (permalink / raw)
  To: linux-block, linux-nvme, io-uring
  Cc: axboe, hch, joshi.k, martin.petersen, Keith Busch

From: Keith Busch <kbusch@kernel.org>

User space passthrough commands that utilize metadata currently need to
bounce the "integrity" buffer through the kernel. This adds unnecessary
overhead and memory pressure.

Add support for mapping user space directly so that we can avoid this
costly copy. This is similiar to how the bio payload utilizes user
addresses with bio_map_user_iov().

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio-integrity.c | 67 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h   |  8 ++++++
 2 files changed, 75 insertions(+)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b9..08f70b837a29b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -91,6 +91,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+
+	bip_for_each_vec(bv, bip, iter) {
+		if (dirty && !PageCompound(bv.bv_page))
+			set_page_dirty_lock(bv.bv_page);
+		unpin_user_page(bv.bv_page);
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +118,8 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	else if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);;
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +175,58 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
+			   u32 seed, u32 maxvecs)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
+	struct page *stack_pages[UIO_FASTIOV];
+	size_t offset = offset_in_page(ubuf);
+	unsigned long ptr = (uintptr_t)ubuf;
+	struct page **pages = stack_pages;
+	struct bio_integrity_payload *bip;
+	int npages, ret, i;
+
+	if (bio_integrity(bio) || ptr & align || maxvecs > UIO_FASTIOV)
+		return -EINVAL;
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, maxvecs);
+	if (IS_ERR(bip))
+		return PTR_ERR(bip);
+
+	ret = pin_user_pages_fast(ptr, UIO_FASTIOV, FOLL_WRITE, pages);
+	if (unlikely(ret < 0))
+		goto free_bip;
+
+	npages = ret;
+	for (i = 0; i < npages; i++) {
+		u32 bytes = min_t(u32, len, PAGE_SIZE - offset);
+		ret = bio_integrity_add_page(bio, pages[i], bytes, offset);
+		if (ret != bytes) {
+			ret = -EINVAL;
+			goto release_pages;
+		}
+		len -= ret;
+		offset = 0;
+	}
+
+	if (len) {
+		ret = -EINVAL;
+		goto release_pages;
+	}
+
+	bip->bip_iter.bi_sector = seed;
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	return 0;
+
+release_pages:
+	unpin_user_pages(pages, npages);
+free_bip:
+	bio_integrity_free(bio);
+	return ret;
+}
+EXPORT_SYMBOL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee13499..144cc280b6ad3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,7 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
 };
 
 /*
@@ -720,6 +721,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
 
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
+extern int bio_integrity_map_user(struct bio *, void __user *, unsigned int, u32, u32);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *);
@@ -789,6 +791,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 unsigned int len, u32 seed, u32 maxvecs)
+{
+	return -EINVAL
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/4] nvme: use bio_integrity_map_user
  2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
@ 2023-10-18 15:18 ` Keith Busch
  2023-10-19  5:40   ` Christoph Hellwig
  2023-10-25 13:26   ` Kanchan Joshi
  2023-10-18 15:18 ` [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED Keith Busch
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-18 15:18 UTC (permalink / raw)
  To: linux-block, linux-nvme, io-uring
  Cc: axboe, hch, joshi.k, martin.petersen, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Map user metadata buffers directly instead of maintaining a complicated
copy buffer.

Now that the bio tracks the metadata through its bip, nvme doesn't need
special metadata handling, callbacks, or additional fields in the pdu.
This greatly simplifies passthrough handling and avoids a "might_fault"
copy_to_user in the completion path. This also creates pdu space to
track the original request separately from its bio, further simplifying
polling without relying on special iouring fields.

The downside is that nvme requires the metadata buffer be physically
contiguous, so user space will need to utilize huge pages if the buffer
needs to span multiple pages. In practice, metadata payload sizes are a
small fraction of the main payload, so this shouldn't be a problem.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/ioctl.c | 174 ++++++--------------------------------
 1 file changed, 27 insertions(+), 147 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f21..a4889536ca4c6 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -96,52 +96,17 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 	return (void __user *)ptrval;
 }
 
-static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
+static int nvme_add_user_metadata(struct request *req, void __user *ubuf,
 		unsigned len, u32 seed)
 {
-	struct bio_integrity_payload *bip;
-	int ret = -ENOMEM;
-	void *buf;
-	struct bio *bio = req->bio;
-
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf)
-		goto out;
-
-	ret = -EFAULT;
-	if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len))
-		goto out_free_meta;
-
-	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
-	if (IS_ERR(bip)) {
-		ret = PTR_ERR(bip);
-		goto out_free_meta;
-	}
+	int ret;
 
-	bip->bip_iter.bi_sector = seed;
-	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
-			offset_in_page(buf));
-	if (ret != len) {
-		ret = -ENOMEM;
-		goto out_free_meta;
-	}
+	ret = bio_integrity_map_user(req->bio, ubuf, len, seed, 1);
+	if (ret)
+		return ret;
 
 	req->cmd_flags |= REQ_INTEGRITY;
-	return buf;
-out_free_meta:
-	kfree(buf);
-out:
-	return ERR_PTR(ret);
-}
-
-static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
-		void *meta, unsigned len, int ret)
-{
-	if (!ret && req_op(req) == REQ_OP_DRV_IN &&
-	    copy_to_user(ubuf, meta, len))
-		ret = -EFAULT;
-	kfree(meta);
-	return ret;
+	return 0;
 }
 
 static struct request *nvme_alloc_user_request(struct request_queue *q,
@@ -160,14 +125,12 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
-		unsigned int flags)
+		u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
 {
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
 	struct bio *bio = NULL;
-	void *meta = NULL;
 	int ret;
 
 	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
@@ -194,13 +157,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		bio_set_dev(bio, bdev);
 
 	if (bdev && meta_buffer && meta_len) {
-		meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
+		ret = nvme_add_user_metadata(req, meta_buffer, meta_len,
 				meta_seed);
-		if (IS_ERR(meta)) {
-			ret = PTR_ERR(meta);
+		if (ret)
 			goto out_unmap;
-		}
-		*metap = meta;
 	}
 
 	return ret;
@@ -221,7 +181,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_ctrl *ctrl;
 	struct request *req;
-	void *meta = NULL;
 	struct bio *bio;
 	u32 effects;
 	int ret;
@@ -233,7 +192,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	req->timeout = timeout;
 	if (ubuffer && bufflen) {
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
-				meta_len, meta_seed, &meta, NULL, flags);
+				meta_len, meta_seed, NULL, flags);
 		if (ret)
 			return ret;
 	}
@@ -245,9 +204,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	ret = nvme_execute_rq(req, false);
 	if (result)
 		*result = le64_to_cpu(nvme_req(req)->result.u64);
-	if (meta)
-		ret = nvme_finish_user_metadata(req, meta_buffer, meta,
-						meta_len, ret);
 	if (bio)
 		blk_rq_unmap_user(bio);
 	blk_mq_free_request(req);
@@ -442,19 +398,10 @@ struct nvme_uring_data {
  * Expect build errors if this grows larger than that.
  */
 struct nvme_uring_cmd_pdu {
-	union {
-		struct bio *bio;
-		struct request *req;
-	};
-	u32 meta_len;
-	u32 nvme_status;
-	union {
-		struct {
-			void *meta; /* kernel-resident buffer */
-			void __user *meta_buffer;
-		};
-		u64 result;
-	} u;
+	struct request *req;
+	struct bio *bio;
+	u64 result;
+	int status;
 };
 
 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
@@ -463,31 +410,6 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
 	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
 }
 
-static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd,
-				    unsigned issue_flags)
-{
-	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	struct request *req = pdu->req;
-	int status;
-	u64 result;
-
-	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
-		status = -EINTR;
-	else
-		status = nvme_req(req)->status;
-
-	result = le64_to_cpu(nvme_req(req)->result.u64);
-
-	if (pdu->meta_len)
-		status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
-					pdu->u.meta, pdu->meta_len, status);
-	if (req->bio)
-		blk_rq_unmap_user(req->bio);
-	blk_mq_free_request(req);
-
-	io_uring_cmd_done(ioucmd, status, result, issue_flags);
-}
-
 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
 			       unsigned issue_flags)
 {
@@ -495,8 +417,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
 
 	if (pdu->bio)
 		blk_rq_unmap_user(pdu->bio);
-
-	io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags);
+	io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
@@ -505,50 +426,24 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 
-	req->bio = pdu->bio;
 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
-		pdu->nvme_status = -EINTR;
+		pdu->status = -EINTR;
 	else
-		pdu->nvme_status = nvme_req(req)->status;
-	pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
+		pdu->status = nvme_req(req)->status;
+	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
 	 * For iopoll, complete it directly.
 	 * Otherwise, move the completion to task work.
 	 */
-	if (blk_rq_is_poll(req)) {
-		WRITE_ONCE(ioucmd->cookie, NULL);
+	if (blk_rq_is_poll(req))
 		nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
-	} else {
+	else
 		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
-	}
 
 	return RQ_END_IO_FREE;
 }
 
-static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
-						     blk_status_t err)
-{
-	struct io_uring_cmd *ioucmd = req->end_io_data;
-	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-
-	req->bio = pdu->bio;
-	pdu->req = req;
-
-	/*
-	 * For iopoll, complete it directly.
-	 * Otherwise, move the completion to task work.
-	 */
-	if (blk_rq_is_poll(req)) {
-		WRITE_ONCE(ioucmd->cookie, NULL);
-		nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED);
-	} else {
-		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb);
-	}
-
-	return RQ_END_IO_NONE;
-}
-
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
@@ -560,7 +455,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	struct request *req;
 	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
 	blk_mq_req_flags_t blk_flags = 0;
-	void *meta = NULL;
 	int ret;
 
 	c.common.opcode = READ_ONCE(cmd->opcode);
@@ -608,27 +502,17 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (d.addr && d.data_len) {
 		ret = nvme_map_user_request(req, d.addr,
 			d.data_len, nvme_to_user_ptr(d.metadata),
-			d.metadata_len, 0, &meta, ioucmd, vec);
+			d.metadata_len, 0, ioucmd, vec);
 		if (ret)
 			return ret;
 	}
 
-	if (blk_rq_is_poll(req)) {
-		ioucmd->flags |= IORING_URING_CMD_POLLED;
-		WRITE_ONCE(ioucmd->cookie, req);
-	}
 
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
-	pdu->meta_len = d.metadata_len;
+	pdu->req = req;
 	req->end_io_data = ioucmd;
-	if (pdu->meta_len) {
-		pdu->u.meta = meta;
-		pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
-		req->end_io = nvme_uring_cmd_end_io_meta;
-	} else {
-		req->end_io = nvme_uring_cmd_end_io;
-	}
+	req->end_io = nvme_uring_cmd_end_io;
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
 }
@@ -779,16 +663,12 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
 				 struct io_comp_batch *iob,
 				 unsigned int poll_flags)
 {
-	struct request *req;
-	int ret = 0;
-
-	if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
-		return 0;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	struct request *req = pdu->req;
 
-	req = READ_ONCE(ioucmd->cookie);
 	if (req && blk_rq_is_poll(req))
-		ret = blk_rq_poll(req, iob, poll_flags);
-	return ret;
+		return blk_rq_poll(req, iob, poll_flags);
+	return 0;
 }
 #ifdef CONFIG_NVME_MULTIPATH
 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED
  2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
  2023-10-18 15:18 ` [PATCH 2/4] nvme: use bio_integrity_map_user Keith Busch
@ 2023-10-18 15:18 ` Keith Busch
  2023-10-19  5:41   ` Christoph Hellwig
  2023-10-23  6:18   ` Kanchan Joshi
  2023-10-18 15:18 ` [PATCH 4/4] io_uring: remove uring_cmd cookie Keith Busch
  2023-10-19  5:34 ` [PATCH 0/4] block integrity: direclty map user space addresses Christoph Hellwig
  4 siblings, 2 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-18 15:18 UTC (permalink / raw)
  To: linux-block, linux-nvme, io-uring
  Cc: axboe, hch, joshi.k, martin.petersen, Keith Busch

From: Keith Busch <kbusch@kernel.org>

No more users of this flag.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/uapi/linux/io_uring.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8e61f8b7c2ced..10e724370b612 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -249,10 +249,8 @@ enum io_uring_op {
  * sqe->uring_cmd_flags
  * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
  *				along with setting sqe->buf_index.
- * IORING_URING_CMD_POLLED	driver use only
  */
 #define IORING_URING_CMD_FIXED	(1U << 0)
-#define IORING_URING_CMD_POLLED	(1U << 31)
 
 
 /*
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 4/4] io_uring: remove uring_cmd cookie
  2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
                   ` (2 preceding siblings ...)
  2023-10-18 15:18 ` [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED Keith Busch
@ 2023-10-18 15:18 ` Keith Busch
  2023-10-19  5:34 ` [PATCH 0/4] block integrity: direclty map user space addresses Christoph Hellwig
  4 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-18 15:18 UTC (permalink / raw)
  To: linux-block, linux-nvme, io-uring
  Cc: axboe, hch, joshi.k, martin.petersen, Keith Busch

From: Keith Busch <kbusch@kernel.org>

No more users of this field.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/io_uring.h | 8 ++------
 io_uring/uring_cmd.c     | 1 -
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 106cdc55ff3bd..30d3db4bc61a7 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -25,12 +25,8 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file	*file;
 	const struct io_uring_sqe *sqe;
-	union {
-		/* callback to defer completions to task context */
-		void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
-		/* used for polled completion */
-		void *cookie;
-	};
+	/* callback to defer completions to task context */
+	void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
 	u32		cmd_op;
 	u32		flags;
 	u8		pdu[32]; /* available inline for free use */
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 537795fddc87d..56f3ef8206057 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -133,7 +133,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 			return -EOPNOTSUPP;
 		issue_flags |= IO_URING_F_IOPOLL;
 		req->iopoll_completed = 0;
-		WRITE_ONCE(ioucmd->cookie, NULL);
 	}
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] block integrity: direclty map user space addresses
  2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
                   ` (3 preceding siblings ...)
  2023-10-18 15:18 ` [PATCH 4/4] io_uring: remove uring_cmd cookie Keith Busch
@ 2023-10-19  5:34 ` Christoph Hellwig
  4 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2023-10-19  5:34 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, linux-nvme, io-uring, axboe, hch, joshi.k,
	martin.petersen, Keith Busch

s/direclty/directly/

in the subject.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
@ 2023-10-19  5:39   ` Christoph Hellwig
  2023-10-21  3:53   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2023-10-19  5:39 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, linux-nvme, io-uring, axboe, hch, joshi.k,
	martin.petersen, Keith Busch

int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed, u32 maxvecs)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
> +	struct page *stack_pages[UIO_FASTIOV];
> +	size_t offset = offset_in_page(ubuf);
> +	unsigned long ptr = (uintptr_t)ubuf;
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	int npages, ret, i;
> +
> +	if (bio_integrity(bio) || ptr & align || maxvecs > UIO_FASTIOV)
> +		return -EINVAL;

We also need to check the length for the dma alignment/pad, not
just the start.  (The undocumented iov_iter_alignment_iovec helper
obsfucateѕ this for the data path).

> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, maxvecs);
> +	if (IS_ERR(bip))
> +		return PTR_ERR(bip);
> +
> +	ret = pin_user_pages_fast(ptr, UIO_FASTIOV, FOLL_WRITE, pages);
> +	if (unlikely(ret < 0))
> +		goto free_bip;
> +
> +	npages = ret;
> +	for (i = 0; i < npages; i++) {
> +		u32 bytes = min_t(u32, len, PAGE_SIZE - offset);
> +		ret = bio_integrity_add_page(bio, pages[i], bytes, offset);
> +		if (ret != bytes) {
> +			ret = -EINVAL;
> +			goto release_pages;
> +		}
> +		len -= ret;
> +		offset = 0;
> +	}

Any reason to not use the bio_vec array as the buffer, similar to the
data size here?

> +EXPORT_SYMBOL(bio_integrity_map_user);

Everything that just thinly wraps get_user_pages_fast needs to be
EXPORT_SYMBOL_GPL.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] nvme: use bio_integrity_map_user
  2023-10-18 15:18 ` [PATCH 2/4] nvme: use bio_integrity_map_user Keith Busch
@ 2023-10-19  5:40   ` Christoph Hellwig
  2023-10-25 13:26   ` Kanchan Joshi
  1 sibling, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2023-10-19  5:40 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, linux-nvme, io-uring, axboe, hch, joshi.k,
	martin.petersen, Keith Busch

On Wed, Oct 18, 2023 at 08:18:41AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Map user metadata buffers directly instead of maintaining a complicated
> copy buffer.
> 
> Now that the bio tracks the metadata through its bip, nvme doesn't need
> special metadata handling, callbacks, or additional fields in the pdu.
> This greatly simplifies passthrough handling and avoids a "might_fault"
> copy_to_user in the completion path. This also creates pdu space to
> track the original request separately from its bio, further simplifying
> polling without relying on special iouring fields.
> 
> The downside is that nvme requires the metadata buffer be physically
> contiguous, so user space will need to utilize huge pages if the buffer
> needs to span multiple pages. In practice, metadata payload sizes are a
> small fraction of the main payload, so this shouldn't be a problem.

We can't just remove the old path.  We might still need bounce
buffering to due misalignment and/or because it is notcontiguous.
Same as we have a direct map and a copy path for data.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED
  2023-10-18 15:18 ` [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED Keith Busch
@ 2023-10-19  5:41   ` Christoph Hellwig
  2023-10-19 14:43     ` Keith Busch
  2023-10-23  6:18   ` Kanchan Joshi
  1 sibling, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2023-10-19  5:41 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-block, linux-nvme, io-uring, axboe, hch, joshi.k,
	martin.petersen, Keith Busch

Looks good and should probably go straight to the io_uring tree
instead of being mixed up with the metadata changes.

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED
  2023-10-19  5:41   ` Christoph Hellwig
@ 2023-10-19 14:43     ` Keith Busch
  0 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-19 14:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-block, linux-nvme, io-uring, axboe, joshi.k,
	martin.petersen

On Thu, Oct 19, 2023 at 07:41:05AM +0200, Christoph Hellwig wrote:
> Looks good and should probably go straight to the io_uring tree
> instead of being mixed up with the metadata changes.

The previos metadata patch removes the only user of the flag, so this
can't go in separately.

But if the driver needs to fallback to kernel bounce buffer for
unaligned or multi-page requests, I don't think I can easily get rid of
the iouring flag since the driver PDU doesn't have enough room to track
everything it needs.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
  2023-10-19  5:39   ` Christoph Hellwig
@ 2023-10-21  3:53   ` kernel test robot
  2023-10-21  4:13   ` kernel test robot
  2023-10-25 12:51   ` Kanchan Joshi
  3 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2023-10-21  3:53 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, io-uring
  Cc: oe-kbuild-all, axboe, hch, joshi.k, martin.petersen, Keith Busch

Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.6-rc6 next-20231020]
[cannot apply to axboe-block/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-add-support-for-user-buffers/20231018-232704
base:   linus/master
patch link:    https://lore.kernel.org/r/20231018151843.3542335-2-kbusch%40meta.com
patch subject: [PATCH 1/4] block: bio-integrity: add support for user buffers
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20231021/202310211117.qmDPOVfI-lkp@intel.com/config)
compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231021/202310211117.qmDPOVfI-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310211117.qmDPOVfI-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from include/linux/blkdev.h:17:0,
                    from init/main.c:85:
   include/linux/bio.h: In function 'bio_integrity_map_user':
>> include/linux/bio.h:798:1: error: expected ';' before '}' token
    }
    ^
--
   In file included from include/linux/blkdev.h:17:0,
                    from lib/vsprintf.c:47:
   include/linux/bio.h: In function 'bio_integrity_map_user':
>> include/linux/bio.h:798:1: error: expected ';' before '}' token
    }
    ^
   lib/vsprintf.c: In function 'va_format':
   lib/vsprintf.c:1682:2: warning: function 'va_format' might be a candidate for 'gnu_printf' format attribute [-Wsuggest-attribute=format]
     buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va);
     ^~~


vim +798 include/linux/bio.h

   793	
   794	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
   795						 unsigned int len, u32 seed, u32 maxvecs)
   796	{
   797		return -EINVAL
 > 798	}
   799	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
  2023-10-19  5:39   ` Christoph Hellwig
  2023-10-21  3:53   ` kernel test robot
@ 2023-10-21  4:13   ` kernel test robot
  2023-10-25 12:51   ` Kanchan Joshi
  3 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2023-10-21  4:13 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, io-uring
  Cc: llvm, oe-kbuild-all, axboe, hch, joshi.k, martin.petersen, Keith Busch

Hi Keith,

kernel test robot noticed the following build errors:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.6-rc6 next-20231020]
[cannot apply to axboe-block/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Keith-Busch/block-bio-integrity-add-support-for-user-buffers/20231018-232704
base:   linus/master
patch link:    https://lore.kernel.org/r/20231018151843.3542335-2-kbusch%40meta.com
patch subject: [PATCH 1/4] block: bio-integrity: add support for user buffers
config: um-allnoconfig (https://download.01.org/0day-ci/archive/20231021/202310211209.gA0mAZaz-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231021/202310211209.gA0mAZaz-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202310211209.gA0mAZaz-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from init/main.c:21:
   In file included from include/linux/syscalls.h:90:
   In file included from include/trace/syscall.h:7:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     547 |         val = __raw_readb(PCI_IOBASE + addr);
         |                           ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     560 |         val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
      37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
         |                                                   ^
   In file included from init/main.c:21:
   In file included from include/linux/syscalls.h:90:
   In file included from include/trace/syscall.h:7:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     573 |         val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
      35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
         |                                                   ^
   In file included from init/main.c:21:
   In file included from include/linux/syscalls.h:90:
   In file included from include/trace/syscall.h:7:
   In file included from include/linux/trace_events.h:9:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     584 |         __raw_writeb(value, PCI_IOBASE + addr);
         |                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     594 |         __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     604 |         __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     692 |         readsb(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     700 |         readsw(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     708 |         readsl(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     717 |         writesb(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     726 |         writesw(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     735 |         writesl(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   In file included from init/main.c:85:
   In file included from include/linux/blkdev.h:17:
>> include/linux/bio.h:797:16: error: expected ';' after return statement
     797 |         return -EINVAL
         |                       ^
         |                       ;
   12 warnings and 1 error generated.
--
   In file included from mm/swapfile.c:9:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     547 |         val = __raw_readb(PCI_IOBASE + addr);
         |                           ~~~~~~~~~~ ^
   include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     560 |         val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
      37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
         |                                                   ^
   In file included from mm/swapfile.c:9:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     573 |         val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
         |                                                         ~~~~~~~~~~ ^
   include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
      35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
         |                                                   ^
   In file included from mm/swapfile.c:9:
   In file included from include/linux/blkdev.h:9:
   In file included from include/linux/blk_types.h:10:
   In file included from include/linux/bvec.h:10:
   In file included from include/linux/highmem.h:12:
   In file included from include/linux/hardirq.h:11:
   In file included from arch/um/include/asm/hardirq.h:5:
   In file included from include/asm-generic/hardirq.h:17:
   In file included from include/linux/irq.h:20:
   In file included from include/linux/io.h:13:
   In file included from arch/um/include/asm/io.h:24:
   include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     584 |         __raw_writeb(value, PCI_IOBASE + addr);
         |                             ~~~~~~~~~~ ^
   include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     594 |         __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     604 |         __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
         |                                                       ~~~~~~~~~~ ^
   include/asm-generic/io.h:692:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     692 |         readsb(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:700:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     700 |         readsw(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:708:20: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     708 |         readsl(PCI_IOBASE + addr, buffer, count);
         |                ~~~~~~~~~~ ^
   include/asm-generic/io.h:717:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     717 |         writesb(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:726:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     726 |         writesw(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   include/asm-generic/io.h:735:21: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
     735 |         writesl(PCI_IOBASE + addr, buffer, count);
         |                 ~~~~~~~~~~ ^
   In file included from mm/swapfile.c:9:
   In file included from include/linux/blkdev.h:17:
>> include/linux/bio.h:797:16: error: expected ';' after return statement
     797 |         return -EINVAL
         |                       ^
         |                       ;
   In file included from mm/swapfile.c:14:
   include/linux/mman.h:158:9: warning: division by zero is undefined [-Wdivision-by-zero]
     158 |                _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
         |                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/mman.h:136:21: note: expanded from macro '_calc_vm_trans'
     136 |    : ((x) & (bit1)) / ((bit1) / (bit2))))
         |                     ^ ~~~~~~~~~~~~~~~~~
   13 warnings and 1 error generated.


vim +797 include/linux/bio.h

   793	
   794	static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
   795						 unsigned int len, u32 seed, u32 maxvecs)
   796	{
 > 797		return -EINVAL
   798	}
   799	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED
  2023-10-18 15:18 ` [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED Keith Busch
  2023-10-19  5:41   ` Christoph Hellwig
@ 2023-10-23  6:18   ` Kanchan Joshi
  1 sibling, 0 replies; 16+ messages in thread
From: Kanchan Joshi @ 2023-10-23  6:18 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, io-uring
  Cc: axboe, hch, martin.petersen, Keith Busch

On 10/18/2023 8:48 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> No more users of this flag.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   include/uapi/linux/io_uring.h | 2 --
>   1 file changed, 2 deletions(-)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 8e61f8b7c2ced..10e724370b612 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -249,10 +249,8 @@ enum io_uring_op {
>    * sqe->uring_cmd_flags
>    * IORING_URING_CMD_FIXED	use registered buffer; pass this flag
>    *				along with setting sqe->buf_index.
> - * IORING_URING_CMD_POLLED	driver use only
>    */
>   #define IORING_URING_CMD_FIXED	(1U << 0)
> -#define IORING_URING_CMD_POLLED	(1U << 31)
>   

This is bit outdated. This flag got moved to a different file since this 
patch.
https://lore.kernel.org/io-uring/20230928124327.135679-2-ming.lei@redhat.com/

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
                     ` (2 preceding siblings ...)
  2023-10-21  4:13   ` kernel test robot
@ 2023-10-25 12:51   ` Kanchan Joshi
  2023-10-25 14:42     ` Keith Busch
  3 siblings, 1 reply; 16+ messages in thread
From: Kanchan Joshi @ 2023-10-25 12:51 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, io-uring
  Cc: axboe, hch, martin.petersen, Keith Busch

On 10/18/2023 8:48 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> User space passthrough commands that utilize metadata currently need to
> bounce the "integrity" buffer through the kernel. This adds unnecessary
> overhead and memory pressure.
> 
> Add support for mapping user space directly so that we can avoid this
> costly copy. This is similiar to how the bio payload utilizes user
> addresses with bio_map_user_iov().
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/bio-integrity.c | 67 +++++++++++++++++++++++++++++++++++++++++++
>   include/linux/bio.h   |  8 ++++++
>   2 files changed, 75 insertions(+)
> 
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..08f70b837a29b 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
>   }
>   EXPORT_SYMBOL(bio_integrity_alloc);
>   
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	bip_for_each_vec(bv, bip, iter) {
> +		if (dirty && !PageCompound(bv.bv_page))
> +			set_page_dirty_lock(bv.bv_page);
> +		unpin_user_page(bv.bv_page);
> +	}
> +}
> +
>   /**
>    * bio_integrity_free - Free bio integrity payload
>    * @bio:	bio containing bip to be freed
> @@ -105,6 +118,8 @@ void bio_integrity_free(struct bio *bio)
>   
>   	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
>   		kfree(bvec_virt(bip->bip_vec));
> +	else if (bip->bip_flags & BIP_INTEGRITY_USER)
> +		bio_integrity_unmap_user(bip);;
>   
>   	__bio_integrity_free(bs, bip);
>   	bio->bi_integrity = NULL;
> @@ -160,6 +175,58 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
>   }
>   EXPORT_SYMBOL(bio_integrity_add_page);
>   
> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed, u32 maxvecs)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
> +	struct page *stack_pages[UIO_FASTIOV];
> +	size_t offset = offset_in_page(ubuf);
> +	unsigned long ptr = (uintptr_t)ubuf;
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	int npages, ret, i;
> +
> +	if (bio_integrity(bio) || ptr & align || maxvecs > UIO_FASTIOV)
> +		return -EINVAL;
> +
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, maxvecs);
> +	if (IS_ERR(bip))
> +		return PTR_ERR(bip);
> +
> +	ret = pin_user_pages_fast(ptr, UIO_FASTIOV, FOLL_WRITE, pages);

Why not pass maxvecs here? If you pass UIO_FASTIOV, it will map those 
many pages here. And will result into a leak (missed unpin) eventually 
(see below).

> +	if (unlikely(ret < 0))
> +		goto free_bip;
> +
> +	npages = ret;
> +	for (i = 0; i < npages; i++) {
> +		u32 bytes = min_t(u32, len, PAGE_SIZE - offset);

Nit: bytes can be declared outside.

> +		ret = bio_integrity_add_page(bio, pages[i], bytes, offset);
> +		if (ret != bytes) {
> +			ret = -EINVAL;
> +			goto release_pages;
> +		}
> +		len -= ret;

Take the case of single '4KB + 8b' io.
This len will become 0 in the first iteration.
But the loop continues for UIO_FASTIOV iterations. It will add only one 
page into bio_integrity_add_page.

And that is what it will unpin during bio_integrity_unmap_user(). 
Remaining pages will continue to remain pinned.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] nvme: use bio_integrity_map_user
  2023-10-18 15:18 ` [PATCH 2/4] nvme: use bio_integrity_map_user Keith Busch
  2023-10-19  5:40   ` Christoph Hellwig
@ 2023-10-25 13:26   ` Kanchan Joshi
  1 sibling, 0 replies; 16+ messages in thread
From: Kanchan Joshi @ 2023-10-25 13:26 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, io-uring
  Cc: axboe, hch, martin.petersen, Keith Busch

On 10/18/2023 8:48 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Map user metadata buffers directly instead of maintaining a complicated
> copy buffer.
> 
> Now that the bio tracks the metadata through its bip, nvme doesn't need
> special metadata handling, callbacks, or additional fields in the pdu.
> This greatly simplifies passthrough handling and avoids a "might_fault"
> copy_to_user in the completion path. This also creates pdu space to
> track the original request separately from its bio, further simplifying
> polling without relying on special iouring fields.
> 
> The downside is that nvme requires the metadata buffer be physically
> contiguous, so user space will need to utilize huge pages if the buffer
> needs to span multiple pages.

I did not notice where this downside part is getting checked in the code.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] block: bio-integrity: add support for user buffers
  2023-10-25 12:51   ` Kanchan Joshi
@ 2023-10-25 14:42     ` Keith Busch
  0 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2023-10-25 14:42 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Keith Busch, linux-block, linux-nvme, io-uring, axboe, hch,
	martin.petersen

On Wed, Oct 25, 2023 at 06:21:55PM +0530, Kanchan Joshi wrote:
> On 10/18/2023 8:48 PM, Keith Busch wrote:
> >   }
> >   EXPORT_SYMBOL(bio_integrity_add_page);
> >   
> > +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> > +			   u32 seed, u32 maxvecs)
> > +{
> > +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> > +	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
> > +	struct page *stack_pages[UIO_FASTIOV];
> > +	size_t offset = offset_in_page(ubuf);
> > +	unsigned long ptr = (uintptr_t)ubuf;
> > +	struct page **pages = stack_pages;
> > +	struct bio_integrity_payload *bip;
> > +	int npages, ret, i;
> > +
> > +	if (bio_integrity(bio) || ptr & align || maxvecs > UIO_FASTIOV)
> > +		return -EINVAL;
> > +
> > +	bip = bio_integrity_alloc(bio, GFP_KERNEL, maxvecs);
> > +	if (IS_ERR(bip))
> > +		return PTR_ERR(bip);
> > +
> > +	ret = pin_user_pages_fast(ptr, UIO_FASTIOV, FOLL_WRITE, pages);
> 
> Why not pass maxvecs here? If you pass UIO_FASTIOV, it will map those 
> many pages here. And will result into a leak (missed unpin) eventually 
> (see below).

The 'maxvecs' is for the number of bvecs, and UIO_FASTIOV is for the
number of pages. A single bvec can contain multiple pages, so the idea
was to attempt merging if multiple pages were required.

This patch though didn't calculate the pages right. Next version I'm
working on uses iov_iter instead. V2 also retains a kernel copy
fallback.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-10-25 14:42 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-18 15:18 [PATCH 0/4] block integrity: direclty map user space addresses Keith Busch
2023-10-18 15:18 ` [PATCH 1/4] block: bio-integrity: add support for user buffers Keith Busch
2023-10-19  5:39   ` Christoph Hellwig
2023-10-21  3:53   ` kernel test robot
2023-10-21  4:13   ` kernel test robot
2023-10-25 12:51   ` Kanchan Joshi
2023-10-25 14:42     ` Keith Busch
2023-10-18 15:18 ` [PATCH 2/4] nvme: use bio_integrity_map_user Keith Busch
2023-10-19  5:40   ` Christoph Hellwig
2023-10-25 13:26   ` Kanchan Joshi
2023-10-18 15:18 ` [PATCH 3/4] iouring: remove IORING_URING_CMD_POLLED Keith Busch
2023-10-19  5:41   ` Christoph Hellwig
2023-10-19 14:43     ` Keith Busch
2023-10-23  6:18   ` Kanchan Joshi
2023-10-18 15:18 ` [PATCH 4/4] io_uring: remove uring_cmd cookie Keith Busch
2023-10-19  5:34 ` [PATCH 0/4] block integrity: direclty map user space addresses Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.