* [PATCHSET v3] Batched completions
@ 2021-10-17  2:06 Jens Axboe
  2021-10-17  2:06 ` [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll() Jens Axboe
                   ` (6 more replies)
  0 siblings, 7 replies; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block

Hi,

We now do decent batching of allocations for submit, but we still
complete requests individually. This costs a lot of CPU cycles.

This patchset adds support for collecting requests for completion,
and then completing them as a batch. This includes things like freeing
a batch of tags.
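
As a rough sketch of the intended usage (illustrative only - the example_*
names below are made-up stand-ins for driver specific code, the rest are
either existing block layer helpers or the ones added by this series):

static void example_complete_batch(struct io_comp_batch *iob)
{
	/* driver-private per-request cleanup would go here */
	blk_mq_end_request_batch(iob);
}

static int example_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct request *rq;
	int found = 0;

	while ((rq = example_reap_one(hctx)) != NULL) {
		found++;
		/* Queue for batched completion. Fall back to individual
		 * completion on error, or if no batch was passed in.
		 * Error handling is simplified here. */
		if (!blk_mq_add_to_batch(rq, iob, example_error(rq),
					 example_complete_batch))
			blk_mq_end_request(rq, example_error(rq) ?
					       BLK_STS_IOERR : BLK_STS_OK);
	}
	return found;
}

The caller, an IRQ handler or an ->iopoll() path, just puts a
DEFINE_IO_COMP_BATCH(iob) on the stack and, once the reaping loop is done,
calls iob.complete(&iob) if anything was queued up.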

This version is looking pretty good to me now, and should be ready
for 5.16.

Changes since v2:
- Get rid of dev_id
- Get rid of mq_ops->complete_batch
- Drop now unnecessary ib->complete setting in blk_poll()
- Drop one sbitmap patch that was questionable
- Rename io_batch to io_comp_batch
- Track need_timestamp on a per-iob basis instead of for each request
- Drop elevator support for batching, cleaner without
- Make the batched driver addition simpler
- Unify nvme polled/irq handling
- Drop io_uring file checking, no longer needed
- Cleanup io_uring completion side

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll()
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-18 10:16   ` Christoph Hellwig
  2021-10-17  2:06 ` [PATCH 2/6] sbitmap: add helper to clear a batch of tags Jens Axboe
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

struct io_comp_batch contains a list head and a completion handler, which
will allow batches of IO to be completed more efficiently.

For now there are no functional changes in this patch; we just define the
io_comp_batch structure and add the argument to the file_operations iopoll
handler.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c              |  9 +++++----
 block/blk-exec.c              |  2 +-
 block/blk-mq.c                |  9 +++++----
 block/blk-mq.h                |  3 ++-
 block/fops.c                  |  4 ++--
 drivers/block/null_blk/main.c |  2 +-
 drivers/block/rnbd/rnbd-clt.c |  2 +-
 drivers/nvme/host/pci.c       |  4 ++--
 drivers/nvme/host/rdma.c      |  2 +-
 drivers/nvme/host/tcp.c       |  2 +-
 drivers/scsi/scsi_lib.c       |  2 +-
 fs/io_uring.c                 |  2 +-
 fs/iomap/direct-io.c          |  2 +-
 include/linux/blk-mq.h        |  2 +-
 include/linux/blkdev.h        | 13 +++++++++++--
 include/linux/fs.h            |  3 ++-
 mm/page_io.c                  |  2 +-
 17 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fced71948162..2ae8bea80f2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1071,7 +1071,7 @@ EXPORT_SYMBOL(submit_bio);
  * Note: the caller must either be the context that submitted @bio, or
  * be in a RCU critical section to prevent freeing of @bio.
  */
-int bio_poll(struct bio *bio, unsigned int flags)
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
 {
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
@@ -1089,7 +1089,7 @@ int bio_poll(struct bio *bio, unsigned int flags)
 	if (WARN_ON_ONCE(!queue_is_mq(q)))
 		ret = 0;	/* not yet implemented, should not happen */
 	else
-		ret = blk_mq_poll(q, cookie, flags);
+		ret = blk_mq_poll(q, cookie, iob, flags);
 	blk_queue_exit(q);
 	return ret;
 }
@@ -1099,7 +1099,8 @@ EXPORT_SYMBOL_GPL(bio_poll);
  * Helper to implement file_operations.iopoll.  Requires the bio to be stored
  * in iocb->private, and cleared before freeing the bio.
  */
-int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags)
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+		    unsigned int flags)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -1127,7 +1128,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags)
 	rcu_read_lock();
 	bio = READ_ONCE(kiocb->private);
 	if (bio && bio->bi_bdev)
-		ret = bio_poll(bio, flags);
+		ret = bio_poll(bio, iob, flags);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 55f0cd34b37b..1b8b47f6e79b 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -77,7 +77,7 @@ static bool blk_rq_is_poll(struct request *rq)
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
 	do {
-		bio_poll(rq->bio, 0);
+		bio_poll(rq->bio, NULL, 0);
 		cond_resched();
 	} while (!completion_done(wait));
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 22b30a89bf3a..8eb80e70e8ea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4196,14 +4196,14 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
 }
 
 static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-		unsigned int flags)
+			       struct io_comp_batch *iob, unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
 	long state = get_current_state();
 	int ret;
 
 	do {
-		ret = q->mq_ops->poll(hctx);
+		ret = q->mq_ops->poll(hctx, iob);
 		if (ret > 0) {
 			__set_current_state(TASK_RUNNING);
 			return ret;
@@ -4223,14 +4223,15 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags)
 {
 	if (!(flags & BLK_POLL_NOSLEEP) &&
 	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
 		if (blk_mq_poll_hybrid(q, cookie))
 			return 1;
 	}
-	return blk_mq_poll_classic(q, cookie, flags);
+	return blk_mq_poll_classic(q, cookie, iob, flags);
 }
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index c12554c58abd..d8ccb341e82e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,7 +31,8 @@ struct blk_mq_ctx {
 } ____cacheline_aligned_in_smp;
 
 void blk_mq_submit_bio(struct bio *bio);
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+		unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
diff --git a/block/fops.c b/block/fops.c
index 7eebd590342b..21d25ee0e4bf 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -105,7 +105,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!READ_ONCE(bio.bi_private))
 			break;
-		if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0))
+		if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -291,7 +291,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		if (!READ_ONCE(dio->waiter))
 			break;
 
-		if (!do_poll || !bio_poll(bio, 0))
+		if (!do_poll || !bio_poll(bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 7ce911e2289d..f4af95c2f9a9 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1494,7 +1494,7 @@ static int null_map_queues(struct blk_mq_tag_set *set)
 	return 0;
 }
 
-static int null_poll(struct blk_mq_hw_ctx *hctx)
+static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct nullb_queue *nq = hctx->driver_data;
 	LIST_HEAD(list);
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index bd4a41afbbfc..0ec0191d4196 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct rnbd_queue *q = hctx->driver_data;
 	struct rnbd_clt_dev *dev = q->dev;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0dd4b44b59cd..d1ab9250101a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1092,7 +1092,7 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	bool found;
@@ -1274,7 +1274,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * Did we miss an interrupt?
 	 */
 	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
-		nvme_poll(req->mq_hctx);
+		nvme_poll(req->mq_hctx, NULL);
 	else
 		nvme_poll_irqdisable(nvmeq);
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 40317e1b9183..1624da3702d4 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2106,7 +2106,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct nvme_rdma_queue *queue = hctx->driver_data;
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3c1c29dd3020..9ce3458ee1dd 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2429,7 +2429,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 	return 0;
 }
 
-static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct nvme_tcp_queue *queue = hctx->driver_data;
 	struct sock *sk = queue->sock->sk;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 33fd9a01330c..30f7d0b4eb73 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1784,7 +1784,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
 }
 
 
-static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct Scsi_Host *shost = hctx->driver_data;
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0c59ded790b3..fed8ec12a36d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2450,7 +2450,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		ret = kiocb->ki_filp->f_op->iopoll(kiocb, poll_flags);
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags);
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 8efab177011d..83ecfba53abe 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -630,7 +630,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 				break;
 
 			if (!dio->submit.poll_bio ||
-			    !bio_poll(dio->submit.poll_bio, 0))
+			    !bio_poll(dio->submit.poll_bio, NULL, 0))
 				blk_io_schedule();
 		}
 		__set_current_state(TASK_RUNNING);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9fc868abcc81..938ca6e86556 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -532,7 +532,7 @@ struct blk_mq_ops {
 	/**
 	 * @poll: Called to poll for completion of a specific tag.
 	 */
-	int (*poll)(struct blk_mq_hw_ctx *);
+	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
 
 	/**
 	 * @complete: Mark the request as complete.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c6a6402cb1a1..7832efd198a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -569,8 +569,9 @@ blk_status_t errno_to_blk_status(int errno);
 #define BLK_POLL_ONESHOT		(1 << 0)
 /* do not sleep to wait for the expected completion time */
 #define BLK_POLL_NOSLEEP		(1 << 1)
-int bio_poll(struct bio *bio, unsigned int flags);
-int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags);
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+			unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
@@ -1298,6 +1299,14 @@ int fsync_bdev(struct block_device *bdev);
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
 
+struct io_comp_batch {
+	struct request *req_list;
+	bool need_ts;
+	void (*complete)(struct io_comp_batch *);
+};
+
+#define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
+
 #define rq_list_add(listptr, rq)	do {		\
 	(rq)->rq_next = *(listptr);			\
 	*(listptr) = rq;				\
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f98a361a6752..c6b61c124715 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -48,6 +48,7 @@
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
+struct io_comp_batch;
 struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
@@ -2075,7 +2076,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-	int (*iopoll)(struct kiocb *kiocb, unsigned int flags);
+	int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
diff --git a/mm/page_io.c b/mm/page_io.c
index a68faab5b310..6010fb07f231 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -424,7 +424,7 @@ int swap_readpage(struct page *page, bool synchronous)
 		if (!READ_ONCE(bio->bi_private))
 			break;
 
-		if (!bio_poll(bio, 0))
+		if (!bio_poll(bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/6] sbitmap: add helper to clear a batch of tags
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
  2021-10-17  2:06 ` [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll() Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-17  2:06 ` [PATCH 3/6] block: add support for blk_mq_end_request_batch() Jens Axboe
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

sbitmap currently only supports clearing tags one-by-one; add a helper
that allows the caller to pass in an array of tags to clear.
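
For illustration, a caller that has gathered freed tags into an array would
use it along these lines (sketch only; 'sbq' is assumed to be an already
set up struct sbitmap_queue and the tag values are made up):

static void example_free_tags(struct sbitmap_queue *sbq)
{
	int tags[4] = { 3, 4, 9, 10 };

	/*
	 * Clears all four bits with at most a couple of atomic ops instead
	 * of four individual sbitmap_queue_clear() calls. The offset is
	 * whatever was added when the tags were handed out (the reserved
	 * tag count in the blk-mq case), zero here.
	 */
	sbitmap_queue_clear_batch(sbq, 0, tags, 4);
}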

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 lib/sbitmap.c           | 44 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e30b56023ead..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags);
+
 static inline int sbq_index_inc(int index)
 {
 	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f398e0ae548e..c6e2f1f2c4d2 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -628,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+	if (likely(!sb->round_robin && tag < sb->depth))
+		*per_cpu_ptr(sb->alloc_hint, cpu) = tag;
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags)
+{
+	struct sbitmap *sb = &sbq->sb;
+	unsigned long *addr = NULL;
+	unsigned long mask = 0;
+	int i;
+
+	smp_mb__before_atomic();
+	for (i = 0; i < nr_tags; i++) {
+		const int tag = tags[i] - offset;
+		unsigned long *this_addr;
+
+		/* since we're clearing a batch, skip the deferred map */
+		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+		if (!addr) {
+			addr = this_addr;
+		} else if (addr != this_addr) {
+			atomic_long_andnot(mask, (atomic_long_t *) addr);
+			mask = 0;
+			addr = this_addr;
+		}
+		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+	}
+
+	if (mask)
+		atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+					tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
@@ -652,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	 */
 	smp_mb__after_atomic();
 	sbitmap_queue_wake_up(sbq);
-
-	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-		*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+	sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
 
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/6] block: add support for blk_mq_end_request_batch()
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
  2021-10-17  2:06 ` [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll() Jens Axboe
  2021-10-17  2:06 ` [PATCH 2/6] sbitmap: add helper to clear a batch of tags Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-18 10:18   ` Christoph Hellwig
  2021-10-17  2:06 ` [PATCH 4/6] nvme: add support for batched completion of polled IO Jens Axboe
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Instead of calling blk_mq_end_request() on a single request, add a helper
that takes the new struct io_comp_batch and completes any requests stored
in it.
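
The expected consumer looks roughly like the below (sketch;
example_unmap_request() stands in for whatever per-request work the driver
still has to do itself):

static void example_complete_batch(struct io_comp_batch *iob)
{
	struct request *req;

	/* driver specific per-request teardown first... */
	rq_list_for_each(&iob->req_list, req)
		example_unmap_request(req);

	/*
	 * ...then let blk-mq finish the whole batch: accounting, freeing
	 * the tags in one sbitmap operation, and dropping the queue
	 * references in one go.
	 */
	blk_mq_end_request_batch(iob);
}

This is essentially the shape the nvme conversion later in the series takes.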

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c     |  6 ++++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 81 ++++++++++++++++++++++++++++++++----------
 include/linux/blk-mq.h | 29 +++++++++++++++
 4 files changed, 98 insertions(+), 19 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c43b97201161..b94c3e8ef392 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -207,6 +207,12 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 	}
 }
 
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
+{
+	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+					tag_array, nr_tags);
+}
+
 struct bt_iter_data {
 	struct blk_mq_hw_ctx *hctx;
 	busy_iter_fn *fn;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index e617c7220626..df787b5a23bd 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -19,6 +19,7 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
 			      unsigned int *offset);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 			   unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_tags **tags,
 					unsigned int depth, bool can_grow);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8eb80e70e8ea..58dc0c0c24ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -292,15 +292,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
 			blk_mq_tag_wakeup_all(hctx->tags, true);
 }
 
-/*
- * Only need start/end time stamping if we have iostat or
- * blk stats enabled, or using an IO scheduler.
- */
-static inline bool blk_mq_need_time_stamp(struct request *rq)
-{
-	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
-}
-
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		unsigned int tag, u64 alloc_time_ns)
 {
@@ -754,19 +745,21 @@ bool blk_update_request(struct request *req, blk_status_t error,
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
 {
-	if (blk_mq_need_time_stamp(rq)) {
-		u64 now = ktime_get_ns();
+	if (rq->rq_flags & RQF_STATS) {
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq, now);
+	}
 
-		if (rq->rq_flags & RQF_STATS) {
-			blk_mq_poll_stats_start(rq->q);
-			blk_stat_add(rq, now);
-		}
+	blk_mq_sched_completed_request(rq, now);
+	blk_account_io_done(rq, now);
+}
 
-		blk_mq_sched_completed_request(rq, now);
-		blk_account_io_done(rq, now);
-	}
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+	if (blk_mq_need_time_stamp(rq))
+		__blk_mq_end_request_acct(rq, ktime_get_ns());
 
 	if (rq->end_io) {
 		rq_qos_done(rq->q, rq);
@@ -785,6 +778,56 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
+#define TAG_COMP_BATCH		32
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+					  int *tag_array, int nr_tags)
+{
+	struct request_queue *q = hctx->queue;
+
+	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
+	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
+}
+
+void blk_mq_end_request_batch(struct io_comp_batch *iob)
+{
+	int tags[TAG_COMP_BATCH], nr_tags = 0;
+	struct blk_mq_hw_ctx *last_hctx = NULL;
+	struct request *rq;
+	u64 now = 0;
+
+	if (iob->need_ts)
+		now = ktime_get_ns();
+
+	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
+		prefetch(rq->bio);
+		prefetch(rq->rq_next);
+
+		blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
+		__blk_mq_end_request_acct(rq, now);
+
+		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+		if (!refcount_dec_and_test(&rq->ref))
+			continue;
+
+		blk_crypto_free_request(rq);
+		blk_pm_mark_last_busy(rq);
+		rq_qos_done(rq->q, rq);
+
+		if (nr_tags == TAG_COMP_BATCH ||
+		    (last_hctx && last_hctx != rq->mq_hctx)) {
+			blk_mq_flush_tag_batch(last_hctx, tags, nr_tags);
+			nr_tags = 0;
+		}
+		tags[nr_tags++] = rq->tag;
+		last_hctx = rq->mq_hctx;
+	}
+
+	if (nr_tags)
+		blk_mq_flush_tag_batch(last_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
+
 static void blk_complete_reqs(struct llist_head *list)
 {
 	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 938ca6e86556..4dbf7948b0e4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -761,6 +761,35 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_comp_batch *ib);
+
+/*
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+}
+
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+				       struct io_comp_batch *iob, int ioerror,
+				       void (*complete)(struct io_comp_batch *))
+{
+	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+		return false;
+	if (!iob->complete)
+		iob->complete = complete;
+	else if (iob->complete != complete)
+		return false;
+	iob->need_ts |= blk_mq_need_time_stamp(req);
+	rq_list_add(&iob->req_list, req);
+	return true;
+}
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/6] nvme: add support for batched completion of polled IO
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
                   ` (2 preceding siblings ...)
  2021-10-17  2:06 ` [PATCH 3/6] block: add support for blk_mq_end_request_batch() Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-18 10:20   ` Christoph Hellwig
  2021-10-17  2:06 ` [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient " Jens Axboe
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Take advantage of struct io_comp_batch, if passed in to the nvme poll
handler. If it's set, rather than completing each request individually
inline, store them in the io_comp_batch list. We only do so for requests
that will complete successfully; anything else will be completed inline as
before.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 17 ++++++++++++++---
 drivers/nvme/host/nvme.h | 14 ++++++++++++++
 drivers/nvme/host/pci.c  | 31 +++++++++++++++++++++++++------
 3 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..4eadecc67c91 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,13 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+	nvme_cleanup_cmd(req);
+	nvme_end_req_zoned(req);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..ef2467b93adb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
+						void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	rq_list_for_each(&iob->req_list, req) {
+		fn(req);
+		nvme_complete_batch_req(req);
+	}
+	blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d1ab9250101a..e916d5e167c1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_comp_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_comp_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,7 +1034,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
+	    !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
+					nvme_pci_complete_batch))
 		nvme_pci_complete_rq(req);
 }
 
@@ -1039,7 +1052,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
 	}
 }
 
-static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
+			       struct io_comp_batch *iob)
 {
 	int found = 0;
 
@@ -1050,7 +1064,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1059,6 +1073,11 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 	return found;
 }
 
+static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	return nvme_poll_cq(nvmeq, NULL);
+ }
+
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	struct nvme_queue *nvmeq = data;
@@ -1101,7 +1120,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient polled IO
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
                   ` (3 preceding siblings ...)
  2021-10-17  2:06 ` [PATCH 4/6] nvme: add support for batched completion of polled IO Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-18 10:20   ` Christoph Hellwig
  2021-10-17  2:06 ` [PATCH 6/6] nvme: wire up completion batching for the IRQ path Jens Axboe
  2021-10-20 11:14 ` [PATCHSET v3] Batched completions John Garry
  6 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Wire up using an io_comp_batch for f_op->iopoll(). If the lower stack
supports it, we can handle high rates of polled IO more efficiently.

This raises the single core efficiency on my system from ~6.1M IOPS to
~6.6M IOPS running a random read workload at depth 128 on two gen2
Optane drives.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fed8ec12a36d..57fb6f13b50b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2428,6 +2428,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
 	unsigned int poll_flags = BLK_POLL_NOSLEEP;
+	DEFINE_IO_COMP_BATCH(iob);
 	int nr_events = 0;
 
 	/*
@@ -2450,18 +2451,21 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags);
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
 			poll_flags |= BLK_POLL_ONESHOT;
 
 		/* iopoll may have completed current req */
-		if (READ_ONCE(req->iopoll_completed))
+		if (!rq_list_empty(iob.req_list) ||
+		    READ_ONCE(req->iopoll_completed))
 			break;
 	}
 
-	if (!pos)
+	if (iob.req_list)
+		iob.complete(&iob);
+	else if (!pos)
 		return 0;
 
 	prev = start;
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/6] nvme: wire up completion batching for the IRQ path
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
                   ` (4 preceding siblings ...)
  2021-10-17  2:06 ` [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient " Jens Axboe
@ 2021-10-17  2:06 ` Jens Axboe
  2021-10-18 10:22   ` Christoph Hellwig
  2021-10-20 11:14 ` [PATCHSET v3] Batched completions John Garry
  6 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-17  2:06 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Trivial to do now: we just need our own io_comp_batch on the stack and can
pass that in to the usual command completion handling.

I pondered making this dependent on how many entries we had to process,
but even for a single entry there's no discernible difference in
performance or latency. Running a sync workload over io_uring:

t/io_uring -b512 -d1 -s1 -c1 -p0 -F1 -B1 -n2 /dev/nvme1n1 /dev/nvme2n1

yields the below performance before the patch:

IOPS=254820, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251174, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=250806, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)

and the following after:

IOPS=255972, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251920, BW=123MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251794, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)

which definitely isn't slower, and is about the same if you factor in a
bit of variance. For peak performance workloads, benchmarking shows a 2%
improvement.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e916d5e167c1..fdb0716614c9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1075,7 +1075,13 @@ static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
 
 static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 {
-	return nvme_poll_cq(nvmeq, NULL);
+	DEFINE_IO_COMP_BATCH(iob);
+	int found;
+
+	found = nvme_poll_cq(nvmeq, &iob);
+	if (iob.req_list)
+		nvme_pci_complete_batch(&iob);
+	return found;
  }
 
 static irqreturn_t nvme_irq(int irq, void *data)
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll()
  2021-10-17  2:06 ` [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll() Jens Axboe
@ 2021-10-18 10:16   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 10:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

> -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags)
> +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,

> +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
> +		unsigned int flags);

> +	int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags);

Please avoid overly long lines like this.

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] block: add support for blk_mq_end_request_batch()
  2021-10-17  2:06 ` [PATCH 3/6] block: add support for blk_mq_end_request_batch() Jens Axboe
@ 2021-10-18 10:18   ` Christoph Hellwig
  2021-10-18 13:40     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 10:18 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Sat, Oct 16, 2021 at 08:06:20PM -0600, Jens Axboe wrote:
> Instead of calling blk_mq_end_request() on a single request, add a helper
> that takes the new struct io_comp_batch and completes any request stored
> in there.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq-tag.c     |  6 ++++
>  block/blk-mq-tag.h     |  1 +
>  block/blk-mq.c         | 81 ++++++++++++++++++++++++++++++++----------
>  include/linux/blk-mq.h | 29 +++++++++++++++
>  4 files changed, 98 insertions(+), 19 deletions(-)
> 
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index c43b97201161..b94c3e8ef392 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -207,6 +207,12 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
>  	}
>  }
>  
> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
> +{
> +	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
> +					tag_array, nr_tags);
> +}
> +
>  struct bt_iter_data {
>  	struct blk_mq_hw_ctx *hctx;
>  	busy_iter_fn *fn;
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index e617c7220626..df787b5a23bd 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -19,6 +19,7 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
>  			      unsigned int *offset);
>  extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
>  			   unsigned int tag);
> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
>  extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>  					struct blk_mq_tags **tags,
>  					unsigned int depth, bool can_grow);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8eb80e70e8ea..58dc0c0c24ac 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -292,15 +292,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
>  			blk_mq_tag_wakeup_all(hctx->tags, true);
>  }
>  
> -/*
> - * Only need start/end time stamping if we have iostat or
> - * blk stats enabled, or using an IO scheduler.
> - */
> -static inline bool blk_mq_need_time_stamp(struct request *rq)
> -{
> -	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
> -}
> -
>  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>  		unsigned int tag, u64 alloc_time_ns)
>  {
> @@ -754,19 +745,21 @@ bool blk_update_request(struct request *req, blk_status_t error,
>  }
>  EXPORT_SYMBOL_GPL(blk_update_request);
>  
> -inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
> +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
>  {
> -	if (blk_mq_need_time_stamp(rq)) {
> -		u64 now = ktime_get_ns();
> +	if (rq->rq_flags & RQF_STATS) {
> +		blk_mq_poll_stats_start(rq->q);
> +		blk_stat_add(rq, now);
> +	}
>  
> +	blk_mq_sched_completed_request(rq, now);
> +	blk_account_io_done(rq, now);
> +}
>  
> -		blk_mq_sched_completed_request(rq, now);
> -		blk_account_io_done(rq, now);
> -	}
> +inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
> +{
> +	if (blk_mq_need_time_stamp(rq))
> +		__blk_mq_end_request_acct(rq, ktime_get_ns());
>  
>  	if (rq->end_io) {
>  		rq_qos_done(rq->q, rq);
> @@ -785,6 +778,56 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
>  }
>  EXPORT_SYMBOL(blk_mq_end_request);
>  
> +#define TAG_COMP_BATCH		32
> +
> +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
> +					  int *tag_array, int nr_tags)
> +{
> +	struct request_queue *q = hctx->queue;
> +
> +	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
> +	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
> +}
> +
> +void blk_mq_end_request_batch(struct io_comp_batch *iob)
> +{
> +	int tags[TAG_COMP_BATCH], nr_tags = 0;
> +	struct blk_mq_hw_ctx *last_hctx = NULL;
> +	struct request *rq;
> +	u64 now = 0;
> +
> +	if (iob->need_ts)
> +		now = ktime_get_ns();
> +
> +	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
> +		prefetch(rq->bio);
> +		prefetch(rq->rq_next);
> +
> +		blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
> +		__blk_mq_end_request_acct(rq, now);

If iob->need_ts is not set we don't need to call
__blk_mq_end_request_acct, do we?
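
For reference, that would amount to something like the following, purely as
an illustration of the suggestion:

		if (iob->need_ts)
			__blk_mq_end_request_acct(rq, now);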

Otherwise looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/6] nvme: add support for batched completion of polled IO
  2021-10-17  2:06 ` [PATCH 4/6] nvme: add support for batched completion of polled IO Jens Axboe
@ 2021-10-18 10:20   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 10:20 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Sat, Oct 16, 2021 at 08:06:21PM -0600, Jens Axboe wrote:
> Take advantage of struct io_comp_batch, if passed in to the nvme poll
> handler. If it's set, rather than complete each request individually
> inline, store them in the io_comp_batch list. We only do so for requests
> that will complete successfully, anything else will be completed inline as
> before.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/core.c | 17 ++++++++++++++---
>  drivers/nvme/host/nvme.h | 14 ++++++++++++++
>  drivers/nvme/host/pci.c  | 31 +++++++++++++++++++++++++------
>  3 files changed, 53 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index c2c2e8545292..4eadecc67c91 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>  	return RETRY;
>  }
>  
> -static inline void nvme_end_req(struct request *req)
> +static inline void nvme_end_req_zoned(struct request *req)
>  {
> -	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> -
>  	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>  	    req_op(req) == REQ_OP_ZONE_APPEND)
>  		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>  			le64_to_cpu(nvme_req(req)->result.u64));
> +}
> +
> +static inline void nvme_end_req(struct request *req)
> +{
> +	blk_status_t status = nvme_error_status(nvme_req(req)->status);
>  
> +	nvme_end_req_zoned(req);
>  	nvme_trace_bio_complete(req);
>  	blk_mq_end_request(req, status);
>  }
> @@ -381,6 +385,13 @@ void nvme_complete_rq(struct request *req)
>  }
>  EXPORT_SYMBOL_GPL(nvme_complete_rq);
>  
> +void nvme_complete_batch_req(struct request *req)
> +{
> +	nvme_cleanup_cmd(req);
> +	nvme_end_req_zoned(req);
> +}
> +EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
> +
>  /*
>   * Called to unwind from ->queue_rq on a failed command submission so that the
>   * multipathing code gets called to potentially failover to another path.
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index ed79a6c7e804..ef2467b93adb 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -638,6 +638,20 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
>  }
>  
>  void nvme_complete_rq(struct request *req);
> +void nvme_complete_batch_req(struct request *req);
> +
> +static __always_inline void nvme_complete_batch(struct io_comp_batch *iob,
> +						void (*fn)(struct request *rq))
> +{
> +	struct request *req;
> +
> +	rq_list_for_each(&iob->req_list, req) {
> +		fn(req);
> +		nvme_complete_batch_req(req);
> +	}
> +	blk_mq_end_request_batch(iob);
> +}
> +
>  blk_status_t nvme_host_path_error(struct request *req);
>  bool nvme_cancel_request(struct request *req, void *data, bool reserved);
>  void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d1ab9250101a..e916d5e167c1 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>  
> -static void nvme_pci_complete_rq(struct request *req)
> +static __always_inline void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
>  			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>  	if (blk_rq_nr_phys_segments(req))
>  		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>  	nvme_complete_rq(req);
>  }
>  
> +static void nvme_pci_complete_batch(struct io_comp_batch *iob)
> +{
> +	nvme_complete_batch(iob, nvme_pci_unmap_rq);
> +}
> +
>  /* We read the CQE phase first to check if the rest of the entry is valid */
>  static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
>  {
> @@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
>  	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
>  }
>  
> -static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
> +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
> +				   struct io_comp_batch *iob, u16 idx)
>  {
>  	struct nvme_completion *cqe = &nvmeq->cqes[idx];
>  	__u16 command_id = READ_ONCE(cqe->command_id);
> @@ -1023,7 +1034,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
>  	}
>  
>  	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
> -	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
> +	    !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
> +					nvme_pci_complete_batch))
>  		nvme_pci_complete_rq(req);
>  }
>  
> @@ -1039,7 +1052,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
>  	}
>  }
>  
> -static inline int nvme_process_cq(struct nvme_queue *nvmeq)
> +static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
> +			       struct io_comp_batch *iob)
>  {
>  	int found = 0;
>  
> @@ -1050,7 +1064,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>  		 * the cqe requires a full read memory barrier
>  		 */
>  		dma_rmb();
> -		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
> +		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
>  		nvme_update_cq_head(nvmeq);
>  	}
>  
> @@ -1059,6 +1073,11 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>  	return found;
>  }
>  
> +static inline int nvme_process_cq(struct nvme_queue *nvmeq)
> +{
> +	return nvme_poll_cq(nvmeq, NULL);
> + }

Extra whitespace here.  But I'd prefer to just drop this wrapping,
and just add the io_comp_batch argument to nvme_process_cq.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient polled IO
  2021-10-17  2:06 ` [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient " Jens Axboe
@ 2021-10-18 10:20   ` Christoph Hellwig
  0 siblings, 0 replies; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 10:20 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Sat, Oct 16, 2021 at 08:06:22PM -0600, Jens Axboe wrote:
> Wire up using an io_comp_batch for f_op->iopoll(). If the lower stack
> supports it, we can handle high rates of polled IO more efficiently.
> 
> This raises the single core efficiency on my system from ~6.1M IOPS to
> ~6.6M IOPS running a random read workload at depth 128 on two gen2
> Optane drives.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/6] nvme: wire up completion batching for the IRQ path
  2021-10-17  2:06 ` [PATCH 6/6] nvme: wire up completion batching for the IRQ path Jens Axboe
@ 2021-10-18 10:22   ` Christoph Hellwig
  2021-10-18 13:41     ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 10:22 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Sat, Oct 16, 2021 at 08:06:23PM -0600, Jens Axboe wrote:
>  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>  {
> -	return nvme_poll_cq(nvmeq, NULL);
> +	DEFINE_IO_COMP_BATCH(iob);
> +	int found;
> +
> +	found = nvme_poll_cq(nvmeq, &iob);
> +	if (iob.req_list)
> +		nvme_pci_complete_batch(&iob);
> +	return found;

Ok, here the split makes sense.  That being said I'd rather only add
what is nvme_poll_cq as a separate function here, and I'd probably
name it __nvme_process_cq as the poll name could create some confusion.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/6] block: add support for blk_mq_end_request_batch()
  2021-10-18 10:18   ` Christoph Hellwig
@ 2021-10-18 13:40     ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-10-18 13:40 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/18/21 4:18 AM, Christoph Hellwig wrote:
> On Sat, Oct 16, 2021 at 08:06:20PM -0600, Jens Axboe wrote:
>> Instead of calling blk_mq_end_request() on a single request, add a helper
>> that takes the new struct io_comp_batch and completes any request stored
>> in there.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  block/blk-mq-tag.c     |  6 ++++
>>  block/blk-mq-tag.h     |  1 +
>>  block/blk-mq.c         | 81 ++++++++++++++++++++++++++++++++----------
>>  include/linux/blk-mq.h | 29 +++++++++++++++
>>  4 files changed, 98 insertions(+), 19 deletions(-)
>>
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index c43b97201161..b94c3e8ef392 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -207,6 +207,12 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
>>  	}
>>  }
>>  
>> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
>> +{
>> +	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
>> +					tag_array, nr_tags);
>> +}
>> +
>>  struct bt_iter_data {
>>  	struct blk_mq_hw_ctx *hctx;
>>  	busy_iter_fn *fn;
>> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
>> index e617c7220626..df787b5a23bd 100644
>> --- a/block/blk-mq-tag.h
>> +++ b/block/blk-mq-tag.h
>> @@ -19,6 +19,7 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
>>  			      unsigned int *offset);
>>  extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
>>  			   unsigned int tag);
>> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
>>  extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>>  					struct blk_mq_tags **tags,
>>  					unsigned int depth, bool can_grow);
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 8eb80e70e8ea..58dc0c0c24ac 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -292,15 +292,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
>>  			blk_mq_tag_wakeup_all(hctx->tags, true);
>>  }
>>  
>> -/*
>> - * Only need start/end time stamping if we have iostat or
>> - * blk stats enabled, or using an IO scheduler.
>> - */
>> -static inline bool blk_mq_need_time_stamp(struct request *rq)
>> -{
>> -	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
>> -}
>> -
>>  static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
>>  		unsigned int tag, u64 alloc_time_ns)
>>  {
>> @@ -754,19 +745,21 @@ bool blk_update_request(struct request *req, blk_status_t error,
>>  }
>>  EXPORT_SYMBOL_GPL(blk_update_request);
>>  
>> -inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>> +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
>>  {
>> -	if (blk_mq_need_time_stamp(rq)) {
>> -		u64 now = ktime_get_ns();
>> +	if (rq->rq_flags & RQF_STATS) {
>> +		blk_mq_poll_stats_start(rq->q);
>> +		blk_stat_add(rq, now);
>> +	}
>>  
>> +	blk_mq_sched_completed_request(rq, now);
>> +	blk_account_io_done(rq, now);
>> +}
>>  
>> -		blk_mq_sched_completed_request(rq, now);
>> -		blk_account_io_done(rq, now);
>> -	}
>> +inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
>> +{
>> +	if (blk_mq_need_time_stamp(rq))
>> +		__blk_mq_end_request_acct(rq, ktime_get_ns());
>>  
>>  	if (rq->end_io) {
>>  		rq_qos_done(rq->q, rq);
>> @@ -785,6 +778,56 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
>>  }
>>  EXPORT_SYMBOL(blk_mq_end_request);
>>  
>> +#define TAG_COMP_BATCH		32
>> +
>> +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
>> +					  int *tag_array, int nr_tags)
>> +{
>> +	struct request_queue *q = hctx->queue;
>> +
>> +	blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
>> +	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
>> +}
>> +
>> +void blk_mq_end_request_batch(struct io_comp_batch *iob)
>> +{
>> +	int tags[TAG_COMP_BATCH], nr_tags = 0;
>> +	struct blk_mq_hw_ctx *last_hctx = NULL;
>> +	struct request *rq;
>> +	u64 now = 0;
>> +
>> +	if (iob->need_ts)
>> +		now = ktime_get_ns();
>> +
>> +	while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
>> +		prefetch(rq->bio);
>> +		prefetch(rq->rq_next);
>> +
>> +		blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq));
>> +		__blk_mq_end_request_acct(rq, now);
> 
> If iob->need_ts is not set we don't need to call
> __blk_mq_end_request_acct, do we?

We don't strictly need to, I'll make that change.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/6] nvme: wire up completion batching for the IRQ path
  2021-10-18 10:22   ` Christoph Hellwig
@ 2021-10-18 13:41     ` Jens Axboe
  2021-10-18 15:14       ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Jens Axboe @ 2021-10-18 13:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/18/21 4:22 AM, Christoph Hellwig wrote:
> On Sat, Oct 16, 2021 at 08:06:23PM -0600, Jens Axboe wrote:
>>  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>>  {
>> -	return nvme_poll_cq(nvmeq, NULL);
>> +	DEFINE_IO_COMP_BATCH(iob);
>> +	int found;
>> +
>> +	found = nvme_poll_cq(nvmeq, &iob);
>> +	if (iob.req_list)
>> +		nvme_pci_complete_batch(&iob);
>> +	return found;
> 
> Ok, here the split makes sense.  That being said I'd rather only add
> what is nvme_poll_cq as a separate function here, and I'd probably
> name it __nvme_process_cq as the poll name could create some confusion.

Sure, I can shuffle that around. Can I add your reviewed/acked by with
that for those two, or do you want the series resent?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/6] nvme: wire up completion batching for the IRQ path
  2021-10-18 13:41     ` Jens Axboe
@ 2021-10-18 15:14       ` Christoph Hellwig
  2021-10-18 15:25         ` Jens Axboe
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2021-10-18 15:14 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, linux-block

On Mon, Oct 18, 2021 at 07:41:30AM -0600, Jens Axboe wrote:
> > Ok, here the split makes sense.  That being said I'd rather only add
> > what is nvme_poll_cq as a separate function here, and I'd probably
> > name it __nvme_process_cq as the poll name could create some confusion.
> 
> Sure, I can shuffle that around. Can I add your reviewed/acked by with
> that for those two, or do you want the series resent?

I'm fine with that change included:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/6] nvme: wire up completion batching for the IRQ path
  2021-10-18 15:14       ` Christoph Hellwig
@ 2021-10-18 15:25         ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-10-18 15:25 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/18/21 9:14 AM, Christoph Hellwig wrote:
> On Mon, Oct 18, 2021 at 07:41:30AM -0600, Jens Axboe wrote:
>>> Ok, here the split makes sense.  That being said I'd rather only add
>>> what is nvme_poll_cq as a separate function here, and I'd probably
>>> name it __nvme_process_cq as the poll name could create some confusion.
>>
>> Sure, I can shuffle that around. Can I add your reviewed/acked by with
>> that for those two, or do you want the series resent?
> 
> I'm fine with that change included:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Great, thanks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET v3] Batched completions
  2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
                   ` (5 preceding siblings ...)
  2021-10-17  2:06 ` [PATCH 6/6] nvme: wire up completion batching for the IRQ path Jens Axboe
@ 2021-10-20 11:14 ` John Garry
  2021-10-20 14:02   ` Jens Axboe
  6 siblings, 1 reply; 18+ messages in thread
From: John Garry @ 2021-10-20 11:14 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: linux-scsi

On 17/10/2021 03:06, Jens Axboe wrote:
> Hi,

+linux-scsi

> 
> We now do decent batching of allocations for submit, but we still
> complete requests individually. This costs a lot of CPU cycles.
> 
> This patchset adds support for collecting requests for completion,
> and then completing them as a batch. This includes things like freeing
> a batch of tags.
> 
> This version is looking pretty good to me now, and should be ready
> for 5.16.

Just wondering if anyone was looking at supporting this for the SCSI
midlayer? I was thinking about looking at it...

> 
> Changes since v2:
> - Get rid of dev_id
> - Get rid of mq_ops->complete_batch
> - Drop now unnecessary ib->complete setting in blk_poll()
> - Drop one sbitmap patch that was questionnable
> - Rename io_batch to io_comp_batch
> - Track need_timestamp on per-iob basis instead of for each request
> - Drop elevator support for batching, cleaner without
> - Make the batched driver addition simpler
> - Unify nvme polled/irq handling
> - Drop io_uring file checking, no longer neededd
> - Cleanup io_uring completion side
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCHSET v3] Batched completions
  2021-10-20 11:14 ` [PATCHSET v3] Batched completions John Garry
@ 2021-10-20 14:02   ` Jens Axboe
  0 siblings, 0 replies; 18+ messages in thread
From: Jens Axboe @ 2021-10-20 14:02 UTC (permalink / raw)
  To: John Garry, linux-block; +Cc: linux-scsi

On 10/20/21 5:14 AM, John Garry wrote:
> On 17/10/2021 03:06, Jens Axboe wrote:
>> Hi,
> 
> +linux-scsi
> 
>>
>> We now do decent batching of allocations for submit, but we still
>> complete requests individually. This costs a lot of CPU cycles.
>>
>> This patchset adds support for collecting requests for completion,
>> and then completing them as a batch. This includes things like freeing
>> a batch of tags.
>>
>> This version is looking pretty good to me now, and should be ready
>> for 5.16.
> 
> Just wondering if anyone was looking at supporting this for SCSI 
> midlayer? I was thinking about looking at it...

Since it's pretty new, I don't think anyone has looked at that yet.
I just did the nvme case, for both submit and complete batching.
But the code is generic and would plug into anything.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 18+ messages in thread

Thread overview: 18+ messages
2021-10-17  2:06 [PATCHSET v3] Batched completions Jens Axboe
2021-10-17  2:06 ` [PATCH 1/6] block: add a struct io_comp_batch argument to fops->iopoll() Jens Axboe
2021-10-18 10:16   ` Christoph Hellwig
2021-10-17  2:06 ` [PATCH 2/6] sbitmap: add helper to clear a batch of tags Jens Axboe
2021-10-17  2:06 ` [PATCH 3/6] block: add support for blk_mq_end_request_batch() Jens Axboe
2021-10-18 10:18   ` Christoph Hellwig
2021-10-18 13:40     ` Jens Axboe
2021-10-17  2:06 ` [PATCH 4/6] nvme: add support for batched completion of polled IO Jens Axboe
2021-10-18 10:20   ` Christoph Hellwig
2021-10-17  2:06 ` [PATCH 5/6] io_uring: utilize the io batching infrastructure for more efficient " Jens Axboe
2021-10-18 10:20   ` Christoph Hellwig
2021-10-17  2:06 ` [PATCH 6/6] nvme: wire up completion batching for the IRQ path Jens Axboe
2021-10-18 10:22   ` Christoph Hellwig
2021-10-18 13:41     ` Jens Axboe
2021-10-18 15:14       ` Christoph Hellwig
2021-10-18 15:25         ` Jens Axboe
2021-10-20 11:14 ` [PATCHSET v3] Batched completions John Garry
2021-10-20 14:02   ` Jens Axboe
