* [PATCHSET 0/9] Batched completions
@ 2021-10-12 18:17 Jens Axboe
  2021-10-12 18:17 ` [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll() Jens Axboe
                   ` (8 more replies)
  0 siblings, 9 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block

Hi,

We now do decent batching of allocations for submit, but we still
complete requests individually. This costs a lot of CPU cycles.

This patchset adds support for collecting requests for completion,
and then completing them as a batch. This includes things like freeing
a batch of tags.
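
The rough flow ends up looking something like this (just a sketch, see
the individual patches for the details):

	struct io_batch ib;

	ib.req_list = NULL;
	/* the poll handler stashes successfully completed requests in
	 * ib instead of completing them one by one */
	kiocb->ki_filp->f_op->iopoll(kiocb, &ib, poll_flags);
	/* the caller then completes the whole batch in one go, which
	 * includes putting the tags back as a batch */
	if (ib.req_list)
		ib.complete(&ib);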

A few rough edges in this patchset, but sending it out for comments
so I can whip it into an upstreamable thing. As per patch 8, the
wins here are pretty massive - 8-10% improvement in the peak IOPS.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll()
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:25   ` Bart Van Assche
  2021-10-12 18:17 ` [PATCH 2/9] sbitmap: add helper to clear a batch of tags Jens Axboe
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

struct io_batch contains a list head and a completion handler, which will
allow batches of IO completions to be handled more efficiently.

For now there are no functional changes in this patch; we just add the
argument to the file_operations iopoll handler.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c              | 8 ++++----
 block/blk-exec.c              | 2 +-
 block/blk-mq.c                | 9 +++++----
 block/blk-mq.h                | 3 ++-
 block/fops.c                  | 4 ++--
 drivers/block/null_blk/main.c | 2 +-
 drivers/block/rnbd/rnbd-clt.c | 2 +-
 drivers/nvme/host/pci.c       | 4 ++--
 drivers/nvme/host/rdma.c      | 2 +-
 drivers/nvme/host/tcp.c       | 2 +-
 drivers/scsi/scsi_lib.c       | 2 +-
 fs/io_uring.c                 | 2 +-
 fs/iomap/direct-io.c          | 2 +-
 include/linux/blk-mq.h        | 2 +-
 include/linux/blkdev.h        | 4 ++--
 include/linux/fs.h            | 8 +++++++-
 mm/page_io.c                  | 2 +-
 17 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d5b0258dd218..877c345936a0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1099,7 +1099,7 @@ EXPORT_SYMBOL(submit_bio);
  * Note: the caller must either be the context that submitted @bio, or
  * be in a RCU critical section to prevent freeing of @bio.
  */
-int bio_poll(struct bio *bio, unsigned int flags)
+int bio_poll(struct bio *bio, struct io_batch *ib, unsigned int flags)
 {
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
@@ -1117,7 +1117,7 @@ int bio_poll(struct bio *bio, unsigned int flags)
 	if (WARN_ON_ONCE(!queue_is_mq(q)))
 		ret = 0;	/* not yet implemented, should not happen */
 	else
-		ret = blk_mq_poll(q, cookie, flags);
+		ret = blk_mq_poll(q, cookie, ib, flags);
 	blk_queue_exit(q);
 	return ret;
 }
@@ -1127,7 +1127,7 @@ EXPORT_SYMBOL_GPL(bio_poll);
  * Helper to implement file_operations.iopoll.  Requires the bio to be stored
  * in iocb->private, and cleared before freeing the bio.
  */
-int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags)
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_batch *ib, unsigned int flags)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -1155,7 +1155,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags)
 	rcu_read_lock();
 	bio = READ_ONCE(kiocb->private);
 	if (bio && bio->bi_bdev)
-		ret = bio_poll(bio, flags);
+		ret = bio_poll(bio, ib, flags);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 55f0cd34b37b..1b8b47f6e79b 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -77,7 +77,7 @@ static bool blk_rq_is_poll(struct request *rq)
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
 	do {
-		bio_poll(rq->bio, 0);
+		bio_poll(rq->bio, NULL, 0);
 		cond_resched();
 	} while (!completion_done(wait));
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7027a25c5271..a38412dcb55f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4030,7 +4030,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
 }
 
 static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
-		unsigned int flags)
+			       struct io_batch *ib, unsigned int flags)
 {
 	struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
 	long state = get_current_state();
@@ -4041,7 +4041,7 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	do {
 		hctx->poll_invoked++;
 
-		ret = q->mq_ops->poll(hctx);
+		ret = q->mq_ops->poll(hctx, ib);
 		if (ret > 0) {
 			hctx->poll_success++;
 			__set_current_state(TASK_RUNNING);
@@ -4062,14 +4062,15 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 	return 0;
 }
 
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_batch *ib,
+		unsigned int flags)
 {
 	if (!(flags & BLK_POLL_NOSLEEP) &&
 	    q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
 		if (blk_mq_poll_hybrid(q, cookie))
 			return 1;
 	}
-	return blk_mq_poll_classic(q, cookie, flags);
+	return blk_mq_poll_classic(q, cookie, ib, flags);
 }
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 8be447995106..861c5cb076a9 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -38,7 +38,8 @@ struct blk_mq_ctx {
 } ____cacheline_aligned_in_smp;
 
 void blk_mq_submit_bio(struct bio *bio);
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_batch *ib,
+		unsigned int flags);
 void blk_mq_exit_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
diff --git a/block/fops.c b/block/fops.c
index ce1255529ba2..30487bf6d5e4 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -106,7 +106,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!READ_ONCE(bio.bi_private))
 			break;
-		if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0))
+		if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -288,7 +288,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		if (!READ_ONCE(dio->waiter))
 			break;
 
-		if (!do_poll || !bio_poll(bio, 0))
+		if (!do_poll || !bio_poll(bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 7ce911e2289d..744601f1704d 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1494,7 +1494,7 @@ static int null_map_queues(struct blk_mq_tag_set *set)
 	return 0;
 }
 
-static int null_poll(struct blk_mq_hw_ctx *hctx)
+static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nullb_queue *nq = hctx->driver_data;
 	LIST_HEAD(list);
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index bd4a41afbbfc..7bd18aef1086 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct rnbd_queue *q = hctx->driver_data;
 	struct rnbd_clt_dev *dev = q->dev;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0dd4b44b59cd..4ad63bb9f415 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1092,7 +1092,7 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	bool found;
@@ -1274,7 +1274,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * Did we miss an interrupt?
 	 */
 	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
-		nvme_poll(req->mq_hctx);
+		nvme_poll(req->mq_hctx, NULL);
 	else
 		nvme_poll_irqdisable(nvmeq);
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 40317e1b9183..4987cfbf5dd4 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2106,7 +2106,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_rdma_queue *queue = hctx->driver_data;
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3c1c29dd3020..7bc321ceec72 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2429,7 +2429,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
 	return 0;
 }
 
-static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_tcp_queue *queue = hctx->driver_data;
 	struct sock *sk = queue->sock->sk;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 33fd9a01330c..de3fffc447da 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1784,7 +1784,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq,
 }
 
 
-static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct Scsi_Host *shost = hctx->driver_data;
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e43e130a0e92..082ff64c1bcb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2412,7 +2412,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		ret = kiocb->ki_filp->f_op->iopoll(kiocb, poll_flags);
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags);
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 8efab177011d..83ecfba53abe 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -630,7 +630,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 				break;
 
 			if (!dio->submit.poll_bio ||
-			    !bio_poll(dio->submit.poll_bio, 0))
+			    !bio_poll(dio->submit.poll_bio, NULL, 0))
 				blk_io_schedule();
 		}
 		__set_current_state(TASK_RUNNING);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index f86e28828a95..29555673090d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -538,7 +538,7 @@ struct blk_mq_ops {
 	/**
 	 * @poll: Called to poll for completion of a specific tag.
 	 */
-	int (*poll)(struct blk_mq_hw_ctx *);
+	int (*poll)(struct blk_mq_hw_ctx *, struct io_batch *);
 
 	/**
 	 * @complete: Mark the request as complete.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2a8689e949b4..96e1261c4846 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -569,8 +569,8 @@ blk_status_t errno_to_blk_status(int errno);
 #define BLK_POLL_ONESHOT		(1 << 0)
 /* do not sleep to wait for the expected completion time */
 #define BLK_POLL_NOSLEEP		(1 << 1)
-int bio_poll(struct bio *bio, unsigned int flags);
-int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags);
+int bio_poll(struct bio *bio, struct io_batch *ib, unsigned int flags);
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_batch *ib, unsigned int flags);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f98a361a6752..70986a1d62db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -48,6 +48,7 @@
 struct backing_dev_info;
 struct bdi_writeback;
 struct bio;
+struct request;
 struct export_operations;
 struct fiemap_extent_info;
 struct hd_geometry;
@@ -2068,6 +2069,11 @@ struct dir_context {
 
 struct iov_iter;
 
+struct io_batch {
+	struct request *req_list;
+	void (*complete)(struct io_batch *);
+};
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -2075,7 +2081,7 @@ struct file_operations {
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
-	int (*iopoll)(struct kiocb *kiocb, unsigned int flags);
+	int (*iopoll)(struct kiocb *kiocb, struct io_batch *, unsigned int flags);
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
diff --git a/mm/page_io.c b/mm/page_io.c
index a68faab5b310..6010fb07f231 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -424,7 +424,7 @@ int swap_readpage(struct page *page, bool synchronous)
 		if (!READ_ONCE(bio->bi_private))
 			break;
 
-		if (!bio_poll(bio, 0))
+		if (!bio_poll(bio, NULL, 0))
 			blk_io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 2/9] sbitmap: add helper to clear a batch of tags
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
  2021-10-12 18:17 ` [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll() Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:29   ` Bart Van Assche
  2021-10-12 18:17 ` [PATCH 3/9] sbitmap: test bit before calling test_and_set_bit() Jens Axboe
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

sbitmap currently only supports clearing tags one-by-one; add a helper
that allows the caller to pass in an array of tags to clear.
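
As an example, a caller that has collected a handful of tags from a
non-reserved tag space could then free them in one go, roughly like this
(illustrative sketch only, the tag values are made up):

	int tags[] = { 5, 6, 9, 12 };

	/* frees all of the tags, batching the bit clears per word
	 * instead of doing one atomic op per tag */
	sbitmap_queue_clear_batch(sbq, 0, tags, ARRAY_SIZE(tags));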

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 11 +++++++++++
 lib/sbitmap.c           | 44 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index e30b56023ead..4a6ff274335a 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -528,6 +528,17 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu);
 
+/**
+ * sbitmap_queue_clear_batch() - Free a batch of allocated bits
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @offset: offset for each tag in array
+ * @tags: array of tags
+ * @nr_tags: number of tags in array
+ */
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags);
+
 static inline int sbq_index_inc(int index)
 {
 	return (index + 1) & (SBQ_WAIT_QUEUES - 1);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f398e0ae548e..c6e2f1f2c4d2 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -628,6 +628,46 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
+static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu, int tag)
+{
+	if (likely(!sb->round_robin && tag < sb->depth))
+		*per_cpu_ptr(sb->alloc_hint, cpu) = tag;
+}
+
+void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
+				int *tags, int nr_tags)
+{
+	struct sbitmap *sb = &sbq->sb;
+	unsigned long *addr = NULL;
+	unsigned long mask = 0;
+	int i;
+
+	smp_mb__before_atomic();
+	for (i = 0; i < nr_tags; i++) {
+		const int tag = tags[i] - offset;
+		unsigned long *this_addr;
+
+		/* since we're clearing a batch, skip the deferred map */
+		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
+		if (!addr) {
+			addr = this_addr;
+		} else if (addr != this_addr) {
+			atomic_long_andnot(mask, (atomic_long_t *) addr);
+			mask = 0;
+			addr = this_addr;
+		}
+		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
+	}
+
+	if (mask)
+		atomic_long_andnot(mask, (atomic_long_t *) addr);
+
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
+	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
+					tags[nr_tags - 1] - offset);
+}
+
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
@@ -652,9 +692,7 @@ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 	 */
 	smp_mb__after_atomic();
 	sbitmap_queue_wake_up(sbq);
-
-	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
-		*per_cpu_ptr(sbq->sb.alloc_hint, cpu) = nr;
+	sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 3/9] sbitmap: test bit before calling test_and_set_bit()
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
  2021-10-12 18:17 ` [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll() Jens Axboe
  2021-10-12 18:17 ` [PATCH 2/9] sbitmap: add helper to clear a batch of tags Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:17 ` [PATCH 4/9] block: add support for blk_mq_end_request_batch() Jens Axboe
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

If we come across bits that are already set, then it's quicker to test
that first and gate the test_and_set_bit() operation on the result of
the bit test.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 lib/sbitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index c6e2f1f2c4d2..11b244a8d00f 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -166,7 +166,7 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 			return -1;
 		}
 
-		if (!test_and_set_bit_lock(nr, word))
+		if (!test_bit(nr, word) && !test_and_set_bit_lock(nr, word))
 			break;
 
 		hint = nr + 1;
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 4/9] block: add support for blk_mq_end_request_batch()
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (2 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 3/9] sbitmap: test bit before calling test_and_set_bit() Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:32   ` Bart Van Assche
  2021-10-12 18:17 ` [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers Jens Axboe
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Instead of calling blk_mq_end_request() on a single request, add a helper
that takes the new struct io_batch and completes any requests stored in
it.
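
The expected usage is that a driver links completed requests into the
io_batch and then ends them all in one call, roughly like this (sketch
based on the code below):

	/* driver side: queue the request for batched completion */
	req->rq_next = ib->req_list;
	ib->req_list = req;

	...

	/* block layer: account and end every request on the list, and
	 * put the tags back in batches of up to TAG_COMP_BATCH */
	blk_mq_end_request_batch(ib);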

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c     |  6 +++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 83 ++++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h | 13 +++++++
 4 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c43b97201161..70eb276dc870 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -207,6 +207,12 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 	}
 }
 
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *array, int nr_tags)
+{
+	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+					array, nr_tags);
+}
+
 struct bt_iter_data {
 	struct blk_mq_hw_ctx *hctx;
 	busy_iter_fn *fn;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 71c2f7d8e9b7..e7b6c8dff071 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -42,6 +42,7 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
 			      unsigned int *offset);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 			   unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *array, int nr_tags);
 extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_tags **tags,
 					unsigned int depth, bool can_grow);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a38412dcb55f..9509c52a66a4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -613,21 +613,26 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
 	}
 }
 
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+static inline void __blk_mq_end_request_acct(struct request *rq,
+					     blk_status_t error, u64 now)
 {
-	u64 now = 0;
-
-	if (blk_mq_need_time_stamp(rq))
-		now = ktime_get_ns();
-
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);
 		blk_stat_add(rq, now);
 	}
 
 	blk_mq_sched_completed_request(rq, now);
-
 	blk_account_io_done(rq, now);
+}
+
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+	u64 now = 0;
+
+	if (blk_mq_need_time_stamp(rq))
+		now = ktime_get_ns();
+
+	__blk_mq_end_request_acct(rq, error, now);
 
 	if (rq->end_io) {
 		rq_qos_done(rq->q, rq);
@@ -646,6 +651,70 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
 }
 EXPORT_SYMBOL(blk_mq_end_request);
 
+#define TAG_COMP_BATCH		32
+#define TAG_SCHED_BATCH		(TAG_COMP_BATCH >> 1)
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+					  int *tags, int nr_tags)
+{
+	struct request_queue *q = hctx->queue;
+
+	blk_mq_put_tags(hctx->tags, tags, nr_tags);
+	if (q->elevator)
+		blk_mq_put_tags(hctx->sched_tags, &tags[TAG_SCHED_BATCH], nr_tags);
+	percpu_ref_put_many(&q->q_usage_counter, nr_tags);
+	blk_mq_sched_restart(hctx);
+}
+
+void blk_mq_end_request_batch(struct io_batch *ib)
+{
+	int tags[TAG_COMP_BATCH], nr_tags = 0, acct_tags = 0;
+	struct blk_mq_hw_ctx *last_hctx = NULL;
+	u64 now = 0;
+
+	while (ib->req_list) {
+		struct request *rq;
+
+		rq = ib->req_list;
+		ib->req_list = rq->rq_next;
+		if (!now && blk_mq_need_time_stamp(rq))
+			now = ktime_get_ns();
+		blk_update_request(rq, rq->status, blk_rq_bytes(rq));
+		__blk_mq_end_request_acct(rq, rq->status, now);
+
+		if (rq->q->elevator) {
+			blk_mq_free_request(rq);
+			continue;
+		}
+
+		if (!refcount_dec_and_test(&rq->ref))
+			continue;
+
+		blk_crypto_free_request(rq);
+		blk_pm_mark_last_busy(rq);
+		rq_qos_done(rq->q, rq);
+		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+
+		if (acct_tags == TAG_COMP_BATCH ||
+		    (last_hctx && last_hctx != rq->mq_hctx)) {
+			blk_mq_flush_tag_batch(last_hctx, tags, nr_tags);
+			acct_tags = nr_tags = 0;
+		}
+		tags[nr_tags] = rq->tag;
+		last_hctx = rq->mq_hctx;
+		if (last_hctx->queue->elevator) {
+			tags[nr_tags + TAG_SCHED_BATCH] = rq->internal_tag;
+			acct_tags++;
+		}
+		nr_tags++;
+		acct_tags++;
+	}
+
+	if (nr_tags)
+		blk_mq_flush_tag_batch(last_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
+
 static void blk_complete_reqs(struct llist_head *list)
 {
 	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 29555673090d..26f9f6b07734 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -183,9 +183,16 @@ struct request {
 	unsigned int timeout;
 	unsigned long deadline;
 
+	/*
+	 * csd is used for remote completions, fifo_time at scheduler time.
+	 * They are mutually exclusive. result is used at completion time
+	 * like csd, but for batched IO. Batched IO does not use IPI
+	 * completions.
+	 */
 	union {
 		struct __call_single_data csd;
 		u64 fifo_time;
+		blk_status_t status;
 	};
 
 	/*
@@ -545,6 +552,11 @@ struct blk_mq_ops {
 	 */
 	void (*complete)(struct request *);
 
+	/**
+	 * @complete_batch: Mark list of requests as complete
+	 */
+	void (*complete_batch)(struct io_batch *);
+
 	/**
 	 * @init_hctx: Called when the block layer side of a hardware queue has
 	 * been set up, allowing the driver to allocate/init matching
@@ -734,6 +746,7 @@ static inline void blk_mq_set_request_complete(struct request *rq)
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
+void blk_mq_end_request_batch(struct io_batch *ib);
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (3 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 4/9] block: add support for blk_mq_end_request_batch() Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-13  6:57     ` Christoph Hellwig
  2021-10-12 18:17 ` [PATCH 6/9] nvme: add support for batched completion of polled IO Jens Axboe
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

These are called for every IO completion, so move them inline into the
nvme private header rather than have them be a function call out of the
PCI part of the nvme driver.

We also need them for batched handling, hence the patch also serves as a
preparation for that.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 73 ++--------------------------------------
 drivers/nvme/host/nvme.h | 72 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 71 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4b7c009fccfe..ec7fa6f31e68 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -44,9 +44,10 @@ static unsigned char shutdown_timeout = 5;
 module_param(shutdown_timeout, byte, 0644);
 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
 
-static u8 nvme_max_retries = 5;
+u8 nvme_max_retries = 5;
 module_param_named(max_retries, nvme_max_retries, byte, 0644);
 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
+EXPORT_SYMBOL_GPL(nvme_max_retries);
 
 static unsigned long default_ps_max_latency_us = 100000;
 module_param(default_ps_max_latency_us, ulong, 0644);
@@ -261,48 +262,6 @@ static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 	nvme_put_ctrl(ctrl);
 }
 
-static blk_status_t nvme_error_status(u16 status)
-{
-	switch (status & 0x7ff) {
-	case NVME_SC_SUCCESS:
-		return BLK_STS_OK;
-	case NVME_SC_CAP_EXCEEDED:
-		return BLK_STS_NOSPC;
-	case NVME_SC_LBA_RANGE:
-	case NVME_SC_CMD_INTERRUPTED:
-	case NVME_SC_NS_NOT_READY:
-		return BLK_STS_TARGET;
-	case NVME_SC_BAD_ATTRIBUTES:
-	case NVME_SC_ONCS_NOT_SUPPORTED:
-	case NVME_SC_INVALID_OPCODE:
-	case NVME_SC_INVALID_FIELD:
-	case NVME_SC_INVALID_NS:
-		return BLK_STS_NOTSUPP;
-	case NVME_SC_WRITE_FAULT:
-	case NVME_SC_READ_ERROR:
-	case NVME_SC_UNWRITTEN_BLOCK:
-	case NVME_SC_ACCESS_DENIED:
-	case NVME_SC_READ_ONLY:
-	case NVME_SC_COMPARE_FAILED:
-		return BLK_STS_MEDIUM;
-	case NVME_SC_GUARD_CHECK:
-	case NVME_SC_APPTAG_CHECK:
-	case NVME_SC_REFTAG_CHECK:
-	case NVME_SC_INVALID_PI:
-		return BLK_STS_PROTECTION;
-	case NVME_SC_RESERVATION_CONFLICT:
-		return BLK_STS_NEXUS;
-	case NVME_SC_HOST_PATH_ERROR:
-		return BLK_STS_TRANSPORT;
-	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
-		return BLK_STS_ZONE_ACTIVE_RESOURCE;
-	case NVME_SC_ZONE_TOO_MANY_OPEN:
-		return BLK_STS_ZONE_OPEN_RESOURCE;
-	default:
-		return BLK_STS_IOERR;
-	}
-}
-
 static void nvme_retry_req(struct request *req)
 {
 	unsigned long delay = 0;
@@ -318,34 +277,6 @@ static void nvme_retry_req(struct request *req)
 	blk_mq_delay_kick_requeue_list(req->q, delay);
 }
 
-enum nvme_disposition {
-	COMPLETE,
-	RETRY,
-	FAILOVER,
-};
-
-static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
-{
-	if (likely(nvme_req(req)->status == 0))
-		return COMPLETE;
-
-	if (blk_noretry_request(req) ||
-	    (nvme_req(req)->status & NVME_SC_DNR) ||
-	    nvme_req(req)->retries >= nvme_max_retries)
-		return COMPLETE;
-
-	if (req->cmd_flags & REQ_NVME_MPATH) {
-		if (nvme_is_path_error(nvme_req(req)->status) ||
-		    blk_queue_dying(req->q))
-			return FAILOVER;
-	} else {
-		if (blk_queue_dying(req->q))
-			return COMPLETE;
-	}
-
-	return RETRY;
-}
-
 static inline void nvme_end_req(struct request *req)
 {
 	blk_status_t status = nvme_error_status(nvme_req(req)->status);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..3d11b5cb478d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -903,4 +903,76 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
 	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
 }
 
+static inline blk_status_t nvme_error_status(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_SUCCESS:
+		return BLK_STS_OK;
+	case NVME_SC_CAP_EXCEEDED:
+		return BLK_STS_NOSPC;
+	case NVME_SC_LBA_RANGE:
+	case NVME_SC_CMD_INTERRUPTED:
+	case NVME_SC_NS_NOT_READY:
+		return BLK_STS_TARGET;
+	case NVME_SC_BAD_ATTRIBUTES:
+	case NVME_SC_ONCS_NOT_SUPPORTED:
+	case NVME_SC_INVALID_OPCODE:
+	case NVME_SC_INVALID_FIELD:
+	case NVME_SC_INVALID_NS:
+		return BLK_STS_NOTSUPP;
+	case NVME_SC_WRITE_FAULT:
+	case NVME_SC_READ_ERROR:
+	case NVME_SC_UNWRITTEN_BLOCK:
+	case NVME_SC_ACCESS_DENIED:
+	case NVME_SC_READ_ONLY:
+	case NVME_SC_COMPARE_FAILED:
+		return BLK_STS_MEDIUM;
+	case NVME_SC_GUARD_CHECK:
+	case NVME_SC_APPTAG_CHECK:
+	case NVME_SC_REFTAG_CHECK:
+	case NVME_SC_INVALID_PI:
+		return BLK_STS_PROTECTION;
+	case NVME_SC_RESERVATION_CONFLICT:
+		return BLK_STS_NEXUS;
+	case NVME_SC_HOST_PATH_ERROR:
+		return BLK_STS_TRANSPORT;
+	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
+		return BLK_STS_ZONE_ACTIVE_RESOURCE;
+	case NVME_SC_ZONE_TOO_MANY_OPEN:
+		return BLK_STS_ZONE_OPEN_RESOURCE;
+	default:
+		return BLK_STS_IOERR;
+	}
+}
+
+enum nvme_disposition {
+	COMPLETE,
+	RETRY,
+	FAILOVER,
+};
+
+extern u8 nvme_max_retries;
+
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
+{
+	if (likely(nvme_req(req)->status == 0))
+		return COMPLETE;
+
+	if (blk_noretry_request(req) ||
+	    (nvme_req(req)->status & NVME_SC_DNR) ||
+	    nvme_req(req)->retries >= nvme_max_retries)
+		return COMPLETE;
+
+	if (req->cmd_flags & REQ_NVME_MPATH) {
+		if (nvme_is_path_error(nvme_req(req)->status) ||
+		    blk_queue_dying(req->q))
+			return FAILOVER;
+	} else {
+		if (blk_queue_dying(req->q))
+			return COMPLETE;
+	}
+
+	return RETRY;
+}
+
 #endif /* _NVME_H */
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (4 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-13  7:08   ` Christoph Hellwig
  2021-10-13  9:09   ` John Garry
  2021-10-12 18:17 ` [PATCH 7/9] block: assign batch completion handler in blk_poll() Jens Axboe
                   ` (2 subsequent siblings)
  8 siblings, 2 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Take advantage of struct io_batch, if passed in to the nvme poll handler.
If it's set, rather than complete each request individually inline, store
them in the io_batch list. We only do so for requests that will complete
successfully, anything else will be completed inline as before.

Add an mq_ops->complete_batch() handler to do the post-processing of
the io_batch list once polling is complete.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4ad63bb9f415..4713da708cd4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *ib)
+{
+	struct request *req;
+
+	req = ib->req_list;
+	while (req) {
+		nvme_pci_unmap_rq(req);
+		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+			nvme_cleanup_cmd(req);
+		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+				req_op(req) == REQ_OP_ZONE_APPEND)
+			req->__sector = nvme_lba_to_sect(req->q->queuedata,
+					le64_to_cpu(nvme_req(req)->result.u64));
+		req->status = nvme_error_status(nvme_req(req)->status);
+		req = req->rq_next;
+	}
+
+	blk_mq_end_request_batch(ib);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1021,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *ib, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1049,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		enum nvme_disposition ret;
+
+		ret = nvme_decide_disposition(req);
+		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
+			nvme_pci_complete_rq(req);
+		} else {
+			req->rq_next = ib->req_list;
+			ib->req_list = req;
+		}
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1085,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1127,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *ib)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, ib, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1157,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *ib)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, ib);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1695,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 7/9] block: assign batch completion handler in blk_poll()
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (5 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 6/9] nvme: add support for batched completion of polled IO Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:17 ` [PATCH 8/9] io_uring: utilize the io_batch infrastructure for more efficient polled IO Jens Axboe
  2021-10-12 18:17 ` [PATCH 9/9] nvme: wire up completion batching for the IRQ path Jens Axboe
  8 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

If an io_batch is passed in to blk_poll(), we need to assign the batch
completion handler associated with this queue. This allows callers to
complete the batched requests by invoking that handler once polling is
done.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9509c52a66a4..62fabc65d6b2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4107,6 +4107,19 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
 
 	hctx->poll_considered++;
 
+	/*
+	 * If batching is requested but the target doesn't support batched
+	 * completions, then just clear ib and completions will be handled
+	 * normally.
+	 */
+	if (ib) {
+		ib->complete = q->mq_ops->complete_batch;
+		if (!ib->complete) {
+			WARN_ON_ONCE(ib->req_list);
+			ib = NULL;
+		}
+	}
+
 	do {
 		hctx->poll_invoked++;
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 8/9] io_uring: utilize the io_batch infrastructure for more efficient polled IO
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (6 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 7/9] block: assign batch completion handler in blk_poll() Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-12 18:17 ` [PATCH 9/9] nvme: wire up completion batching for the IRQ path Jens Axboe
  8 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Wire up using an io_batch for f_op->iopoll(). If the lower stack supports
it, we can handle high rates of polled IO more efficiently.

This raises the single core efficiency on my system from ~6.1M IOPS to
~6.6M IOPS running a random read workload at depth 128 on two gen2
Optane drives.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 082ff64c1bcb..cbf00ad3ac3f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2390,6 +2390,8 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
 	unsigned int poll_flags = BLK_POLL_NOSLEEP;
+	struct file *file = NULL;
+	struct io_batch ib;
 	int nr_events = 0;
 
 	/*
@@ -2399,11 +2401,17 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	if (ctx->poll_multi_queue && force_nonspin)
 		poll_flags |= BLK_POLL_ONESHOT;
 
+	ib.req_list = NULL;
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
 		struct kiocb *kiocb = &req->rw.kiocb;
 		int ret;
 
+		if (!file)
+			file = kiocb->ki_filp;
+		else if (file != kiocb->ki_filp)
+			break;
+
 		/*
 		 * Move completed and retryable entries to our local lists.
 		 * If we find a request that requires polling, break out
@@ -2412,19 +2420,21 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags);
+		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &ib, poll_flags);
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
 			poll_flags |= BLK_POLL_ONESHOT;
 
 		/* iopoll may have completed current req */
-		if (READ_ONCE(req->iopoll_completed))
+		if (ib.req_list || READ_ONCE(req->iopoll_completed))
 			break;
 	}
 
-	if (!pos)
+	if (!pos && !ib.req_list)
 		return 0;
+	if (ib.req_list)
+		ib.complete(&ib);
 
 	prev = start;
 	wq_list_for_each_resume(pos, prev) {
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 9/9] nvme: wire up completion batching for the IRQ path
  2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
                   ` (7 preceding siblings ...)
  2021-10-12 18:17 ` [PATCH 8/9] io_uring: utilize the io_batch infrastructure for more efficient polled IO Jens Axboe
@ 2021-10-12 18:17 ` Jens Axboe
  2021-10-13  7:12   ` Christoph Hellwig
  8 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:17 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe

Trivial to do now, just need our own io_batch on the stack and pass that
in to the usual command completion handling.

I pondered making this dependent on how many entries we had to process,
but even for a single entry there's no discernable difference in
performance or latency. Running a sync workload over io_uring:

t/io_uring -b512 -d1 -s1 -c1 -p0 -F1 -B1 -n2 /dev/nvme1n1 /dev/nvme2n1

yields the below performance before the patch:

IOPS=254820, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251174, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=250806, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)

and the following after:

IOPS=255972, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251920, BW=123MiB/s, IOS/call=1/1, inflight=(1 1)
IOPS=251794, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)

which definitely isn't slower, about the same if you factor in a bit of
variance. For peak performance workloads, benchmarking shows a 2%
improvement.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4713da708cd4..fb3de6f68eb1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1076,8 +1076,10 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
 
 static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 {
+	struct io_batch ib;
 	int found = 0;
 
+	ib.req_list = NULL;
 	while (nvme_cqe_pending(nvmeq)) {
 		found++;
 		/*
@@ -1085,12 +1087,15 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, &ib, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
-	if (found)
+	if (found) {
+		if (ib.req_list)
+			nvme_pci_complete_batch(&ib);
 		nvme_ring_cq_doorbell(nvmeq);
+	}
 	return found;
 }
 
-- 
2.33.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll()
  2021-10-12 18:17 ` [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll() Jens Axboe
@ 2021-10-12 18:25   ` Bart Van Assche
  2021-10-12 18:28     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Bart Van Assche @ 2021-10-12 18:25 UTC (permalink / raw)
  To: Jens Axboe, linux-block

On 10/12/21 11:17 AM, Jens Axboe wrote:
> -int bio_poll(struct bio *bio, unsigned int flags)
> +int bio_poll(struct bio *bio, struct io_batch *ib, unsigned int flags)
>   {

How about using the name 'iob' instead of 'ib' for the io_batch 
argument? When I saw the 'ib' argument name that name made me wonder for 
a second whether it is perhaps related to InfiniBand.

> +struct io_batch {
> +	struct request *req_list;
> +	void (*complete)(struct io_batch *);
> +};

Function pointers are not ideal in high-performance code. Is there 
another solution than introducing a new function pointer?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll()
  2021-10-12 18:25   ` Bart Van Assche
@ 2021-10-12 18:28     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:28 UTC (permalink / raw)
  To: Bart Van Assche, linux-block

On 10/12/21 12:25 PM, Bart Van Assche wrote:
> On 10/12/21 11:17 AM, Jens Axboe wrote:
>> -int bio_poll(struct bio *bio, unsigned int flags)
>> +int bio_poll(struct bio *bio, struct io_batch *ib, unsigned int flags)
>>   {
> 
> How about using the name 'iob' instead of 'ib' for the io_batch 
> argument? When I saw the 'ib' argument name that name made me wonder for 
> a second whether it is perhaps related to InfiniBand.

Sure, I don't care too much about that, I can make it iob instead.

>> +struct io_batch {
>> +	struct request *req_list;
>> +	void (*complete)(struct io_batch *);
>> +};
> 
> Function pointers are not ideal in high-performance code. Is there 
> another solution than introducing a new function pointer?

You're going to end up with an indirect call at some point anyway, as
the completion part would be driver specific. So I don't think that's
avoidable, it's just a question of where you do it.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/9] sbitmap: add helper to clear a batch of tags
  2021-10-12 18:17 ` [PATCH 2/9] sbitmap: add helper to clear a batch of tags Jens Axboe
@ 2021-10-12 18:29   ` Bart Van Assche
  2021-10-12 18:34     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Bart Van Assche @ 2021-10-12 18:29 UTC (permalink / raw)
  To: Jens Axboe, linux-block

On 10/12/21 11:17 AM, Jens Axboe wrote:
> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
> +				int *tags, int nr_tags)
> +{
> +	struct sbitmap *sb = &sbq->sb;
> +	unsigned long *addr = NULL;
> +	unsigned long mask = 0;
> +	int i;
> +
> +	smp_mb__before_atomic();
> +	for (i = 0; i < nr_tags; i++) {
> +		const int tag = tags[i] - offset;
> +		unsigned long *this_addr;
> +
> +		/* since we're clearing a batch, skip the deferred map */
> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
> +		if (!addr) {
> +			addr = this_addr;
> +		} else if (addr != this_addr) {
> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
> +			mask = 0;
> +			addr = this_addr;
> +		}
> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
> +	}
> +
> +	if (mask)
> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
> +
> +	smp_mb__after_atomic();
> +	sbitmap_queue_wake_up(sbq);
> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
> +					tags[nr_tags - 1] - offset);
> +}
> +
>   void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>   			 unsigned int cpu)
>   {

How does replacing the sbitmap_queue_clear() implementation by calling 
sbitmap_queue_clear_batch() affect performance? I'm wondering whether it 
is possible to prevent code duplication without affecting performance 
negatively.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 4/9] block: add support for blk_mq_end_request_batch()
  2021-10-12 18:17 ` [PATCH 4/9] block: add support for blk_mq_end_request_batch() Jens Axboe
@ 2021-10-12 18:32   ` Bart Van Assche
  2021-10-12 18:55     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Bart Van Assche @ 2021-10-12 18:32 UTC (permalink / raw)
  To: Jens Axboe, linux-block

On 10/12/21 11:17 AM, Jens Axboe wrote:
> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *array, int nr_tags)
> +{
> +	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
> +					array, nr_tags);
> +}

How about changing the name of the 'array' argument into 'tag_array' to 
make it more clear what the argument represents?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/9] sbitmap: add helper to clear a batch of tags
  2021-10-12 18:29   ` Bart Van Assche
@ 2021-10-12 18:34     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:34 UTC (permalink / raw)
  To: Bart Van Assche, linux-block

On 10/12/21 12:29 PM, Bart Van Assche wrote:
> On 10/12/21 11:17 AM, Jens Axboe wrote:
>> +void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
>> +				int *tags, int nr_tags)
>> +{
>> +	struct sbitmap *sb = &sbq->sb;
>> +	unsigned long *addr = NULL;
>> +	unsigned long mask = 0;
>> +	int i;
>> +
>> +	smp_mb__before_atomic();
>> +	for (i = 0; i < nr_tags; i++) {
>> +		const int tag = tags[i] - offset;
>> +		unsigned long *this_addr;
>> +
>> +		/* since we're clearing a batch, skip the deferred map */
>> +		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
>> +		if (!addr) {
>> +			addr = this_addr;
>> +		} else if (addr != this_addr) {
>> +			atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +			mask = 0;
>> +			addr = this_addr;
>> +		}
>> +		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
>> +	}
>> +
>> +	if (mask)
>> +		atomic_long_andnot(mask, (atomic_long_t *) addr);
>> +
>> +	smp_mb__after_atomic();
>> +	sbitmap_queue_wake_up(sbq);
>> +	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
>> +					tags[nr_tags - 1] - offset);
>> +}
>> +
>>   void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
>>   			 unsigned int cpu)
>>   {
> 
> How does replacing the sbitmap_queue_clear() implementation by calling 
> sbitmap_queue_clear_batch() affect performance? I'm wondering whether it 
> is possible to prevent code duplication without affecting performance 
> negatively.

Good question, I'd rather defer that to a followup though if it ends up
making sense. It's not that simple, as we play some tricks for the usual
clear path by inserting a deferred mask to avoid hitting the cacheline
repeatedly. That doesn't make sense to do for batched clears, obviously,
so they work in slightly different ways where the single bit clear has
an extra step to increase the efficiency.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 4/9] block: add support for blk_mq_end_request_batch()
  2021-10-12 18:32   ` Bart Van Assche
@ 2021-10-12 18:55     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-12 18:55 UTC (permalink / raw)
  To: Bart Van Assche, linux-block

On 10/12/21 12:32 PM, Bart Van Assche wrote:
> On 10/12/21 11:17 AM, Jens Axboe wrote:
>> +void blk_mq_put_tags(struct blk_mq_tags *tags, int *array, int nr_tags)
>> +{
>> +	sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
>> +					array, nr_tags);
>> +}
> 
> How about changing the name of the 'array' argument into 'tag_array' to 
> make it more clear what the argument represents?

Sure, done.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers
  2021-10-12 18:17 ` [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers Jens Axboe
@ 2021-10-13  6:57     ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13  6:57 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-nvme

Please Cc the nvme list for nvme changes.

> -static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
> -{
> -	if (likely(nvme_req(req)->status == 0))
> -		return COMPLETE;

I think the only part here that needs to be inline is this check.
The rest is all slow path error handling.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-12 18:17 ` [PATCH 6/9] nvme: add support for batched completion of polled IO Jens Axboe
@ 2021-10-13  7:08   ` Christoph Hellwig
  2021-10-13 15:10     ` Jens Axboe
  2021-10-13  9:09   ` John Garry
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13  7:08 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
> Take advantage of struct io_batch, if passed in to the nvme poll handler.
> If it's set, rather than complete each request individually inline, store
> them in the io_batch list. We only do so for requests that will complete
> successfully, anything else will be completed inline as before.
> 
> Add an mq_ops->complete_batch() handler to do the post-processing of
> the io_batch list once polling is complete.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	return ret;
>  }
>  
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>  			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>  	if (blk_rq_nr_phys_segments(req))
>  		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>  	nvme_complete_rq(req);
>  }
>  
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);
> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +				req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);

I hate all this open coding.  All the common logic needs to be in a
common helper.  Also please add a for_each macro for the request list
iteration.  I already thought about that for the batch allocation in
for-next, but with ever more callers this becomes essential.
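
Something like this would do, I think (untested, name made up):

	#define io_batch_for_each_req(req, ib) \
		for (req = (ib)->req_list; req; req = req->rq_next)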

> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
> +		enum nvme_disposition ret;
> +
> +		ret = nvme_decide_disposition(req);
> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
> +			nvme_pci_complete_rq(req);

This actually is the likely case, as only polling ever does the batch
completion.  When in doubt I'd prefer if we can avoid these likely/unlikely
annotations as much as possible.

> +		} else {
> +			req->rq_next = ib->req_list;
> +			ib->req_list = req;
> +		}

And all this list manipulation should use proper helper.

> +	}

Also - can you look into turning this logic into an inline function with
a callback for the driver?  I think in general gcc will avoid the
indirect call for function pointers passed directly.  That way we can
keep a nice code structure but also avoid the indirections.

Same for nvme_pci_complete_batch.
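
Roughly along these lines (completely untested sketch):

	static __always_inline void nvme_complete_batch(struct io_batch *ib,
			void (*fn)(struct request *rq))
	{
		struct request *req = ib->req_list;

		while (req) {
			fn(req);
			req = req->rq_next;
		}
		blk_mq_end_request_batch(ib);
	}

With the callback passed as a compile-time constant the compiler should
be able to inline it and avoid the indirect call.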

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 9/9] nvme: wire up completion batching for the IRQ path
  2021-10-12 18:17 ` [PATCH 9/9] nvme: wire up completion batching for the IRQ path Jens Axboe
@ 2021-10-13  7:12   ` Christoph Hellwig
  2021-10-13 15:04     ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13  7:12 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block

On Tue, Oct 12, 2021 at 12:17:42PM -0600, Jens Axboe wrote:
> Trivial to do now, just need our own io_batch on the stack and pass that
> in to the usual command completion handling.
> 
> I pondered making this dependent on how many entries we had to process,
> but even for a single entry there's no discernable difference in
> performance or latency. Running a sync workload over io_uring:
> 
> t/io_uring -b512 -d1 -s1 -c1 -p0 -F1 -B1 -n2 /dev/nvme1n1 /dev/nvme2n1
> 
> yields the below performance before the patch:
> 
> IOPS=254820, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
> IOPS=251174, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
> IOPS=250806, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
> 
> and the following after:
> 
> IOPS=255972, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
> IOPS=251920, BW=123MiB/s, IOS/call=1/1, inflight=(1 1)
> IOPS=251794, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
> 
> which definitely isn't slower, about the same if you factor in a bit of
> variance. For peak performance workloads, benchmarking shows a 2%
> improvement.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/pci.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4713da708cd4..fb3de6f68eb1 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1076,8 +1076,10 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
>  
>  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>  {
> +	struct io_batch ib;
>  	int found = 0;
>  
> +	ib.req_list = NULL;

Is this really more efficient than

	struct io_batch ib = { };

?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-12 18:17 ` [PATCH 6/9] nvme: add support for batched completion of polled IO Jens Axboe
  2021-10-13  7:08   ` Christoph Hellwig
@ 2021-10-13  9:09   ` John Garry
  2021-10-13 15:07     ` Jens Axboe
  1 sibling, 1 reply; 34+ messages in thread
From: John Garry @ 2021-10-13  9:09 UTC (permalink / raw)
  To: Jens Axboe, linux-block

On 12/10/2021 19:17, Jens Axboe wrote:
> Signed-off-by: Jens Axboe<axboe@kernel.dk>
> ---
>   drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>   1 file changed, 63 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4ad63bb9f415..4713da708cd4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>   	return ret;
>   }
>   
> -static void nvme_pci_complete_rq(struct request *req)
> +static void nvme_pci_unmap_rq(struct request *req)
>   {
>   	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>   	struct nvme_dev *dev = iod->nvmeq->dev;
> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>   			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>   	if (blk_rq_nr_phys_segments(req))
>   		nvme_unmap_data(dev, req);
> +}
> +
> +static void nvme_pci_complete_rq(struct request *req)
> +{
> +	nvme_pci_unmap_rq(req);
>   	nvme_complete_rq(req);
>   }
>   
> +static void nvme_pci_complete_batch(struct io_batch *ib)
> +{
> +	struct request *req;
> +
> +	req = ib->req_list;
> +	while (req) {
> +		nvme_pci_unmap_rq(req);

This will do the DMA SG unmap per request. Often this is a performance
bottleneck when we have an IOMMU enabled in strict mode. Since we
complete in batches, could we combine all the SGs in the batch into one
big DMA SG unmap, rather than doing them one by one?

Just a thought.

> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +			nvme_cleanup_cmd(req);
> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +				req_op(req) == REQ_OP_ZONE_APPEND)
> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
> +					le64_to_cpu(nvme_req(req)->result.u64));
> +		req->status = nvme_error_status(nvme_req(req)->status);
> +		req = req->rq_next;
> +	}
> +
> +	blk_mq_end_request_batch(ib);
> +}


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers
  2021-10-13  6:57     ` Christoph Hellwig
  (?)
@ 2021-10-13 14:41     ` Jens Axboe
  2021-10-13 15:11       ` Christoph Hellwig
  -1 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 14:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block, linux-nvme

On 10/13/21 12:57 AM, Christoph Hellwig wrote:
> Please Cc the nvme list for nvme changes.
> 
>> -static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>> -{
>> -	if (likely(nvme_req(req)->status == 0))
>> -		return COMPLETE;
> 
> I think the only part here that needs to be inline is this check.
> The rest is all slow path error handling.

I think I'll just kill this patch and check nvme_req(req)->status in the
caller. If it's non-zero, just do the normal completion path and skip
the batch list.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 9/9] nvme: wire up completion batching for the IRQ path
  2021-10-13  7:12   ` Christoph Hellwig
@ 2021-10-13 15:04     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 15:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 1:12 AM, Christoph Hellwig wrote:
> On Tue, Oct 12, 2021 at 12:17:42PM -0600, Jens Axboe wrote:
>> Trivial to do now, just need our own io_batch on the stack and pass that
>> in to the usual command completion handling.
>>
>> I pondered making this dependent on how many entries we had to process,
>> but even for a single entry there's no discernable difference in
>> performance or latency. Running a sync workload over io_uring:
>>
>> t/io_uring -b512 -d1 -s1 -c1 -p0 -F1 -B1 -n2 /dev/nvme1n1 /dev/nvme2n1
>>
>> yields the below performance before the patch:
>>
>> IOPS=254820, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
>> IOPS=251174, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
>> IOPS=250806, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
>>
>> and the following after:
>>
>> IOPS=255972, BW=124MiB/s, IOS/call=1/1, inflight=(1 1)
>> IOPS=251920, BW=123MiB/s, IOS/call=1/1, inflight=(1 1)
>> IOPS=251794, BW=122MiB/s, IOS/call=1/1, inflight=(1 1)
>>
>> which definitely isn't slower, about the same if you factor in a bit of
>> variance. For peak performance workloads, benchmarking shows a 2%
>> improvement.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  drivers/nvme/host/pci.c | 9 +++++++--
>>  1 file changed, 7 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4713da708cd4..fb3de6f68eb1 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -1076,8 +1076,10 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
>>  
>>  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
>>  {
>> +	struct io_batch ib;
>>  	int found = 0;
>>  
>> +	ib.req_list = NULL;
> 
> Is this really more efficient than
> 
> 	struct io_batch ib = { };

Probably not. I could add a DEFINE_IO_BATCH() helper; that would make it
easier if other kinds of init are ever needed.
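
A minimal sketch of what such a helper could look like, assuming struct
io_batch as used in this series; the definition here is a guess, not a
committed interface:

/* hypothetical helper; mirrors the open-coded NULL init quoted above */
#define DEFINE_IO_BATCH(name) \
	struct io_batch name = { .req_list = NULL }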

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13  9:09   ` John Garry
@ 2021-10-13 15:07     ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 15:07 UTC (permalink / raw)
  To: John Garry, linux-block

On 10/13/21 3:09 AM, John Garry wrote:
> On 12/10/2021 19:17, Jens Axboe wrote:
>> Signed-off-by: Jens Axboe<axboe@kernel.dk>
>> ---
>>   drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>>   1 file changed, 63 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4ad63bb9f415..4713da708cd4 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>   	return ret;
>>   }
>>   
>> -static void nvme_pci_complete_rq(struct request *req)
>> +static void nvme_pci_unmap_rq(struct request *req)
>>   {
>>   	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>   	struct nvme_dev *dev = iod->nvmeq->dev;
>> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>>   			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>>   	if (blk_rq_nr_phys_segments(req))
>>   		nvme_unmap_data(dev, req);
>> +}
>> +
>> +static void nvme_pci_complete_rq(struct request *req)
>> +{
>> +	nvme_pci_unmap_rq(req);
>>   	nvme_complete_rq(req);
>>   }
>>   
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
> 
> This will do the DMA SG unmap per request. Often this is a performance
> bottleneck when we have an IOMMU enabled in strict mode. Since we
> complete in batches, could we combine all the SGs in the batch into one
> big DMA SG unmap, rather than doing them one by one?

It is indeed; I actually have a patch for persistent maps as well. But even
without that, it would make sense to handle these unmaps in a smarter way.
That requires some IOMMU work, though, which I'm not that interested in
right now; it could be done on top of this series by someone motivated enough.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13  7:08   ` Christoph Hellwig
@ 2021-10-13 15:10     ` Jens Axboe
  2021-10-13 15:16       ` Christoph Hellwig
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 15:10 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 1:08 AM, Christoph Hellwig wrote:
> On Tue, Oct 12, 2021 at 12:17:39PM -0600, Jens Axboe wrote:
>> Take advantage of struct io_batch, if passed in to the nvme poll handler.
>> If it's set, rather than complete each request individually inline, store
>> them in the io_batch list. We only do so for requests that will complete
>> successfully, anything else will be completed inline as before.
>>
>> Add an mq_ops->complete_batch() handler to do the post-processing of
>> the io_batch list once polling is complete.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
>>  drivers/nvme/host/pci.c | 69 +++++++++++++++++++++++++++++++++++++----
>>  1 file changed, 63 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4ad63bb9f415..4713da708cd4 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  	return ret;
>>  }
>>  
>> -static void nvme_pci_complete_rq(struct request *req)
>> +static void nvme_pci_unmap_rq(struct request *req)
>>  {
>>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>  	struct nvme_dev *dev = iod->nvmeq->dev;
>> @@ -969,9 +969,34 @@ static void nvme_pci_complete_rq(struct request *req)
>>  			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
>>  	if (blk_rq_nr_phys_segments(req))
>>  		nvme_unmap_data(dev, req);
>> +}
>> +
>> +static void nvme_pci_complete_rq(struct request *req)
>> +{
>> +	nvme_pci_unmap_rq(req);
>>  	nvme_complete_rq(req);
>>  }
>>  
>> +static void nvme_pci_complete_batch(struct io_batch *ib)
>> +{
>> +	struct request *req;
>> +
>> +	req = ib->req_list;
>> +	while (req) {
>> +		nvme_pci_unmap_rq(req);
>> +		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
>> +			nvme_cleanup_cmd(req);
>> +		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
>> +				req_op(req) == REQ_OP_ZONE_APPEND)
>> +			req->__sector = nvme_lba_to_sect(req->q->queuedata,
>> +					le64_to_cpu(nvme_req(req)->result.u64));
>> +		req->status = nvme_error_status(nvme_req(req)->status);
>> +		req = req->rq_next;
>> +	}
>> +
>> +	blk_mq_end_request_batch(ib);
> 
> I hate all this open coding.  All the common logic needs to be in a
> common helper.

I'll see if I can unify this a bit.

> Also please add a for_each macro for the request list
> iteration.  I already thought about that for the batch allocation in
> for-next, but with ever more callers this becomes essential.

Added a prep patch with list helpers; the current version is using those
now.
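
For reference, a rough sketch of what such helpers could look like;
rq_list_peek() and rq_list_next() match the names used later in this thread,
while rq_list_add() just wraps the open-coded head insertion from the hunk
quoted below. These are assumptions, not the actual prep patch:

/* assumed definitions; the real prep patch may differ */
#define rq_list_peek(listptr)	(*(listptr))
#define rq_list_next(rq)	((rq)->rq_next)

static inline void rq_list_add(struct request **list, struct request *rq)
{
	rq->rq_next = *list;	/* head insertion, as open-coded below */
	*list = rq;
}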

>> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
>> +		enum nvme_disposition ret;
>> +
>> +		ret = nvme_decide_disposition(req);
>> +		if (unlikely(!ib || req->end_io || ret != COMPLETE)) {
>> +			nvme_pci_complete_rq(req);
> 
> This actually is the likely case, as only polling ever does the batch
> completion.  When in doubt, I'd prefer that we avoid these likely/unlikely
> annotations as much as possible.

If you look at the end of the series, the IRQ path is wired up for it too.
But I do agree on the unlikely; I generally dislike those annotations too.
I'll just kill this one.

>> +		} else {
>> +			req->rq_next = ib->req_list;
>> +			ib->req_list = req;
>> +		}
> 
> And all this list manipulation should use a proper helper.

Done

>> +	}
> 
> Also - can you look into turning this logic into an inline function with
> a callback for the driver?  I think in general gcc will avoid the
> indirect call for function pointers passed directly.  That way we can
> keep a nice code structure but also avoid the indirections.
> 
> Same for nvme_pci_complete_batch.

Not sure I follow. It's hard to do a generic callback for this, as the
batch can live outside the block layer through the plug. That's why
it's passed the way it is in terms of completion hooks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers
  2021-10-13 14:41     ` Jens Axboe
@ 2021-10-13 15:11       ` Christoph Hellwig
  0 siblings, 0 replies; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13 15:11 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, linux-block, linux-nvme

On Wed, Oct 13, 2021 at 08:41:27AM -0600, Jens Axboe wrote:
> I think I'll just kill this patch and check nvme_req(req)->status in the
> caller. If it's non-zero, just do the normal completion path and skip
> the batch list.

Yes, that sounds like a better approach.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 15:10     ` Jens Axboe
@ 2021-10-13 15:16       ` Christoph Hellwig
  2021-10-13 15:42         ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13 15:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, linux-block

On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
> > Also - can you look into turning this logic into an inline function with
> > a callback for the driver?  I think in general gcc will avoid the
> > indirect call for function pointers passed directly.  That way we can
> > keep a nice code structure but also avoid the indirections.
> > 
> > Same for nvme_pci_complete_batch.
> 
> Not sure I follow. It's hard to do a generic callback for this, as the
> batch can live outside the block layer through the plug. That's why
> it's passed the way it is in terms of completion hooks.

Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
with nvme_pci_unmap_rq passed as a function pointer that gets inlined.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 15:16       ` Christoph Hellwig
@ 2021-10-13 15:42         ` Jens Axboe
  2021-10-13 15:49           ` Jens Axboe
  2021-10-13 15:50           ` Christoph Hellwig
  0 siblings, 2 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 15:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 9:16 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>> Also - can you look into turning this logic into an inline function with
>>> a callback for the driver?  I think in general gcc will avoid the
>>> indirect call for function pointers passed directly.  That way we can
>>> keep a nice code structure but also avoid the indirections.
>>>
>>> Same for nvme_pci_complete_batch.
>>
>> Not sure I follow. It's hard to do a generic callback for this, as the
>> batch can live outside the block layer through the plug. That's why
>> it's passed the way it is in terms of completion hooks.
> 
> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.

Something like this?


diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0ac7bad405ef..1aff0ca37063 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b8dbee47fced..e79c0f0268b3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -992,22 +992,7 @@ static void nvme_pci_complete_rq(struct request *req)
 
 static void nvme_pci_complete_batch(struct io_batch *iob)
 {
-	struct request *req;
-
-	req = rq_list_peek(&iob->req_list);
-	while (req) {
-		nvme_pci_unmap_rq(req);
-		if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
-			nvme_cleanup_cmd(req);
-		if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-				req_op(req) == REQ_OP_ZONE_APPEND)
-			req->__sector = nvme_lba_to_sect(req->q->queuedata,
-					le64_to_cpu(nvme_req(req)->result.u64));
-		req->status = BLK_STS_OK;
-		req = rq_list_next(req);
-	}
-
-	blk_mq_end_request_batch(iob);
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
 }
 
 /* We read the CQE phase first to check if the rest of the entry is valid */

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 15:42         ` Jens Axboe
@ 2021-10-13 15:49           ` Jens Axboe
  2021-10-13 15:50           ` Christoph Hellwig
  1 sibling, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 15:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 9:42 AM, Jens Axboe wrote:
> On 10/13/21 9:16 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 09:10:01AM -0600, Jens Axboe wrote:
>>>> Also - can you look into turning this logic into an inline function with
>>>> a callback for the driver?  I think in general gcc will avoid the
>>>> indirect call for function pointers passed directly.  That way we can
>>>> keep a nice code structure but also avoid the indirections.
>>>>
>>>> Same for nvme_pci_complete_batch.
>>>
>>> Not sure I follow. It's hard to do a generic callback for this, as the
>>> batch can live outside the block layer through the plug. That's why
>>> it's passed the way it is in terms of completion hooks.
>>
>> Basically turn nvme_pci_complete_batch into a core nvme helper (inline)
>> with nvme_pci_unmap_rq passed as a function pointer that gets inlined.
> 
> Something like this?

Full patch below for reference; it might be easier to read. I got rid of
the prep patch for nvme as well, so this is everything.


commit 002fcc4dd9869cd0fc8684b37ede8e20bdca16a4
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO
    
    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.
    
    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..26328fd26ff0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,23 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_cleanup_cmd(req);
+		nvme_end_req_zoned(req);
+		req->status = BLK_STS_OK;
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..b73a573472d9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch(struct io_batch *iob, void (*fn)(struct request *));
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..b3e686404adf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 15:42         ` Jens Axboe
  2021-10-13 15:49           ` Jens Axboe
@ 2021-10-13 15:50           ` Christoph Hellwig
  2021-10-13 16:04             ` Jens Axboe
  1 sibling, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13 15:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, linux-block

On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> Something like this?

Something like that.  Although without making the new function inline
this will generate an indirect call.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 15:50           ` Christoph Hellwig
@ 2021-10-13 16:04             ` Jens Axboe
  2021-10-13 16:13               ` Christoph Hellwig
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 16:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>> Something like this?
> 
> Something like that.  Although without making the new function inline
> this will generate an indirect call.

It will, but I don't see how we can have it both ways...

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 16:04             ` Jens Axboe
@ 2021-10-13 16:13               ` Christoph Hellwig
  2021-10-13 16:33                 ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Christoph Hellwig @ 2021-10-13 16:13 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, linux-block

On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
> > On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
> >> Something like this?
> > 
> > Something like that.  Although without making the new function inline
> > this will generate an indirect call.
> 
> It will, but I don't see how we can have it both ways...

Last time I played with these optimizations, gcc did inline function
pointers passed to an __always_inline function into the calling
function.  That is, you can keep the source-level abstraction but get the
same code generation as if it were open-coded.
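
For illustration, the pattern being described, reduced to a generic
kernel-style sketch (all names here are placeholders, nothing from the
series itself; __always_inline comes from <linux/compiler_types.h>):

/*
 * Because do_batch() is __always_inline, the compiler sees the concrete
 * callback at every call site; constant propagation then turns fn(i)
 * into a direct call to handle_one(), which may itself be inlined.
 */
static __always_inline void do_batch(int n, void (*fn)(int))
{
	int i;

	for (i = 0; i < n; i++)
		fn(i);
}

static void handle_one(int i)
{
	/* per-item work goes here */
}

static void handle_all(int n)
{
	do_batch(n, handle_one);	/* typically ends up as a direct call */
}

This is the same shape nvme_complete_batch() takes further down in the
thread, with nvme_pci_unmap_rq() as the directly passed callback.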

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 16:13               ` Christoph Hellwig
@ 2021-10-13 16:33                 ` Jens Axboe
  2021-10-13 16:45                   ` Jens Axboe
  0 siblings, 1 reply; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 16:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 10:13 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>> Something like this?
>>>
>>> Something like that.  Although without making the new function inline
>>> this will generate an indirect call.
>>
>> It will, but I don't see how we can have it both ways...
> 
> Last time I played with these optimizations, gcc did inline function
> pointers passed to an __always_inline function into the calling
> function.  That is, you can keep the source-level abstraction but get the
> same code generation as if it were open-coded.

Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
work; let me give it a whirl.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 6/9] nvme: add support for batched completion of polled IO
  2021-10-13 16:33                 ` Jens Axboe
@ 2021-10-13 16:45                   ` Jens Axboe
  0 siblings, 0 replies; 34+ messages in thread
From: Jens Axboe @ 2021-10-13 16:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block

On 10/13/21 10:33 AM, Jens Axboe wrote:
> On 10/13/21 10:13 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 10:04:36AM -0600, Jens Axboe wrote:
>>> On 10/13/21 9:50 AM, Christoph Hellwig wrote:
>>>> On Wed, Oct 13, 2021 at 09:42:23AM -0600, Jens Axboe wrote:
>>>>> Something like this?
>>>>
>>>> Something like that.  Although without making the new function inline
>>>> this will generate an indirect call.
>>>
>>> It will, but I don't see how we can have it both ways...
>>
>> Last time I played with these optimizations, gcc did inline function
>> pointers passed to an __always_inline function into the calling
>> function.  That is, you can keep the source-level abstraction but get the
>> same code generation as if it were open-coded.
> 
> Gotcha, so place nvme_complete_batch() in the nvme.h header. That might
> work, let me give it a whirl.

Looks like this, and it is indeed not an indirect call. I'll re-test the
series and send out a v2.


commit 70ed26ee000d626105b0d807545af42e6d95a323
Author: Jens Axboe <axboe@kernel.dk>
Date:   Fri Oct 8 05:59:37 2021 -0600

    nvme: add support for batched completion of polled IO
    
    Take advantage of struct io_batch, if passed in to the nvme poll handler.
    If it's set, rather than complete each request individually inline, store
    them in the io_batch list. We only do so for requests that will complete
    successfully, anything else will be completed inline as before.
    
    Add an mq_ops->complete_batch() handler to do the post-processing of
    the io_batch list once polling is complete.
    
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..4b14258a3bac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+	nvme_cleanup_cmd(req);
+	nvme_end_req_zoned(req);
+	req->status = BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..e0c079f704cf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,23 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_batch *iob,
+						void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_complete_batch_req(req);
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..ae253f6f5c80 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@ static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2021-10-13 16:45 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-12 18:17 [PATCHSET 0/9] Batched completions Jens Axboe
2021-10-12 18:17 ` [PATCH 1/9] block: add a struct io_batch argument to fops->iopoll() Jens Axboe
2021-10-12 18:25   ` Bart Van Assche
2021-10-12 18:28     ` Jens Axboe
2021-10-12 18:17 ` [PATCH 2/9] sbitmap: add helper to clear a batch of tags Jens Axboe
2021-10-12 18:29   ` Bart Van Assche
2021-10-12 18:34     ` Jens Axboe
2021-10-12 18:17 ` [PATCH 3/9] sbitmap: test bit before calling test_and_set_bit() Jens Axboe
2021-10-12 18:17 ` [PATCH 4/9] block: add support for blk_mq_end_request_batch() Jens Axboe
2021-10-12 18:32   ` Bart Van Assche
2021-10-12 18:55     ` Jens Axboe
2021-10-12 18:17 ` [PATCH 5/9] nvme: move the fast path nvme error and disposition helpers Jens Axboe
2021-10-13  6:57   ` Christoph Hellwig
2021-10-13  6:57     ` Christoph Hellwig
2021-10-13 14:41     ` Jens Axboe
2021-10-13 15:11       ` Christoph Hellwig
2021-10-12 18:17 ` [PATCH 6/9] nvme: add support for batched completion of polled IO Jens Axboe
2021-10-13  7:08   ` Christoph Hellwig
2021-10-13 15:10     ` Jens Axboe
2021-10-13 15:16       ` Christoph Hellwig
2021-10-13 15:42         ` Jens Axboe
2021-10-13 15:49           ` Jens Axboe
2021-10-13 15:50           ` Christoph Hellwig
2021-10-13 16:04             ` Jens Axboe
2021-10-13 16:13               ` Christoph Hellwig
2021-10-13 16:33                 ` Jens Axboe
2021-10-13 16:45                   ` Jens Axboe
2021-10-13  9:09   ` John Garry
2021-10-13 15:07     ` Jens Axboe
2021-10-12 18:17 ` [PATCH 7/9] block: assign batch completion handler in blk_poll() Jens Axboe
2021-10-12 18:17 ` [PATCH 8/9] io_uring: utilize the io_batch infrastructure for more efficient polled IO Jens Axboe
2021-10-12 18:17 ` [PATCH 9/9] nvme: wire up completion batching for the IRQ path Jens Axboe
2021-10-13  7:12   ` Christoph Hellwig
2021-10-13 15:04     ` Jens Axboe
