linux-block.vger.kernel.org archive mirror
* [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough
@ 2022-09-27  1:44 Jens Axboe
  2022-09-27  1:44 ` [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request() Jens Axboe
                   ` (6 more replies)
  0 siblings, 7 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme

Hi,

The passthrough IO path currently doesn't do any request allocation
batching like we do for normal IO. Wire this up through the usual
blk_mq_alloc_request() allocation helper.

Similarly, we don't currently support batched completions for
passthrough IO. Allow the request->end_io() handler to return whether
or not it retains ownership of the request. By default all handlers
are converted to returning RQ_END_IO_NONE, which retains the existing
behavior. But with that in place, we can tweak the nvme uring_cmd
end_io handler to pass back ownership, and hence enable completion
batching for passthrough requests as well.

This is good for a ~10% improvement in passthrough performance. For
a non-drive-limited test case, passthrough IO is now more efficient
than the regular bdev O_DIRECT path.

Changes since v1:
- Remove spurious semicolon
- Cleanup struct nvme_uring_cmd_pdu handling

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request()
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
@ 2022-09-27  1:44 ` Jens Axboe
  2022-09-28 13:38   ` Anuj gupta
  2022-09-27  1:44 ` [PATCH 2/5] block: change request end_io handler to pass back a return value Jens Axboe
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe

The filesystem IO path can take advantage of allocating batches of
requests, if the underlying submitter tells the block layer about it
through the blk_plug. For passthrough IO, the exported API is the
blk_mq_alloc_request() helper, and that one does not allow for
request caching.

Wire up request caching for blk_mq_alloc_request(), which is generally
done without having a bio available upfront.
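
For illustration only (not part of this patch), a passthrough
submitter opts into the cache simply by running its allocations under
a plug and hinting at the expected batch size; the queue pointer and
the loop bound below are placeholders:

	struct blk_plug plug;
	int i;

	blk_start_plug_nr_ios(&plug, 32);	/* expect ~32 allocations */
	for (i = 0; i < 32; i++) {
		struct request *rq;

		rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
		if (IS_ERR(rq))
			break;
		/* fill in the passthrough command, then issue it */
		blk_execute_rq_nowait(rq, false);
	}
	blk_finish_plug(&plug);

The first allocation fills plug->cached_rq with a batch of requests,
and subsequent allocations are served from that cache without going
back to the tag map.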

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 80 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 71 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c11949d66163..d3a9f8b9c7ee 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,25 +510,87 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 					alloc_time_ns);
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
-		blk_mq_req_flags_t flags)
+static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
+					    struct blk_plug *plug,
+					    blk_opf_t opf,
+					    blk_mq_req_flags_t flags)
 {
 	struct blk_mq_alloc_data data = {
 		.q		= q,
 		.flags		= flags,
 		.cmd_flags	= opf,
-		.nr_tags	= 1,
+		.nr_tags	= plug->nr_ios,
+		.cached_rq	= &plug->cached_rq,
 	};
 	struct request *rq;
-	int ret;
 
-	ret = blk_queue_enter(q, flags);
-	if (ret)
-		return ERR_PTR(ret);
+	if (blk_queue_enter(q, flags))
+		return NULL;
+
+	plug->nr_ios = 1;
 
 	rq = __blk_mq_alloc_requests(&data);
-	if (!rq)
-		goto out_queue_exit;
+	if (unlikely(!rq))
+		blk_queue_exit(q);
+	return rq;
+}
+
+static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
+						   blk_opf_t opf,
+						   blk_mq_req_flags_t flags)
+{
+	struct blk_plug *plug = current->plug;
+	struct request *rq;
+
+	if (!plug)
+		return NULL;
+	if (rq_list_empty(plug->cached_rq)) {
+		if (plug->nr_ios == 1)
+			return NULL;
+		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
+		if (rq)
+			goto got_it;
+		return NULL;
+	}
+	rq = rq_list_peek(&plug->cached_rq);
+	if (!rq || rq->q != q)
+		return NULL;
+
+	if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+		return NULL;
+	if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+		return NULL;
+
+	plug->cached_rq = rq_list_next(rq);
+got_it:
+	rq->cmd_flags = opf;
+	INIT_LIST_HEAD(&rq->queuelist);
+	return rq;
+}
+
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
+		blk_mq_req_flags_t flags)
+{
+	struct request *rq;
+
+	rq = blk_mq_alloc_cached_request(q, opf, flags);
+	if (!rq) {
+		struct blk_mq_alloc_data data = {
+			.q		= q,
+			.flags		= flags,
+			.cmd_flags	= opf,
+			.nr_tags	= 1,
+		};
+		int ret;
+
+		ret = blk_queue_enter(q, flags);
+		if (ret)
+			return ERR_PTR(ret);
+
+		rq = __blk_mq_alloc_requests(&data);
+		if (!rq)
+			goto out_queue_exit;
+	}
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/5] block: change request end_io handler to pass back a return value
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
  2022-09-27  1:44 ` [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request() Jens Axboe
@ 2022-09-27  1:44 ` Jens Axboe
  2022-09-27  1:44 ` [PATCH 3/5] block: allow end_io based requests in the completion batch handling Jens Axboe
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe

In preparation for allowing the end_io handler to pass ownership of
the request back to the block layer rather than retain it, change the
handler to return a value. Every handler is simply converted to
returning RQ_END_IO_NONE, so there should be no functional changes
with this patch.
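
As a sketch of the conversion pattern (the handler name and its
private data are placeholders, the shape mirrors the hunks below), a
typical driver end_io handler now looks like:

	static enum rq_end_io_ret my_end_io(struct request *rq,
					    blk_status_t err)
	{
		/* placeholder for driver-private completion data */
		struct my_completion *c = rq->end_io_data;

		complete(&c->done);
		blk_mq_free_request(rq);
		/* driver handled (and freed) the request itself */
		return RQ_END_IO_NONE;
	}

A later patch makes use of the other return value, RQ_END_IO_FREE,
which tells the block layer that it now owns the request and should
free it.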

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c                  | 10 +++++++---
 block/blk-mq.c                     | 14 +++++++++-----
 drivers/md/dm-rq.c                 |  4 +++-
 drivers/nvme/host/core.c           |  6 ++++--
 drivers/nvme/host/ioctl.c          |  5 ++++-
 drivers/nvme/host/pci.c            | 12 ++++++++----
 drivers/nvme/target/passthru.c     |  5 +++--
 drivers/scsi/scsi_error.c          |  4 +++-
 drivers/scsi/sg.c                  |  9 +++++----
 drivers/scsi/st.c                  |  4 +++-
 drivers/target/target_core_pscsi.c |  6 ++++--
 drivers/ufs/core/ufshpb.c          |  8 ++++++--
 include/linux/blk-mq.h             |  7 ++++++-
 13 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index d20a0c6b2c66..ac850f4d9c4c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -218,7 +218,8 @@ static void blk_flush_complete_seq(struct request *rq,
 	blk_kick_flush(q, fq, cmd_flags);
 }
 
-static void flush_end_io(struct request *flush_rq, blk_status_t error)
+static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
+				       blk_status_t error)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
@@ -232,7 +233,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	if (!req_ref_put_and_test(flush_rq)) {
 		fq->rq_status = error;
 		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
-		return;
+		return RQ_END_IO_NONE;
 	}
 
 	blk_account_io_flush(flush_rq);
@@ -269,6 +270,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	}
 
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+	return RQ_END_IO_NONE;
 }
 
 bool is_flush_rq(struct request *rq)
@@ -354,7 +356,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	blk_flush_queue_rq(flush_rq, false);
 }
 
-static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
+static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
+					       blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -376,6 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 
 	blk_mq_sched_restart(hctx);
+	return RQ_END_IO_NONE;
 }
 
 /**
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d3a9f8b9c7ee..a4e018c82b7c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1001,7 +1001,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 
 	if (rq->end_io) {
 		rq_qos_done(rq->q, rq);
-		rq->end_io(rq, error);
+		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+			blk_mq_free_request(rq);
 	} else {
 		blk_mq_free_request(rq);
 	}
@@ -1287,12 +1288,13 @@ struct blk_rq_wait {
 	blk_status_t ret;
 };
 
-static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
 {
 	struct blk_rq_wait *wait = rq->end_io_data;
 
 	wait->ret = ret;
 	complete(&wait->done);
+	return RQ_END_IO_NONE;
 }
 
 bool blk_rq_is_poll(struct request *rq)
@@ -1526,10 +1528,12 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 
 void blk_mq_put_rq_ref(struct request *rq)
 {
-	if (is_flush_rq(rq))
-		rq->end_io(rq, 0);
-	else if (req_ref_put_and_test(rq))
+	if (is_flush_rq(rq)) {
+		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+			blk_mq_free_request(rq);
+	} else if (req_ref_put_and_test(rq)) {
 		__blk_mq_free_request(rq);
+	}
 }
 
 static bool blk_mq_check_expired(struct request *rq, void *priv)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 4f49bbcce4f1..3001b10a3fbf 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -292,11 +292,13 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
 	dm_complete_request(rq, error);
 }
 
-static void end_clone_request(struct request *clone, blk_status_t error)
+static enum rq_end_io_ret end_clone_request(struct request *clone,
+					    blk_status_t error)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
 	dm_complete_request(tio->orig, error);
+	return RQ_END_IO_NONE;
 }
 
 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 13080a017ecf..f946f85e7a66 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1177,7 +1177,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);
 }
 
-static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
+static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
+						 blk_status_t status)
 {
 	struct nvme_ctrl *ctrl = rq->end_io_data;
 	unsigned long flags;
@@ -1189,7 +1190,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 		dev_err(ctrl->device,
 			"failed nvme_keep_alive_end_io error=%d\n",
 				status);
-		return;
+		return RQ_END_IO_NONE;
 	}
 
 	ctrl->comp_seen = false;
@@ -1200,6 +1201,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 	spin_unlock_irqrestore(&ctrl->lock, flags);
 	if (startka)
 		nvme_queue_keep_alive_work(ctrl);
+	return RQ_END_IO_NONE;
 }
 
 static void nvme_keep_alive_work(struct work_struct *work)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 548aca8b5b9f..c80b3ecca5c8 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -385,7 +385,8 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
 	io_uring_cmd_done(ioucmd, status, result);
 }
 
-static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
+static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+						blk_status_t err)
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
@@ -404,6 +405,8 @@ static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
 		nvme_uring_task_cb(ioucmd);
 	else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
+
+	return RQ_END_IO_NONE;
 }
 
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09b5d62f342b..361f09f23648 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1268,7 +1268,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-static void abort_endio(struct request *req, blk_status_t error)
+static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 
@@ -1276,6 +1276,7 @@ static void abort_endio(struct request *req, blk_status_t error)
 		 "Abort status: 0x%x", nvme_req(req)->status);
 	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
 	blk_mq_free_request(req);
+	return RQ_END_IO_NONE;
 }
 
 static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
@@ -2447,22 +2448,25 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	return result;
 }
 
-static void nvme_del_queue_end(struct request *req, blk_status_t error)
+static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
+					     blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
 	blk_mq_free_request(req);
 	complete(&nvmeq->delete_done);
+	return RQ_END_IO_NONE;
 }
 
-static void nvme_del_cq_end(struct request *req, blk_status_t error)
+static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
+					  blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
 	if (error)
 		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
-	nvme_del_queue_end(req, error);
+	return nvme_del_queue_end(req, error);
 }
 
 static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 6f39a29828b1..4876ccaac55b 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -240,14 +240,15 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 	blk_mq_free_request(rq);
 }
 
-static void nvmet_passthru_req_done(struct request *rq,
-				    blk_status_t blk_status)
+static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
+						  blk_status_t blk_status)
 {
 	struct nvmet_req *req = rq->end_io_data;
 
 	req->cqe->result = nvme_req(rq)->result;
 	nvmet_req_complete(req, nvme_req(rq)->status);
 	blk_mq_free_request(rq);
+	return RQ_END_IO_NONE;
 }
 
 static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 448748e3fba5..786fb963cf3f 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -2004,9 +2004,11 @@ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd)
 	}
 }
 
-static void eh_lock_door_done(struct request *req, blk_status_t status)
+static enum rq_end_io_ret eh_lock_door_done(struct request *req,
+					    blk_status_t status)
 {
 	blk_mq_free_request(req);
+	return RQ_END_IO_NONE;
 }
 
 /**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 340b050ad28d..94c5e9a9309c 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -177,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 } Sg_device;
 
 /* tasklet or soft irq callback */
-static void sg_rq_end_io(struct request *rq, blk_status_t status);
+static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status);
 static int sg_start_req(Sg_request *srp, unsigned char *cmd);
 static int sg_finish_rem_req(Sg_request * srp);
 static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
@@ -1311,7 +1311,7 @@ sg_rq_end_io_usercontext(struct work_struct *work)
  * This function is a "bottom half" handler that is called by the mid
  * level when a command is completed (or has failed).
  */
-static void
+static enum rq_end_io_ret
 sg_rq_end_io(struct request *rq, blk_status_t status)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
@@ -1324,11 +1324,11 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
 	int result, resid, done = 1;
 
 	if (WARN_ON(srp->done != 0))
-		return;
+		return RQ_END_IO_NONE;
 
 	sfp = srp->parentfp;
 	if (WARN_ON(sfp == NULL))
-		return;
+		return RQ_END_IO_NONE;
 
 	sdp = sfp->parentdp;
 	if (unlikely(atomic_read(&sdp->detaching)))
@@ -1406,6 +1406,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
 		INIT_WORK(&srp->ew.work, sg_rq_end_io_usercontext);
 		schedule_work(&srp->ew.work);
 	}
+	return RQ_END_IO_NONE;
 }
 
 static const struct file_operations sg_fops = {
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 850172a2b8f1..55e7c07ebe4c 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -512,7 +512,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 	atomic64_dec(&STp->stats->in_flight);
 }
 
-static void st_scsi_execute_end(struct request *req, blk_status_t status)
+static enum rq_end_io_ret st_scsi_execute_end(struct request *req,
+					      blk_status_t status)
 {
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
 	struct st_request *SRpnt = req->end_io_data;
@@ -532,6 +533,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
 
 	blk_rq_unmap_user(tmp);
 	blk_mq_free_request(req);
+	return RQ_END_IO_NONE;
 }
 
 static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index e6a967ddc08c..8a7306e5e133 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -39,7 +39,7 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev)
 }
 
 static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd);
-static void pscsi_req_done(struct request *, blk_status_t);
+static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t);
 
 /*	pscsi_attach_hba():
  *
@@ -1002,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
 	return 0;
 }
 
-static void pscsi_req_done(struct request *req, blk_status_t status)
+static enum rq_end_io_ret pscsi_req_done(struct request *req,
+					 blk_status_t status)
 {
 	struct se_cmd *cmd = req->end_io_data;
 	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
@@ -1029,6 +1030,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
 	}
 
 	blk_mq_free_request(req);
+	return RQ_END_IO_NONE;
 }
 
 static const struct target_backend_ops pscsi_ops = {
diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c
index a1a7a1175a5a..3d69a81c5b17 100644
--- a/drivers/ufs/core/ufshpb.c
+++ b/drivers/ufs/core/ufshpb.c
@@ -613,14 +613,17 @@ static void ufshpb_activate_subregion(struct ufshpb_lu *hpb,
 	srgn->srgn_state = HPB_SRGN_VALID;
 }
 
-static void ufshpb_umap_req_compl_fn(struct request *req, blk_status_t error)
+static enum rq_end_io_ret ufshpb_umap_req_compl_fn(struct request *req,
+						   blk_status_t error)
 {
 	struct ufshpb_req *umap_req = (struct ufshpb_req *)req->end_io_data;
 
 	ufshpb_put_req(umap_req->hpb, umap_req);
+	return RQ_END_IO_NONE;
 }
 
-static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error)
+static enum rq_end_io_ret ufshpb_map_req_compl_fn(struct request *req,
+						  blk_status_t error)
 {
 	struct ufshpb_req *map_req = (struct ufshpb_req *) req->end_io_data;
 	struct ufshpb_lu *hpb = map_req->hpb;
@@ -636,6 +639,7 @@ static void ufshpb_map_req_compl_fn(struct request *req, blk_status_t error)
 	spin_unlock_irqrestore(&hpb->rgn_state_lock, flags);
 
 	ufshpb_put_map_req(map_req->hpb, map_req);
+	return RQ_END_IO_NONE;
 }
 
 static void ufshpb_set_unmap_cmd(unsigned char *cdb, struct ufshpb_region *rgn)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 00a15808c137..e6fa49dd6196 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -14,7 +14,12 @@ struct blk_flush_queue;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_DEFAULT_RQ	128
 
-typedef void (rq_end_io_fn)(struct request *, blk_status_t);
+enum rq_end_io_ret {
+	RQ_END_IO_NONE,
+	RQ_END_IO_FREE,
+};
+
+typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
 
 /*
  * request flags */
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/5] block: allow end_io based requests in the completion batch handling
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
  2022-09-27  1:44 ` [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request() Jens Axboe
  2022-09-27  1:44 ` [PATCH 2/5] block: change request end_io handler to pass back a return value Jens Axboe
@ 2022-09-27  1:44 ` Jens Axboe
  2022-09-28 13:42   ` Anuj gupta
  2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe, Stefan Roesch

With end_io handlers now being able to potentially pass ownership of
the request upon completion, we can allow requests with end_io handlers
in the batch completion handling.
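
For context, a lightly paraphrased excerpt of the existing call site
in nvme_handle_cqe() (drivers/nvme/host/pci.c); with this patch,
requests that carry an end_io handler are no longer bounced out of the
batch:

	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
	    !blk_mq_add_to_batch(req, iob, nvme_req(req)->status,
				 nvme_pci_complete_batch))
		nvme_pci_complete_rq(req);

blk_mq_end_request_batch() then walks the io_comp_batch, invokes any
end_io handler, and only puts the request reference if the handler did
not keep ownership.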

Co-developed-by: Stefan Roesch <shr@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 13 +++++++++++--
 include/linux/blk-mq.h |  3 ++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a4e018c82b7c..a7dfe7a898a4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -823,8 +823,10 @@ static void blk_complete_request(struct request *req)
 	 * can find how many bytes remain in the request
 	 * later.
 	 */
-	req->bio = NULL;
-	req->__data_len = 0;
+	if (!req->end_io) {
+		req->bio = NULL;
+		req->__data_len = 0;
+	}
 }
 
 /**
@@ -1055,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 
 		rq_qos_done(rq->q, rq);
 
+		/*
+		 * If end_io handler returns NONE, then it still has
+		 * ownership of the request.
+		 */
+		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+			continue;
+
 		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		if (!req_ref_put_and_test(rq))
 			continue;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e6fa49dd6196..50811d0fb143 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -853,8 +853,9 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 				       struct io_comp_batch *iob, int ioerror,
 				       void (*complete)(struct io_comp_batch *))
 {
-	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
+	if (!iob || (req->rq_flags & RQF_ELV) || ioerror)
 		return false;
+
 	if (!iob->complete)
 		iob->complete = complete;
 	else if (iob->complete != complete)
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
                   ` (2 preceding siblings ...)
  2022-09-27  1:44 ` [PATCH 3/5] block: allow end_io based requests in the completion batch handling Jens Axboe
@ 2022-09-27  1:44 ` Jens Axboe
  2022-09-27  7:50   ` Christoph Hellwig
                     ` (2 more replies)
  2022-09-27  1:44 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
                   ` (2 subsequent siblings)
  6 siblings, 3 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe, Stefan Roesch

By splitting up the metadata and non-metadata end_io handling, we can
remove any request dependencies on the normal non-metadata IO path. This
is in preparation for enabling the normal IO passthrough path to pass
the ownership of the request back to the block layer.

Co-developed-by: Stefan Roesch <shr@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/ioctl.c | 79 ++++++++++++++++++++++++++++++---------
 1 file changed, 61 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index c80b3ecca5c8..9e356a6c96c2 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -349,9 +349,15 @@ struct nvme_uring_cmd_pdu {
 		struct bio *bio;
 		struct request *req;
 	};
-	void *meta; /* kernel-resident buffer */
-	void __user *meta_buffer;
 	u32 meta_len;
+	u32 nvme_status;
+	union {
+		struct {
+			void *meta; /* kernel-resident buffer */
+			void __user *meta_buffer;
+		};
+		u64 result;
+	} u;
 };
 
 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
@@ -360,11 +366,10 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
 	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
 }
 
-static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
+static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
 	struct request *req = pdu->req;
-	struct bio *bio = req->bio;
 	int status;
 	u64 result;
 
@@ -375,27 +380,39 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
 
 	result = le64_to_cpu(nvme_req(req)->result.u64);
 
-	if (pdu->meta)
-		status = nvme_finish_user_metadata(req, pdu->meta_buffer,
-					pdu->meta, pdu->meta_len, status);
-	if (bio)
-		blk_rq_unmap_user(bio);
+	if (pdu->meta_len)
+		status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
+					pdu->u.meta, pdu->meta_len, status);
+	if (req->bio)
+		blk_rq_unmap_user(req->bio);
 	blk_mq_free_request(req);
 
 	io_uring_cmd_done(ioucmd, status, result);
 }
 
+static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
+{
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	if (pdu->bio)
+		blk_rq_unmap_user(pdu->bio);
+
+	io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result);
+}
+
 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 						blk_status_t err)
 {
 	struct io_uring_cmd *ioucmd = req->end_io_data;
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	/* extract bio before reusing the same field for request */
-	struct bio *bio = pdu->bio;
 	void *cookie = READ_ONCE(ioucmd->cookie);
 
-	pdu->req = req;
-	req->bio = bio;
+	req->bio = pdu->bio;
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		pdu->nvme_status = -EINTR;
+	else
+		pdu->nvme_status = nvme_req(req)->status;
+	pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
 	 * For iopoll, complete it directly.
@@ -406,6 +423,29 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 
+	blk_mq_free_request(req);
+	return RQ_END_IO_NONE;
+}
+
+static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
+						     blk_status_t err)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+	void *cookie = READ_ONCE(ioucmd->cookie);
+
+	req->bio = pdu->bio;
+	pdu->req = req;
+
+	/*
+	 * For iopoll, complete it directly.
+	 * Otherwise, move the completion to task work.
+	 */
+	if (cookie != NULL && blk_rq_is_poll(req))
+		nvme_uring_task_meta_cb(ioucmd);
+	else
+		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
+
 	return RQ_END_IO_NONE;
 }
 
@@ -467,8 +507,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			blk_flags);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
-	req->end_io = nvme_uring_cmd_end_io;
-	req->end_io_data = ioucmd;
 
 	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
 		if (unlikely(!req->bio)) {
@@ -483,10 +521,15 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	}
 	/* to free bio on completion, as req->bio will be null at that time */
 	pdu->bio = req->bio;
-	pdu->meta = meta;
-	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
 	pdu->meta_len = d.metadata_len;
-
+	req->end_io_data = ioucmd;
+	if (pdu->meta_len) {
+		pdu->u.meta = meta;
+		pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
+		req->end_io = nvme_uring_cmd_end_io_meta;
+	} else {
+		req->end_io = nvme_uring_cmd_end_io;
+	}
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
 }
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/5] nvme: enable batched completions of passthrough IO
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
                   ` (3 preceding siblings ...)
  2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
@ 2022-09-27  1:44 ` Jens Axboe
  2022-09-28 13:55   ` Anuj gupta
  2022-09-28 14:47   ` Sagi Grimberg
  2022-09-28 13:23 ` [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Anuj gupta
  2022-09-28 17:05 ` Keith Busch
  6 siblings, 2 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-27  1:44 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe, Stefan Roesch

Now that the normal passthrough end_io path doesn't need the request
anymore, we can kill the explicit blk_mq_free_request() and just pass
back RQ_END_IO_FREE instead. This enables the batched completion path
to free batches of requests at a time.

This brings passthrough IO performance at least on par with bdev-based
O_DIRECT with io_uring. With this and batched allocations, peak
performance goes from 110M IOPS to 122M IOPS. For IRQ-based IO,
passthrough is now also about 10% faster than before, going from ~61M
to ~67M IOPS.

Co-developed-by: Stefan Roesch <shr@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/ioctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 9e356a6c96c2..d9633f426690 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -423,8 +423,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 
-	blk_mq_free_request(req);
-	return RQ_END_IO_NONE;
+	return RQ_END_IO_FREE;
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions
  2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
@ 2022-09-27  7:50   ` Christoph Hellwig
  2022-09-28 13:51   ` Anuj gupta
  2022-09-28 14:47   ` Sagi Grimberg
  2 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2022-09-27  7:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme, Stefan Roesch

On Mon, Sep 26, 2022 at 07:44:19PM -0600, Jens Axboe wrote:
> By splitting up the metadata and non-metadata end_io handling, we can
> remove any request dependencies on the normal non-metadata IO path. This
> is in preparation for enabling the normal IO passthrough path to pass
> the ownership of the request back to the block layer.
> 
> Co-developed-by: Stefan Roesch <shr@fb.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
                   ` (4 preceding siblings ...)
  2022-09-27  1:44 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
@ 2022-09-28 13:23 ` Anuj gupta
  2022-09-28 14:22   ` Jens Axboe
  2022-09-28 17:05 ` Keith Busch
  6 siblings, 1 reply; 17+ messages in thread
From: Anuj gupta @ 2022-09-28 13:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme

On Tue, Sep 27, 2022 at 7:14 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> Hi,
>
> The passthrough IO path currently doesn't do any request allocation
> batching like we do for normal IO. Wire this up through the usual
> blk_mq_alloc_request() allocation helper.
>
> Similarly, we don't currently support batched completions for
> passthrough IO. Allow the request->end_io() handler to return back
> whether or not it retains ownership of the request. By default all
> handlers are converted to returning RQ_END_IO_NONE, which retains
> the existing behavior. But with that in place, we can tweak the
> nvme uring_cmd end_io handler to pass back ownership, and hence enable
> completion batching for passthrough requests as well.
>
> This is good for a 10% improvement for passthrough performance. For
> a non-drive limited test case, passthrough IO is now more efficient
> than the regular bdev O_DIRECT path.
>
> Changes since v1:
> - Remove spurious semicolon
> - Cleanup struct nvme_uring_cmd_pdu handling
>
> --
> Jens Axboe
>
>
I see an improvement of ~12% (2.34 to 2.63 MIOPS) with polling enabled and
an improvement of ~4% (1.84 to 1.92 MIOPS) with polling disabled using the
t/io_uring utility (in fio) in my setup with this patch series!

--
Anuj Gupta

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request()
  2022-09-27  1:44 ` [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request() Jens Axboe
@ 2022-09-28 13:38   ` Anuj gupta
  0 siblings, 0 replies; 17+ messages in thread
From: Anuj gupta @ 2022-09-28 13:38 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme

On Tue, Sep 27, 2022 at 7:19 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> The filesystem IO path can take advantage of allocating batches of
> requests, if the underlying submitter tells the block layer about it
> through the blk_plug. For passthrough IO, the exported API is the
> blk_mq_alloc_request() helper, and that one does not allow for
> request caching.
>
> Wire up request caching for blk_mq_alloc_request(), which is generally
> done without having a bio available upfront.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq.c | 80 ++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 71 insertions(+), 9 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c11949d66163..d3a9f8b9c7ee 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -510,25 +510,87 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
>                                         alloc_time_ns);
>  }
>
> -struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
> -               blk_mq_req_flags_t flags)
> +static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
> +                                           struct blk_plug *plug,
> +                                           blk_opf_t opf,
> +                                           blk_mq_req_flags_t flags)
>  {
>         struct blk_mq_alloc_data data = {
>                 .q              = q,
>                 .flags          = flags,
>                 .cmd_flags      = opf,
> -               .nr_tags        = 1,
> +               .nr_tags        = plug->nr_ios,
> +               .cached_rq      = &plug->cached_rq,
>         };
>         struct request *rq;
> -       int ret;
>
> -       ret = blk_queue_enter(q, flags);
> -       if (ret)
> -               return ERR_PTR(ret);
> +       if (blk_queue_enter(q, flags))
> +               return NULL;
> +
> +       plug->nr_ios = 1;
>
>         rq = __blk_mq_alloc_requests(&data);
> -       if (!rq)
> -               goto out_queue_exit;
> +       if (unlikely(!rq))
> +               blk_queue_exit(q);
> +       return rq;
> +}
> +
> +static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
> +                                                  blk_opf_t opf,
> +                                                  blk_mq_req_flags_t flags)
> +{
> +       struct blk_plug *plug = current->plug;
> +       struct request *rq;
> +
> +       if (!plug)
> +               return NULL;
> +       if (rq_list_empty(plug->cached_rq)) {
> +               if (plug->nr_ios == 1)
> +                       return NULL;
> +               rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
> +               if (rq)
> +                       goto got_it;
> +               return NULL;
> +       }
> +       rq = rq_list_peek(&plug->cached_rq);
> +       if (!rq || rq->q != q)
> +               return NULL;
> +
> +       if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
> +               return NULL;
> +       if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
> +               return NULL;
> +
> +       plug->cached_rq = rq_list_next(rq);
> +got_it:
> +       rq->cmd_flags = opf;
> +       INIT_LIST_HEAD(&rq->queuelist);
> +       return rq;
> +}
> +
> +struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
> +               blk_mq_req_flags_t flags)
> +{
> +       struct request *rq;
> +
> +       rq = blk_mq_alloc_cached_request(q, opf, flags);
> +       if (!rq) {
> +               struct blk_mq_alloc_data data = {
> +                       .q              = q,
> +                       .flags          = flags,
> +                       .cmd_flags      = opf,
> +                       .nr_tags        = 1,
> +               };
> +               int ret;
> +
> +               ret = blk_queue_enter(q, flags);
> +               if (ret)
> +                       return ERR_PTR(ret);
> +
> +               rq = __blk_mq_alloc_requests(&data);
> +               if (!rq)
> +                       goto out_queue_exit;
> +       }
>         rq->__data_len = 0;
>         rq->__sector = (sector_t) -1;
>         rq->bio = rq->biotail = NULL;
> --
> 2.35.1
>

A large chunk of this improvement in passthrough performance comes
from enabling request caching. On my setup, the performance improves from
2.34 to 2.54 MIOPS. I have tested this using the t/io_uring utility (in fio) and
I am using an Intel Optane Gen2 device.

Tested-by: Anuj Gupta <anuj20.g@samsung.com>

--
Anuj Gupta

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/5] block: allow end_io based requests in the completion batch handling
  2022-09-27  1:44 ` [PATCH 3/5] block: allow end_io based requests in the completion batch handling Jens Axboe
@ 2022-09-28 13:42   ` Anuj gupta
  0 siblings, 0 replies; 17+ messages in thread
From: Anuj gupta @ 2022-09-28 13:42 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme, Stefan Roesch

On Tue, Sep 27, 2022 at 7:20 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> With end_io handlers now being able to potentially pass ownership of
> the request upon completion, we can allow requests with end_io handlers
> in the batch completion handling.
>
> Co-developed-by: Stefan Roesch <shr@fb.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  block/blk-mq.c         | 13 +++++++++++--
>  include/linux/blk-mq.h |  3 ++-
>  2 files changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index a4e018c82b7c..a7dfe7a898a4 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -823,8 +823,10 @@ static void blk_complete_request(struct request *req)
>          * can find how many bytes remain in the request
>          * later.
>          */
> -       req->bio = NULL;
> -       req->__data_len = 0;
> +       if (!req->end_io) {
> +               req->bio = NULL;
> +               req->__data_len = 0;
> +       }
>  }
>
>  /**
> @@ -1055,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
>
>                 rq_qos_done(rq->q, rq);
>
> +               /*
> +                * If end_io handler returns NONE, then it still has
> +                * ownership of the request.
> +                */
> +               if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
> +                       continue;
> +
>                 WRITE_ONCE(rq->state, MQ_RQ_IDLE);
>                 if (!req_ref_put_and_test(rq))
>                         continue;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index e6fa49dd6196..50811d0fb143 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -853,8 +853,9 @@ static inline bool blk_mq_add_to_batch(struct request *req,
>                                        struct io_comp_batch *iob, int ioerror,
>                                        void (*complete)(struct io_comp_batch *))
>  {
> -       if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
> +       if (!iob || (req->rq_flags & RQF_ELV) || ioerror)
>                 return false;
> +
>         if (!iob->complete)
>                 iob->complete = complete;
>         else if (iob->complete != complete)
> --
> 2.35.1
>
Looks good.
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

--
Anuj Gupta

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions
  2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
  2022-09-27  7:50   ` Christoph Hellwig
@ 2022-09-28 13:51   ` Anuj gupta
  2022-09-28 14:47   ` Sagi Grimberg
  2 siblings, 0 replies; 17+ messages in thread
From: Anuj gupta @ 2022-09-28 13:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme, Stefan Roesch

On Tue, Sep 27, 2022 at 7:22 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> By splitting up the metadata and non-metadata end_io handling, we can
> remove any request dependencies on the normal non-metadata IO path. This
> is in preparation for enabling the normal IO passthrough path to pass
> the ownership of the request back to the block layer.
>
> Co-developed-by: Stefan Roesch <shr@fb.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/ioctl.c | 79 ++++++++++++++++++++++++++++++---------
>  1 file changed, 61 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index c80b3ecca5c8..9e356a6c96c2 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -349,9 +349,15 @@ struct nvme_uring_cmd_pdu {
>                 struct bio *bio;
>                 struct request *req;
>         };
> -       void *meta; /* kernel-resident buffer */
> -       void __user *meta_buffer;
>         u32 meta_len;
> +       u32 nvme_status;
> +       union {
> +               struct {
> +                       void *meta; /* kernel-resident buffer */
> +                       void __user *meta_buffer;
> +               };
> +               u64 result;
> +       } u;
>  };
>
>  static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
> @@ -360,11 +366,10 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
>         return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
>  }
>
> -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
> +static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd)
>  {
>         struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>         struct request *req = pdu->req;
> -       struct bio *bio = req->bio;
>         int status;
>         u64 result;
>
> @@ -375,27 +380,39 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
>
>         result = le64_to_cpu(nvme_req(req)->result.u64);
>
> -       if (pdu->meta)
> -               status = nvme_finish_user_metadata(req, pdu->meta_buffer,
> -                                       pdu->meta, pdu->meta_len, status);
> -       if (bio)
> -               blk_rq_unmap_user(bio);
> +       if (pdu->meta_len)
> +               status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
> +                                       pdu->u.meta, pdu->meta_len, status);
> +       if (req->bio)
> +               blk_rq_unmap_user(req->bio);
>         blk_mq_free_request(req);
>
>         io_uring_cmd_done(ioucmd, status, result);
>  }
>
> +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
> +{
> +       struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +
> +       if (pdu->bio)
> +               blk_rq_unmap_user(pdu->bio);
> +
> +       io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result);
> +}
> +
>  static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>                                                 blk_status_t err)
>  {
>         struct io_uring_cmd *ioucmd = req->end_io_data;
>         struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> -       /* extract bio before reusing the same field for request */
> -       struct bio *bio = pdu->bio;
>         void *cookie = READ_ONCE(ioucmd->cookie);
>
> -       pdu->req = req;
> -       req->bio = bio;
> +       req->bio = pdu->bio;
> +       if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
> +               pdu->nvme_status = -EINTR;
> +       else
> +               pdu->nvme_status = nvme_req(req)->status;
> +       pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
>
>         /*
>          * For iopoll, complete it directly.
> @@ -406,6 +423,29 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>         else
>                 io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>
> +       blk_mq_free_request(req);
> +       return RQ_END_IO_NONE;
> +}
> +
> +static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> +                                                    blk_status_t err)
> +{
> +       struct io_uring_cmd *ioucmd = req->end_io_data;
> +       struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
> +       void *cookie = READ_ONCE(ioucmd->cookie);
> +
> +       req->bio = pdu->bio;
> +       pdu->req = req;
> +
> +       /*
> +        * For iopoll, complete it directly.
> +        * Otherwise, move the completion to task work.
> +        */
> +       if (cookie != NULL && blk_rq_is_poll(req))
> +               nvme_uring_task_meta_cb(ioucmd);
> +       else
> +               io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
> +
>         return RQ_END_IO_NONE;
>  }
>
> @@ -467,8 +507,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>                         blk_flags);
>         if (IS_ERR(req))
>                 return PTR_ERR(req);
> -       req->end_io = nvme_uring_cmd_end_io;
> -       req->end_io_data = ioucmd;
>
>         if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
>                 if (unlikely(!req->bio)) {
> @@ -483,10 +521,15 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>         }
>         /* to free bio on completion, as req->bio will be null at that time */
>         pdu->bio = req->bio;
> -       pdu->meta = meta;
> -       pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
>         pdu->meta_len = d.metadata_len;
> -
> +       req->end_io_data = ioucmd;
> +       if (pdu->meta_len) {
> +               pdu->u.meta = meta;
> +               pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
> +               req->end_io = nvme_uring_cmd_end_io_meta;
> +       } else {
> +               req->end_io = nvme_uring_cmd_end_io;
> +       }
>         blk_execute_rq_nowait(req, false);
>         return -EIOCBQUEUED;
>  }
> --
> 2.35.1
>
Looks good.
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

--
Anuj Gupta

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 5/5] nvme: enable batched completions of passthrough IO
  2022-09-27  1:44 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
@ 2022-09-28 13:55   ` Anuj gupta
  2022-09-28 14:47   ` Sagi Grimberg
  1 sibling, 0 replies; 17+ messages in thread
From: Anuj gupta @ 2022-09-28 13:55 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme, Stefan Roesch

On Tue, Sep 27, 2022 at 7:19 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> Now that the normal passthrough end_io path doesn't need the request
> anymore, we can kill the explicit blk_mq_free_request() and just pass
> back RQ_END_IO_FREE instead. This enables the batched completion from
> freeing batches of requests at the time.
>
> This brings passthrough IO performance at least on par with bdev based
> O_DIRECT with io_uring. With this and batched allocations, peak performance
> goes from 110M IOPS to 122M IOPS. For IRQ based, passthrough is now also
> about 10% faster than previously, going from ~61M to ~67M IOPS.
>
> Co-developed-by: Stefan Roesch <shr@fb.com>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>  drivers/nvme/host/ioctl.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 9e356a6c96c2..d9633f426690 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -423,8 +423,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
>         else
>                 io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
>
> -       blk_mq_free_request(req);
> -       return RQ_END_IO_NONE;
> +       return RQ_END_IO_FREE;
>  }
>
>  static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
> --
> 2.35.1
>
Looks good to me.
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

--
Anuj Gupta

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough
  2022-09-28 13:23 ` [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Anuj gupta
@ 2022-09-28 14:22   ` Jens Axboe
  0 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-28 14:22 UTC (permalink / raw)
  To: Anuj gupta; +Cc: linux-block, linux-scsi, linux-nvme

On 9/28/22 7:23 AM, Anuj gupta wrote:
> On Tue, Sep 27, 2022 at 7:14 AM Jens Axboe <axboe@kernel.dk> wrote:
>>
>> Hi,
>>
>> The passthrough IO path currently doesn't do any request allocation
>> batching like we do for normal IO. Wire this up through the usual
>> blk_mq_alloc_request() allocation helper.
>>
>> Similarly, we don't currently support batched completions for
>> passthrough IO. Allow the request->end_io() handler to return back
>> whether or not it retains ownership of the request. By default all
>> handlers are converted to returning RQ_END_IO_NONE, which retains
>> the existing behavior. But with that in place, we can tweak the
>> nvme uring_cmd end_io handler to pass back ownership, and hence enable
>> completion batching for passthrough requests as well.
>>
>> This is good for a 10% improvement for passthrough performance. For
>> a non-drive limited test case, passthrough IO is now more efficient
>> than the regular bdev O_DIRECT path.
>>
>> Changes since v1:
>> - Remove spurious semicolon
>> - Cleanup struct nvme_uring_cmd_pdu handling
>>
>> --
>> Jens Axboe
>>
>>
> I see an improvement of ~12% (2.34 to 2.63 MIOPS) with polling enabled and
> an improvement of ~4% (1.84 to 1.92 MIOPS) with polling disabled using the
> t/io_uring utility (in fio) in my setup with this patch series!

Thanks for your testing! I'll add your reviewed-by to the series.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions
  2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
  2022-09-27  7:50   ` Christoph Hellwig
  2022-09-28 13:51   ` Anuj gupta
@ 2022-09-28 14:47   ` Sagi Grimberg
  2 siblings, 0 replies; 17+ messages in thread
From: Sagi Grimberg @ 2022-09-28 14:47 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: linux-scsi, linux-nvme, Stefan Roesch


> -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
> +static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd)
>   {
>   	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
>   	struct request *req = pdu->req;
> -	struct bio *bio = req->bio;

Unrelated change I think. But other than that, looks good,

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 5/5] nvme: enable batched completions of passthrough IO
  2022-09-27  1:44 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
  2022-09-28 13:55   ` Anuj gupta
@ 2022-09-28 14:47   ` Sagi Grimberg
  1 sibling, 0 replies; 17+ messages in thread
From: Sagi Grimberg @ 2022-09-28 14:47 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: linux-scsi, linux-nvme, Stefan Roesch

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough
  2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
                   ` (5 preceding siblings ...)
  2022-09-28 13:23 ` [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Anuj gupta
@ 2022-09-28 17:05 ` Keith Busch
  6 siblings, 0 replies; 17+ messages in thread
From: Keith Busch @ 2022-09-28 17:05 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-scsi, linux-nvme

Series looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 5/5] nvme: enable batched completions of passthrough IO
  2022-09-22 18:28 [PATCHSET " Jens Axboe
@ 2022-09-22 18:28 ` Jens Axboe
  0 siblings, 0 replies; 17+ messages in thread
From: Jens Axboe @ 2022-09-22 18:28 UTC (permalink / raw)
  To: linux-block; +Cc: linux-scsi, linux-nvme, Jens Axboe, Stefan Roesch

Now that the normal passthrough end_io path doesn't need the request
anymore, we can kill the explicit blk_mq_free_request() and just pass
back RQ_END_IO_FREE instead. This enables the batched completion path
to free batches of requests at a time.

This brings passthrough IO performance at least on par with bdev-based
O_DIRECT with io_uring. With this and batched allocations, peak
performance goes from 110M IOPS to 122M IOPS. For IRQ-based IO,
passthrough is now also about 10% faster than before, going from ~61M
to ~67M IOPS.

Co-developed-by: Stefan Roesch <shr@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/ioctl.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 1ccc9dd6d434..25f2f6df1602 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -430,8 +430,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	else
 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
 
-	blk_mq_free_request(req);
-	return RQ_END_IO_NONE;
+	return RQ_END_IO_FREE;
 }
 
 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
-- 
2.35.1


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2022-09-28 17:05 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-27  1:44 [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Jens Axboe
2022-09-27  1:44 ` [PATCH 1/5] block: enable batched allocation for blk_mq_alloc_request() Jens Axboe
2022-09-28 13:38   ` Anuj gupta
2022-09-27  1:44 ` [PATCH 2/5] block: change request end_io handler to pass back a return value Jens Axboe
2022-09-27  1:44 ` [PATCH 3/5] block: allow end_io based requests in the completion batch handling Jens Axboe
2022-09-28 13:42   ` Anuj gupta
2022-09-27  1:44 ` [PATCH 4/5] nvme: split out metadata vs non metadata end_io uring_cmd completions Jens Axboe
2022-09-27  7:50   ` Christoph Hellwig
2022-09-28 13:51   ` Anuj gupta
2022-09-28 14:47   ` Sagi Grimberg
2022-09-27  1:44 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
2022-09-28 13:55   ` Anuj gupta
2022-09-28 14:47   ` Sagi Grimberg
2022-09-28 13:23 ` [PATCHSET v2 0/5] Enable alloc caching and batched freeing for passthrough Anuj gupta
2022-09-28 14:22   ` Jens Axboe
2022-09-28 17:05 ` Keith Busch
  -- strict thread matches above, loose matches on Subject: below --
2022-09-22 18:28 [PATCHSET " Jens Axboe
2022-09-22 18:28 ` [PATCH 5/5] nvme: enable batched completions of passthrough IO Jens Axboe
