* [PATCH] ublk_drv: don't call task_work_add for queueing io commands
@ 2022-10-23 9:38 Ming Lei
2022-10-24 9:48 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-23 9:38 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, ZiyangZhang, Ming Lei
task_work_add() is used for waking ubq daemon task with one batch
of io requests/commands queued. However, task_work_add() isn't
exported for module code, and it is still debatable if the symbol
should be exported.
Fortunately we still have io_uring_cmd_complete_in_task() which just
can't handle batched wakeup for us.
Add one llist into ublk_queue and call io_uring_cmd_complete_in_task()
via current command for running them via task work.
This way cleans up current code a lot, meantime allow us to wakeup
ubq daemon task after queueing batched requests/io commands.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/block/ublk_drv.c | 130 ++++++++++++++++-----------------------
1 file changed, 52 insertions(+), 78 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 5afce6ffaadf..7963fba66dd1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -41,7 +41,7 @@
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
-#include <linux/task_work.h>
+#include <linux/llist.h>
#include <uapi/linux/ublk_cmd.h>
#define UBLK_MINORS (1U << MINORBITS)
@@ -56,12 +56,9 @@
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
-struct ublk_rq_data {
- struct callback_head work;
-};
-
struct ublk_uring_cmd_pdu {
struct request *req;
+ struct llist_node node;
};
/*
@@ -119,6 +116,8 @@ struct ublk_queue {
struct task_struct *ubq_daemon;
char *io_cmd_buf;
+ struct llist_head io_cmds;
+
unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz;
bool force_abort;
@@ -268,14 +267,6 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
-{
- if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
- !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
- return true;
- return false;
-}
-
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
if (ubq->flags & UBLK_F_NEED_GET_DATA)
@@ -761,20 +752,16 @@ static inline void __ublk_rq_task_work(struct request *req)
ubq_complete_io_cmd(io, UBLK_IO_RES_OK);
}
-static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
{
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
+ struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
__ublk_rq_task_work(pdu->req);
-}
-static void ublk_rq_task_work_fn(struct callback_head *work)
-{
- struct ublk_rq_data *data = container_of(work,
- struct ublk_rq_data, work);
- struct request *req = blk_mq_rq_from_pdu(data);
-
- __ublk_rq_task_work(req);
+ llist_for_each_entry(pdu, io_cmds, node)
+ __ublk_rq_task_work(pdu->req);
}
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -782,12 +769,16 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct ublk_queue *ubq = hctx->driver_data;
struct request *rq = bd->rq;
+ struct ublk_io *io = &ubq->ios[rq->tag];
+ struct io_uring_cmd *cmd = io->cmd;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
blk_status_t res;
/* fill iod to slot in io cmd buffer */
res = ublk_setup_iod(ubq, rq);
if (unlikely(res != BLK_STS_OK))
return BLK_STS_IOERR;
+
/* With recovery feature enabled, force_abort is set in
* ublk_stop_dev() before calling del_gendisk(). We have to
* abort all requeued and new rqs here to let del_gendisk()
@@ -802,42 +793,38 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(bd->rq);
- if (unlikely(ubq_daemon_is_dying(ubq))) {
- fail:
+ /*
+ * If the check pass, we know that this is a re-issued request aborted
+ * previously in monitor_work because the ubq_daemon(cmd's task) is
+ * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
+ * because this ioucmd's io_uring context may be freed now if no inflight
+ * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
+ *
+ * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+ * the tag). Then the request is re-started(allocating the tag) and we are here.
+ * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
+ * guarantees that here is a re-issued request aborted previously.
+ */
+ if (unlikely(ubq_daemon_is_dying(ubq) ||
+ (io->flags & UBLK_IO_FLAG_ABORTED))) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
- if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
- enum task_work_notify_mode notify_mode = bd->last ?
- TWA_SIGNAL_NO_IPI : TWA_NONE;
-
- if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode))
- goto fail;
- } else {
- struct ublk_io *io = &ubq->ios[rq->tag];
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ pdu->req = rq;
- /*
- * If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
- * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
- * because this ioucmd's io_uring context may be freed now if no inflight
- * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
- *
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
- * the tag). Then the request is re-started(allocating the tag) and we are here.
- * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
- * guarantees that here is a re-issued request aborted previously.
- */
- if ((io->flags & UBLK_IO_FLAG_ABORTED))
- goto fail;
-
- pdu->req = rq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
- }
+ /*
+ * Typical multiple producers and single consumer, it is just fine
+ * to use llist_add() in producer side and llist_del_all() in
+ * consumer side.
+ *
+ * The last command can't be added into list, otherwise it could
+ * be handled twice
+ */
+ if (bd->last)
+ io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
+ else
+ llist_add(&pdu->node, &ubq->io_cmds);
return BLK_STS_OK;
}
@@ -846,8 +833,8 @@ static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
struct ublk_queue *ubq = hctx->driver_data;
- if (ublk_can_use_task_work(ubq))
- __set_notify_signal(ubq->ubq_daemon);
+ if (!test_and_set_tsk_thread_flag(ubq->ubq_daemon, TIF_NOTIFY_SIGNAL))
+ wake_up_process(ubq->ubq_daemon);
}
static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
@@ -860,20 +847,10 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
return 0;
}
-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- init_task_work(&data->work, ublk_rq_task_work_fn);
- return 0;
-}
-
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
.commit_rqs = ublk_commit_rqs,
.init_hctx = ublk_init_hctx,
- .init_request = ublk_init_rq,
};
static int ublk_ch_open(struct inode *inode, struct file *filp)
@@ -1163,23 +1140,21 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
mutex_unlock(&ub->mutex);
}
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ __ublk_rq_task_work(pdu->req);
+}
+
static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
int tag, struct io_uring_cmd *cmd)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
- if (ublk_can_use_task_work(ubq)) {
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- /* should not fail since we call it just in ubq->ubq_daemon */
- task_work_add(ubq->ubq_daemon, &data->work, TWA_SIGNAL_NO_IPI);
- } else {
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->req = req;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
- }
+ pdu->req = req;
+ io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
}
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -1450,7 +1425,6 @@ static int ublk_add_tag_set(struct ublk_device *ub)
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
- ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
--
2.31.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-23 9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei
@ 2022-10-24 9:48 ` Ziyang Zhang
2022-10-24 13:20 ` Ming Lei
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-24 9:48 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/23 17:38, Ming Lei wrote:
> task_work_add() is used for waking ubq daemon task with one batch
> of io requests/commands queued. However, task_work_add() isn't
> exported for module code, and it is still debatable if the symbol
> should be exported.
>
> Fortunately we still have io_uring_cmd_complete_in_task() which just
> can't handle batched wakeup for us.
>
> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> via current command for running them via task work.
>
> This way cleans up current code a lot, meantime allow us to wakeup
> ubq daemon task after queueing batched requests/io commands.
>
Hi, Ming
This patch works and I have run some tests to compare current version(ucmd)
with your patch(ucmd-batch).
iodepth=128 numjobs=1 direct=1 bs=4k
--------------------------------------------
ublk loop target, the backend is a file.
IOPS(k)
type ucmd ucmd-batch
seq-read 54.7 54.2
rand-read 52.8 52.0
--------------------------------------------
ublk null target
IOPS(k)
type ucmd ucmd-batch
seq-read 257 257
rand-read 252 253
I find that io_req_task_work_add() puts task_work node into a llist
first, then it may call task_work_add() to run batched task_works. So do we really
need such llist in ublk_drv? I think io_uring has already considered task_work batch
optimization.
BTW, task_work_add() in ublk_drv achieves
higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
in ublk_drv.
Regards,
Zhang
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-24 9:48 ` Ziyang Zhang
@ 2022-10-24 13:20 ` Ming Lei
2022-10-25 3:15 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-24 13:20 UTC (permalink / raw)
To: Ziyang Zhang; +Cc: linux-block, Jens Axboe
Hello Ziyang,
On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> On 2022/10/23 17:38, Ming Lei wrote:
> > task_work_add() is used for waking ubq daemon task with one batch
> > of io requests/commands queued. However, task_work_add() isn't
> > exported for module code, and it is still debatable if the symbol
> > should be exported.
> >
> > Fortunately we still have io_uring_cmd_complete_in_task() which just
> > can't handle batched wakeup for us.
> >
> > Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> > via current command for running them via task work.
> >
> > This way cleans up current code a lot, meantime allow us to wakeup
> > ubq daemon task after queueing batched requests/io commands.
> >
>
>
> Hi, Ming
>
> This patch works and I have run some tests to compare current version(ucmd)
> with your patch(ucmd-batch).
>
> iodepth=128 numjobs=1 direct=1 bs=4k
>
> --------------------------------------------
> ublk loop target, the backend is a file.
> IOPS(k)
>
> type ucmd ucmd-batch
> seq-read 54.7 54.2
> rand-read 52.8 52.0
>
> --------------------------------------------
> ublk null target
> IOPS(k)
>
> type ucmd ucmd-batch
> seq-read 257 257
> rand-read 252 253
>
>
> I find that io_req_task_work_add() puts task_work node into a llist
> first, then it may call task_work_add() to run batched task_works. So do we really
> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> optimization.
>
> BTW, task_work_add() in ublk_drv achieves
> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> in ublk_drv.
Yeah, that is same with my observation, and motivation of this patch is
to get same performance with task_work_add by building ublk_drv as
module. One win of task_work_add() is that we get exact batching info
meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
what the patch is doing, but needs help of the following ublksrv patch:
https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
IOPS boost is observed on loop/001 by putting image on SSD in my test
VM.
Thanks,
Ming
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-24 13:20 ` Ming Lei
@ 2022-10-25 3:15 ` Ziyang Zhang
2022-10-25 7:19 ` Ming Lei
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25 3:15 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/24 21:20, Ming Lei wrote:
> Hello Ziyang,
>
> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>> On 2022/10/23 17:38, Ming Lei wrote:
>>> task_work_add() is used for waking ubq daemon task with one batch
>>> of io requests/commands queued. However, task_work_add() isn't
>>> exported for module code, and it is still debatable if the symbol
>>> should be exported.
>>>
>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>> can't handle batched wakeup for us.
>>>
>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>> via current command for running them via task work.
>>>
>>> This way cleans up current code a lot, meantime allow us to wakeup
>>> ubq daemon task after queueing batched requests/io commands.
>>>
>>
>>
>> Hi, Ming
>>
>> This patch works and I have run some tests to compare current version(ucmd)
>> with your patch(ucmd-batch).
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>> IOPS(k)
>>
>> type ucmd ucmd-batch
>> seq-read 54.7 54.2
>> rand-read 52.8 52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type ucmd ucmd-batch
>> seq-read 257 257
>> rand-read 252 253
>>
>>
>> I find that io_req_task_work_add() puts task_work node into a llist
>> first, then it may call task_work_add() to run batched task_works. So do we really
>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>> optimization.
>>
>> BTW, task_work_add() in ublk_drv achieves
>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>> in ublk_drv.
>
> Yeah, that is same with my observation, and motivation of this patch is
> to get same performance with task_work_add by building ublk_drv as
> module. One win of task_work_add() is that we get exact batching info
> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> what the patch is doing, but needs help of the following ublksrv patch:
>
> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>
> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> IOPS boost is observed on loop/001 by putting image on SSD in my test
> VM.
Hi Ming,
I have added this ublksrv patch and run the above test again.
I have also run ublksrv test: loop/001. Please check them.
Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+
--------
fio test
iodepth=128 numjobs=1 direct=1 bs=4k
ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
for each blk-mq rq.
ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
for the last blk-mq rq.
--------------------------------------------
ublk loop target, the backend is a file.
IOPS(k)
type ucmd ucmd-batch
seq-read 54.1 53.7
rand-read 52.0 52.0
--------------------------------------------
ublk null target
IOPS(k)
type ucmd ucmd-batch
seq-read 272 265
rand-read 262 260
------------
ublksrv test
-------------
ucmd
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66737
randread: jobs 1, iops 64935
randrw: jobs 1, iops read 32694 write 32710
rw(512k): jobs 1, iops read 772 write 819
-------------
ucmd-batch
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66720
randread: jobs 1, iops 65015
randrw: jobs 1, iops read 32743 write 32759
rw(512k): jobs 1, iops read 771 write 817
It seems that manually putting rqs into llist and calling
io_uring_cmd_complete_in_task() while handling the last rq does
not improve IOPS much.
io_req_task_work_add() puts the task_work node into an internal llist
first, then it may call task_work_add() to run batched task_works.
IMO, io_uring has already done such batch optimization and ublk_drv
does not need to add such llist.
Regards,
Zhang.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-25 3:15 ` Ziyang Zhang
@ 2022-10-25 7:19 ` Ming Lei
2022-10-25 7:46 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-25 7:19 UTC (permalink / raw)
To: Ziyang Zhang; +Cc: linux-block, Jens Axboe
On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> On 2022/10/24 21:20, Ming Lei wrote:
> > Hello Ziyang,
> >
> > On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >> On 2022/10/23 17:38, Ming Lei wrote:
> >>> task_work_add() is used for waking ubq daemon task with one batch
> >>> of io requests/commands queued. However, task_work_add() isn't
> >>> exported for module code, and it is still debatable if the symbol
> >>> should be exported.
> >>>
> >>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>> can't handle batched wakeup for us.
> >>>
> >>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>> via current command for running them via task work.
> >>>
> >>> This way cleans up current code a lot, meantime allow us to wakeup
> >>> ubq daemon task after queueing batched requests/io commands.
> >>>
> >>
> >>
> >> Hi, Ming
> >>
> >> This patch works and I have run some tests to compare current version(ucmd)
> >> with your patch(ucmd-batch).
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >> IOPS(k)
> >>
> >> type ucmd ucmd-batch
> >> seq-read 54.7 54.2
> >> rand-read 52.8 52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type ucmd ucmd-batch
> >> seq-read 257 257
> >> rand-read 252 253
> >>
> >>
> >> I find that io_req_task_work_add() puts task_work node into a llist
> >> first, then it may call task_work_add() to run batched task_works. So do we really
> >> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >> optimization.
> >>
> >> BTW, task_work_add() in ublk_drv achieves
> >> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >> in ublk_drv.
> >
> > Yeah, that is same with my observation, and motivation of this patch is
> > to get same performance with task_work_add by building ublk_drv as
> > module. One win of task_work_add() is that we get exact batching info
> > meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> > what the patch is doing, but needs help of the following ublksrv patch:
> >
> > https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> >
> > which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> > io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> > IOPS boost is observed on loop/001 by putting image on SSD in my test
> > VM.
>
> Hi Ming,
>
> I have added this ublksrv patch and run the above test again.
> I have also run ublksrv test: loop/001. Please check them.
>
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
>
> --------
> fio test
>
> iodepth=128 numjobs=1 direct=1 bs=4k
>
> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> for each blk-mq rq.
>
> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> for the last blk-mq rq.
>
> --------------------------------------------
> ublk loop target, the backend is a file.
>
> IOPS(k)
>
> type ucmd ucmd-batch
> seq-read 54.1 53.7
> rand-read 52.0 52.0
>
> --------------------------------------------
> ublk null target
> IOPS(k)
>
> type ucmd ucmd-batch
> seq-read 272 265
> rand-read 262 260
>
> ------------
> ublksrv test
>
> -------------
> ucmd
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66737
> randread: jobs 1, iops 64935
> randrw: jobs 1, iops read 32694 write 32710
> rw(512k): jobs 1, iops read 772 write 819
>
> -------------
> ucmd-batch
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66720
> randread: jobs 1, iops 65015
> randrw: jobs 1, iops read 32743 write 32759
> rw(512k): jobs 1, iops read 771 write 817
>
>
> It seems that manually putting rqs into llist and calling
> io_uring_cmd_complete_in_task() while handling the last rq does
> not improve IOPS much.
>
> io_req_task_work_add() puts task_work node into a internal llist
> first, then it may call task_work_add() to run batched task_works.
> IMO, io_uring has already done such batch optimization and ublk_drv
> does not need to add such llist.
The difference is just how batching is handled, looks blk-mq's batch info
doesn't matter any more. In my test, looks the perf improvement is mainly
made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
same perf with task_work_add()(ublk_drv is builtin) when building
ublk_drv as module?
Thanks,
Ming
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-25 7:19 ` Ming Lei
@ 2022-10-25 7:46 ` Ziyang Zhang
2022-10-25 8:43 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25 7:46 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/25 15:19, Ming Lei wrote:
> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>> On 2022/10/24 21:20, Ming Lei wrote:
>>> Hello Ziyang,
>>>
>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>> exported for module code, and it is still debatable if the symbol
>>>>> should be exported.
>>>>>
>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>> can't handle batched wakeup for us.
>>>>>
>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>> via current command for running them via task work.
>>>>>
>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>
>>>>
>>>>
>>>> Hi, Ming
>>>>
>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>> with your patch(ucmd-batch).
>>>>
>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>
>>>> --------------------------------------------
>>>> ublk loop target, the backend is a file.
>>>> IOPS(k)
>>>>
>>>> type ucmd ucmd-batch
>>>> seq-read 54.7 54.2
>>>> rand-read 52.8 52.0
>>>>
>>>> --------------------------------------------
>>>> ublk null target
>>>> IOPS(k)
>>>>
>>>> type ucmd ucmd-batch
>>>> seq-read 257 257
>>>> rand-read 252 253
>>>>
>>>>
>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>> optimization.
>>>>
>>>> BTW, task_work_add() in ublk_drv achieves
>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>> in ublk_drv.
>>>
>>> Yeah, that is same with my observation, and motivation of this patch is
>>> to get same performance with task_work_add by building ublk_drv as
>>> module. One win of task_work_add() is that we get exact batching info
>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>
>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>
>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>> VM.
>>
>> Hi Ming,
>>
>> I have added this ublksrv patch and run the above test again.
>> I have also run ublksrv test: loop/001. Please check them.
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>> for each blk-mq rq.
>>
>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>> for the last blk-mq rq.
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type ucmd ucmd-batch
>> seq-read 54.1 53.7
>> rand-read 52.0 52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type ucmd ucmd-batch
>> seq-read 272 265
>> rand-read 262 260
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66737
>> randread: jobs 1, iops 64935
>> randrw: jobs 1, iops read 32694 write 32710
>> rw(512k): jobs 1, iops read 772 write 819
>>
>> -------------
>> ucmd-batch
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66720
>> randread: jobs 1, iops 65015
>> randrw: jobs 1, iops read 32743 write 32759
>> rw(512k): jobs 1, iops read 771 write 817
>>
>>
>> It seems that manually putting rqs into llist and calling
>> io_uring_cmd_complete_in_task() while handling the last rq does
>> not improve IOPS much.
>>
>> io_req_task_work_add() puts task_work node into a internal llist
>> first, then it may call task_work_add() to run batched task_works.
>> IMO, io_uring has already done such batch optimization and ublk_drv
>> does not need to add such llist.
>
> The difference is just how batching is handled, looks blk-mq's batch info
> doesn't matter any more. In my test, looks the perf improvement is mainly
> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
ublk_drv does not improve IOPS.
>
> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> same perf with task_work_add()(ublk_drv is builtin) when building
> ublk_drv as module?
>
OK.
Regards,
Zhang
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-25 7:46 ` Ziyang Zhang
@ 2022-10-25 8:43 ` Ziyang Zhang
2022-10-25 15:17 ` Ming Lei
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-25 8:43 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/25 15:46, Ziyang Zhang wrote:
> On 2022/10/25 15:19, Ming Lei wrote:
>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>>> On 2022/10/24 21:20, Ming Lei wrote:
>>>> Hello Ziyang,
>>>>
>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>>> exported for module code, and it is still debatable if the symbol
>>>>>> should be exported.
>>>>>>
>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>>> can't handle batched wakeup for us.
>>>>>>
>>>>>> Add one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>>> via current command for running them via task work.
>>>>>>
>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>>
>>>>>
>>>>>
>>>>> Hi, Ming
>>>>>
>>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>>> with your patch(ucmd-batch).
>>>>>
>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>
>>>>> --------------------------------------------
>>>>> ublk loop target, the backend is a file.
>>>>> IOPS(k)
>>>>>
>>>>> type ucmd ucmd-batch
>>>>> seq-read 54.7 54.2
>>>>> rand-read 52.8 52.0
>>>>>
>>>>> --------------------------------------------
>>>>> ublk null target
>>>>> IOPS(k)
>>>>>
>>>>> type ucmd ucmd-batch
>>>>> seq-read 257 257
>>>>> rand-read 252 253
>>>>>
>>>>>
>>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>>> optimization.
>>>>>
>>>>> BTW, task_work_add() in ublk_drv achieves
>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>>> in ublk_drv.
>>>>
>>>> Yeah, that is same with my observation, and motivation of this patch is
>>>> to get same performance with task_work_add by building ublk_drv as
>>>> module. One win of task_work_add() is that we get exact batching info
>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>>
>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>>
>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>>> VM.
>>>
>>> Hi Ming,
>>>
>>> I have added this ublksrv patch and run the above test again.
>>> I have also run ublksrv test: loop/001. Please check them.
>>>
>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>>> 64GB MEM, CentOS 8, kernel 6.0+
>>>
>>> --------
>>> fio test
>>>
>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>
>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>>> for each blk-mq rq.
>>>
>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>>> for the last blk-mq rq.
>>>
>>> --------------------------------------------
>>> ublk loop target, the backend is a file.
>>>
>>> IOPS(k)
>>>
>>> type ucmd ucmd-batch
>>> seq-read 54.1 53.7
>>> rand-read 52.0 52.0
>>>
>>> --------------------------------------------
>>> ublk null target
>>> IOPS(k)
>>>
>>> type ucmd ucmd-batch
>>> seq-read 272 265
>>> rand-read 262 260
>>>
>>> ------------
>>> ublksrv test
>>>
>>> -------------
>>> ucmd
>>>
>>> running loop/001
>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>> randwrite: jobs 1, iops 66737
>>> randread: jobs 1, iops 64935
>>> randrw: jobs 1, iops read 32694 write 32710
>>> rw(512k): jobs 1, iops read 772 write 819
>>>
>>> -------------
>>> ucmd-batch
>>>
>>> running loop/001
>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>> randwrite: jobs 1, iops 66720
>>> randread: jobs 1, iops 65015
>>> randrw: jobs 1, iops read 32743 write 32759
>>> rw(512k): jobs 1, iops read 771 write 817
>>>
>>>
>>> It seems that manually putting rqs into llist and calling
>>> io_uring_cmd_complete_in_task() while handling the last rq does
>>> not improve IOPS much.
>>>
>>> io_req_task_work_add() puts task_work node into a internal llist
>>> first, then it may call task_work_add() to run batched task_works.
>>> IMO, io_uring has already done such batch optimization and ublk_drv
>>> does not need to add such llist.
>>
>> The difference is just how batching is handled, looks blk-mq's batch info
>> doesn't matter any more. In my test, looks the perf improvement is mainly
>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
>
> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> ublk_drv does not improve IOPS.
>
>>
>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
>> same perf with task_work_add()(ublk_drv is builtin) when building
>> ublk_drv as module?
>>
>
> OK.
>
Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+
with IORING_SETUP_COOP_TASKRUN, without this kernel patch
ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
tw: task_work_add(), ublk is built-in.
--------
fio test
iodepth=128 numjobs=1 direct=1 bs=4k
--------------------------------------------
ublk loop target, the backend is a file.
IOPS(k)
type ucmd tw
seq-read 54.1 53.8
rand-read 52.0 52.0
--------------------------------------------
ublk null target
IOPS(k)
type ucmd tw
seq-read 272 286
rand-read 262 278
------------
ublksrv test
-------------
ucmd
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66737
randread: jobs 1, iops 64935
randrw: jobs 1, iops read 32694 write 32710
rw(512k): jobs 1, iops read 772 write 819
running null/001
fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 715863
randread: jobs 1, iops 758449
randrw: jobs 1, iops read 357407 write 357183
rw(512k): jobs 1, iops read 5895 write 5875
-------------
tw
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66856
randread: jobs 1, iops 65015
randrw: jobs 1, iops read 32751 write 32767
rw(512k): jobs 1, iops read 776 write 823
running null/001
fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 739450
randread: jobs 1, iops 787500
randrw: jobs 1, iops read 372956 write 372831
rw(512k): jobs 1, iops read 5798 write 5777
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-25 8:43 ` Ziyang Zhang
@ 2022-10-25 15:17 ` Ming Lei
2022-10-26 10:32 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-25 15:17 UTC (permalink / raw)
To: Ziyang Zhang; +Cc: linux-block, Jens Axboe
On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
> On 2022/10/25 15:46, Ziyang Zhang wrote:
> > On 2022/10/25 15:19, Ming Lei wrote:
> >> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> >>> On 2022/10/24 21:20, Ming Lei wrote:
> >>>> Hello Ziyang,
> >>>>
> >>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >>>>> On 2022/10/23 17:38, Ming Lei wrote:
> >>>>>> task_work_add() is used for waking ubq daemon task with one batch
> >>>>>> of io requests/commands queued. However, task_work_add() isn't
> >>>>>> exported for module code, and it is still debatable if the symbol
> >>>>>> should be exported.
> >>>>>>
> >>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>>>>> can't handle batched wakeup for us.
> >>>>>>
> >>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>>>>> via current command for running them via task work.
> >>>>>>
> >>>>>> This way cleans up current code a lot, meantime allow us to wakeup
> >>>>>> ubq daemon task after queueing batched requests/io commands.
> >>>>>>
> >>>>>
> >>>>>
> >>>>> Hi, Ming
> >>>>>
> >>>>> This patch works and I have run some tests to compare current version(ucmd)
> >>>>> with your patch(ucmd-batch).
> >>>>>
> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk loop target, the backend is a file.
> >>>>> IOPS(k)
> >>>>>
> >>>>> type ucmd ucmd-batch
> >>>>> seq-read 54.7 54.2
> >>>>> rand-read 52.8 52.0
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk null target
> >>>>> IOPS(k)
> >>>>>
> >>>>> type ucmd ucmd-batch
> >>>>> seq-read 257 257
> >>>>> rand-read 252 253
> >>>>>
> >>>>>
> >>>>> I find that io_req_task_work_add() puts task_work node into a llist
> >>>>> first, then it may call task_work_add() to run batched task_works. So do we really
> >>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >>>>> optimization.
> >>>>>
> >>>>> BTW, task_work_add() in ublk_drv achieves
> >>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >>>>> in ublk_drv.
> >>>>
> >>>> Yeah, that is same with my observation, and motivation of this patch is
> >>>> to get same performance with task_work_add by building ublk_drv as
> >>>> module. One win of task_work_add() is that we get exact batching info
> >>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> >>>> what the patch is doing, but needs help of the following ublksrv patch:
> >>>>
> >>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> >>>>
> >>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> >>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> >>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
> >>>> VM.
> >>>
> >>> Hi Ming,
> >>>
> >>> I have added this ublksrv patch and run the above test again.
> >>> I have also run ublksrv test: loop/001. Please check them.
> >>>
> >>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >>> 64GB MEM, CentOS 8, kernel 6.0+
> >>>
> >>> --------
> >>> fio test
> >>>
> >>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>
> >>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> >>> for each blk-mq rq.
> >>>
> >>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> >>> for the last blk-mq rq.
> >>>
> >>> --------------------------------------------
> >>> ublk loop target, the backend is a file.
> >>>
> >>> IOPS(k)
> >>>
> >>> type ucmd ucmd-batch
> >>> seq-read 54.1 53.7
> >>> rand-read 52.0 52.0
> >>>
> >>> --------------------------------------------
> >>> ublk null target
> >>> IOPS(k)
> >>>
> >>> type ucmd ucmd-batch
> >>> seq-read 272 265
> >>> rand-read 262 260
> >>>
> >>> ------------
> >>> ublksrv test
> >>>
> >>> -------------
> >>> ucmd
> >>>
> >>> running loop/001
> >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>> randwrite: jobs 1, iops 66737
> >>> randread: jobs 1, iops 64935
> >>> randrw: jobs 1, iops read 32694 write 32710
> >>> rw(512k): jobs 1, iops read 772 write 819
> >>>
> >>> -------------
> >>> ucmd-batch
> >>>
> >>> running loop/001
> >>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>> randwrite: jobs 1, iops 66720
> >>> randread: jobs 1, iops 65015
> >>> randrw: jobs 1, iops read 32743 write 32759
> >>> rw(512k): jobs 1, iops read 771 write 817
> >>>
> >>>
> >>> It seems that manually putting rqs into llist and calling
> >>> io_uring_cmd_complete_in_task() while handling the last rq does
> >>> not improve IOPS much.
> >>>
> >>> io_req_task_work_add() puts task_work node into a internal llist
> >>> first, then it may call task_work_add() to run batched task_works.
> >>> IMO, io_uring has already done such batch optimization and ublk_drv
> >>> does not need to add such llist.
> >>
> >> The difference is just how batching is handled, looks blk-mq's batch info
> >> doesn't matter any more. In my test, looks the perf improvement is mainly
> >> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
> >
> > I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> > ublk_drv does not improve IOPS.
> >
> >>
> >> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> >> same perf with task_work_add()(ublk_drv is builtin) when building
> >> ublk_drv as module?
> >>
> >
> > OK.
> >
>
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>
> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> tw: task_work_add(), ublk is built-in.
>
>
> --------
> fio test
>
> iodepth=128 numjobs=1 direct=1 bs=4k
>
> --------------------------------------------
> ublk loop target, the backend is a file.
>
> IOPS(k)
>
> type ucmd tw
> seq-read 54.1 53.8
> rand-read 52.0 52.0
>
> --------------------------------------------
> ublk null target
> IOPS(k)
>
> type ucmd tw
> seq-read 272 286
> rand-read 262 278
>
>
> ------------
> ublksrv test
>
> -------------
> ucmd
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66737
> randread: jobs 1, iops 64935
> randrw: jobs 1, iops read 32694 write 32710
> rw(512k): jobs 1, iops read 772 write 819
>
> running null/001
> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 715863
> randread: jobs 1, iops 758449
> randrw: jobs 1, iops read 357407 write 357183
> rw(512k): jobs 1, iops read 5895 write 5875
>
> -------------
> tw
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66856
> randread: jobs 1, iops 65015
> randrw: jobs 1, iops read 32751 write 32767
> rw(512k): jobs 1, iops read 776 write 823
>
> running null/001
> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 739450
> randread: jobs 1, iops 787500
> randrw: jobs 1, iops read 372956 write 372831
> rw(512k): jobs 1, iops read 5798 write 5777
It looks like the gap isn't big between ucmd and tw when running null/001, in
which the fio io process should saturate the CPU. Probably we
should avoid touching 'cmd'/'pdu'/'io' in ublk_queue_rq() since this
data should be cold at that time.
Can you apply the following delta patch against the current patch(
"ublk_drv: don't call task_work_add for queueing io commands") and
compare with task_work_add()?
From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 25 Oct 2022 11:01:25 -0400
Subject: [PATCH] ublk: follow up change
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 7963fba66dd1..18db337094c1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -56,9 +56,12 @@
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
+struct ublk_rq_data {
+ struct llist_node node;
+};
+
struct ublk_uring_cmd_pdu {
struct request *req;
- struct llist_node node;
};
/*
@@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
*
* (2) current->flags & PF_EXITING.
*/
- if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+ if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
+ || (io->flags & UBLK_IO_FLAG_ABORTED))) {
__ublk_abort_rq(ubq, req);
return;
}
@@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+ struct ublk_rq_data *data;
__ublk_rq_task_work(pdu->req);
- llist_for_each_entry(pdu, io_cmds, node)
- __ublk_rq_task_work(pdu->req);
+ llist_for_each_entry(data, io_cmds, node)
+ __ublk_rq_task_work(blk_mq_rq_from_pdu(data));
}
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct ublk_queue *ubq = hctx->driver_data;
struct request *rq = bd->rq;
- struct ublk_io *io = &ubq->ios[rq->tag];
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
blk_status_t res;
/* fill iod to slot in io cmd buffer */
@@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
* Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
* guarantees that here is a re-issued request aborted previously.
*/
- if (unlikely(ubq_daemon_is_dying(ubq) ||
- (io->flags & UBLK_IO_FLAG_ABORTED))) {
+ if (unlikely(ubq_daemon_is_dying(ubq))) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
- pdu->req = rq;
-
/*
* Typical multiple producers and single consumer, it is just fine
* to use llist_add() in producer side and llist_del_all() in
@@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
* The last command can't be added into list, otherwise it could
* be handled twice
*/
- if (bd->last)
+ if (bd->last) {
+ struct ublk_io *io = &ubq->ios[rq->tag];
+ struct io_uring_cmd *cmd = io->cmd;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->req = rq;
io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
- else
- llist_add(&pdu->node, &ubq->io_cmds);
+ } else {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+
+ llist_add(&data->node, &ubq->io_cmds);
+ }
return BLK_STS_OK;
}
@@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
ub->tag_set.numa_node = NUMA_NO_NODE;
ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
ub->tag_set.driver_data = ub;
return blk_mq_alloc_tag_set(&ub->tag_set);
}
--
2.31.1
Thanks,
Ming
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-25 15:17 ` Ming Lei
@ 2022-10-26 10:32 ` Ziyang Zhang
2022-10-26 11:29 ` Ming Lei
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-26 10:32 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/25 23:17, Ming Lei wrote:
> On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
>> On 2022/10/25 15:46, Ziyang Zhang wrote:
>>> On 2022/10/25 15:19, Ming Lei wrote:
>>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
>>>>> On 2022/10/24 21:20, Ming Lei wrote:
>>>>>> Hello Ziyang,
>>>>>>
>>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
>>>>>>> On 2022/10/23 17:38, Ming Lei wrote:
>>>>>>>> task_work_add() is used for waking ubq daemon task with one batch
>>>>>>>> of io requests/commands queued. However, task_work_add() isn't
>>>>>>>> exported for module code, and it is still debatable if the symbol
>>>>>>>> should be exported.
>>>>>>>>
>>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
>>>>>>>> can't handle batched wakeup for us.
>>>>>>>>
>>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
>>>>>>>> via current command for running them via task work.
>>>>>>>>
>>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
>>>>>>>> ubq daemon task after queueing batched requests/io commands.
>>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Hi, Ming
>>>>>>>
>>>>>>> This patch works and I have run some tests to compare current version(ucmd)
>>>>>>> with your patch(ucmd-batch).
>>>>>>>
>>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>>>
>>>>>>> --------------------------------------------
>>>>>>> ublk loop target, the backend is a file.
>>>>>>> IOPS(k)
>>>>>>>
>>>>>>> type ucmd ucmd-batch
>>>>>>> seq-read 54.7 54.2
>>>>>>> rand-read 52.8 52.0
>>>>>>>
>>>>>>> --------------------------------------------
>>>>>>> ublk null target
>>>>>>> IOPS(k)
>>>>>>>
>>>>>>> type ucmd ucmd-batch
>>>>>>> seq-read 257 257
>>>>>>> rand-read 252 253
>>>>>>>
>>>>>>>
>>>>>>> I find that io_req_task_work_add() puts task_work node into a llist
>>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
>>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
>>>>>>> optimization.
>>>>>>>
>>>>>>> BTW, task_work_add() in ublk_drv achieves
>>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
>>>>>>> in ublk_drv.
>>>>>>
>>>>>> Yeah, that is same with my observation, and motivation of this patch is
>>>>>> to get same performance with task_work_add by building ublk_drv as
>>>>>> module. One win of task_work_add() is that we get exact batching info
>>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
>>>>>> what the patch is doing, but needs help of the following ublksrv patch:
>>>>>>
>>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
>>>>>>
>>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
>>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
>>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
>>>>>> VM.
>>>>>
>>>>> Hi Ming,
>>>>>
>>>>> I have added this ublksrv patch and run the above test again.
>>>>> I have also run ublksrv test: loop/001. Please check them.
>>>>>
>>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>>>>> 64GB MEM, CentOS 8, kernel 6.0+
>>>>>
>>>>> --------
>>>>> fio test
>>>>>
>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
>>>>>
>>>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
>>>>> for each blk-mq rq.
>>>>>
>>>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
>>>>> for the last blk-mq rq.
>>>>>
>>>>> --------------------------------------------
>>>>> ublk loop target, the backend is a file.
>>>>>
>>>>> IOPS(k)
>>>>>
>>>>> type ucmd ucmd-batch
>>>>> seq-read 54.1 53.7
>>>>> rand-read 52.0 52.0
>>>>>
>>>>> --------------------------------------------
>>>>> ublk null target
>>>>> IOPS(k)
>>>>>
>>>>> type ucmd ucmd-batch
>>>>> seq-read 272 265
>>>>> rand-read 262 260
>>>>>
>>>>> ------------
>>>>> ublksrv test
>>>>>
>>>>> -------------
>>>>> ucmd
>>>>>
>>>>> running loop/001
>>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>>> randwrite: jobs 1, iops 66737
>>>>> randread: jobs 1, iops 64935
>>>>> randrw: jobs 1, iops read 32694 write 32710
>>>>> rw(512k): jobs 1, iops read 772 write 819
>>>>>
>>>>> -------------
>>>>> ucmd-batch
>>>>>
>>>>> running loop/001
>>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>>>>> randwrite: jobs 1, iops 66720
>>>>> randread: jobs 1, iops 65015
>>>>> randrw: jobs 1, iops read 32743 write 32759
>>>>> rw(512k): jobs 1, iops read 771 write 817
>>>>>
>>>>>
>>>>> It seems that manually putting rqs into llist and calling
>>>>> io_uring_cmd_complete_in_task() while handling the last rq does
>>>>> not improve IOPS much.
>>>>>
>>>>> io_req_task_work_add() puts task_work node into a internal llist
>>>>> first, then it may call task_work_add() to run batched task_works.
>>>>> IMO, io_uring has already done such batch optimization and ublk_drv
>>>>> does not need to add such llist.
>>>>
>>>> The difference is just how batching is handled, looks blk-mq's batch info
>>>> doesn't matter any more. In my test, looks the perf improvement is mainly
>>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
>>>
>>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
>>> ublk_drv does not improve IOPS.
>>>
>>>>
>>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
>>>> same perf with task_work_add()(ublk_drv is builtin) when building
>>>> ublk_drv as module?
>>>>
>>>
>>> OK.
>>>
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>>
>> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
>> tw: task_work_add(), ublk is built-in.
>>
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type ucmd tw
>> seq-read 54.1 53.8
>> rand-read 52.0 52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type ucmd tw
>> seq-read 272 286
>> rand-read 262 278
>>
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66737
>> randread: jobs 1, iops 64935
>> randrw: jobs 1, iops read 32694 write 32710
>> rw(512k): jobs 1, iops read 772 write 819
>>
>> running null/001
>> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 715863
>> randread: jobs 1, iops 758449
>> randrw: jobs 1, iops read 357407 write 357183
>> rw(512k): jobs 1, iops read 5895 write 5875
>>
>> -------------
>> tw
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66856
>> randread: jobs 1, iops 65015
>> randrw: jobs 1, iops read 32751 write 32767
>> rw(512k): jobs 1, iops read 776 write 823
>>
>> running null/001
>> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 739450
>> randread: jobs 1, iops 787500
>> randrw: jobs 1, iops read 372956 write 372831
>> rw(512k): jobs 1, iops read 5798 write 5777
>
> Looks the gap isn't big between ucmd and tw when running null/001, in
> which the fio io process should saturate the CPU. Probably we
> should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these
> data should be cold at that time.
>
> Can you apply the following delta patch against the current patch(
> "ublk_drv: don't call task_work_add for queueing io commands") and
> compare with task_work_add()?
>
> From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
> From: Ming Lei <ming.lei@redhat.com>
> Date: Tue, 25 Oct 2022 11:01:25 -0400
> Subject: [PATCH] ublk: follow up change
>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
> 1 file changed, 22 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 7963fba66dd1..18db337094c1 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -56,9 +56,12 @@
> /* All UBLK_PARAM_TYPE_* should be included here */
> #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
>
> +struct ublk_rq_data {
> + struct llist_node node;
> +};
> +
> struct ublk_uring_cmd_pdu {
> struct request *req;
> - struct llist_node node;
> };
>
> /*
> @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
> *
> * (2) current->flags & PF_EXITING.
> */
> - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
> + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
> + || (io->flags & UBLK_IO_FLAG_ABORTED))) {
> __ublk_abort_rq(ubq, req);
> return;
We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work.
We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because
the io_uring ctx may be freed at that time and the ioucmd task_work is invalid (freed).
Please see the comment around it for more detail.
> }
> @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
> struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
> struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
> + struct ublk_rq_data *data;
>
> __ublk_rq_task_work(pdu->req);
>
> - llist_for_each_entry(pdu, io_cmds, node)
> - __ublk_rq_task_work(pdu->req);
> + llist_for_each_entry(data, io_cmds, node)
> + __ublk_rq_task_work(blk_mq_rq_from_pdu(data));
> }
>
> static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> {
> struct ublk_queue *ubq = hctx->driver_data;
> struct request *rq = bd->rq;
> - struct ublk_io *io = &ubq->ios[rq->tag];
> - struct io_uring_cmd *cmd = io->cmd;
> - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> blk_status_t res;
>
> /* fill iod to slot in io cmd buffer */
> @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
> * guarantees that here is a re-issued request aborted previously.
> */
> - if (unlikely(ubq_daemon_is_dying(ubq) ||
> - (io->flags & UBLK_IO_FLAG_ABORTED))) {
> + if (unlikely(ubq_daemon_is_dying(ubq))) {
> __ublk_abort_rq(ubq, rq);
> return BLK_STS_OK;
> }
>
> - pdu->req = rq;
> -
> /*
> * Typical multiple producers and single consumer, it is just fine
> * to use llist_add() in producer side and llist_del_all() in
> @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> * The last command can't be added into list, otherwise it could
> * be handled twice
> */
> - if (bd->last)
> + if (bd->last) {
> + struct ublk_io *io = &ubq->ios[rq->tag];
> + struct io_uring_cmd *cmd = io->cmd;
> + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> +
> + pdu->req = rq;
> io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
> - else
> - llist_add(&pdu->node, &ubq->io_cmds);
> + } else {
> + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> +
> + llist_add(&data->node, &ubq->io_cmds);
> + }
>
> return BLK_STS_OK;
> }
> @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
> ub->tag_set.queue_depth = ub->dev_info.queue_depth;
> ub->tag_set.numa_node = NUMA_NO_NODE;
> ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
> + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
> ub->tag_set.driver_data = ub;
> return blk_mq_alloc_tag_set(&ub->tag_set);
> }
Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
64GB MEM, CentOS 8, kernel 6.0+
with IORING_SETUP_COOP_TASKRUN, without this kernel patch
ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
tw: task_work_add(), ublk is built-in.
--------
fio test
iodepth=128 numjobs=1 direct=1 bs=4k
--------------------------------------------
ublk loop target, the backend is a file.
IOPS(k)
type ucmd tw ucmd-not-touch-pdu
seq-read 54.1 53.8 53.6
rand-read 52.0 52.0 52.0
--------------------------------------------
ublk null target
IOPS(k)
type ucmd tw ucmd-not-touch-pdu
seq-read 272 286 275
rand-read 262 278 269
------------
ublksrv test
-------------
ucmd
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66737
randread: jobs 1, iops 64935
randrw: jobs 1, iops read 32694 write 32710
rw(512k): jobs 1, iops read 772 write 819
running null/001
fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 715863
randread: jobs 1, iops 758449
randrw: jobs 1, iops read 357407 write 357183
rw(512k): jobs 1, iops read 5895 write 5875
-------------
tw
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66856
randread: jobs 1, iops 65015
randrw: jobs 1, iops read 32751 write 32767
rw(512k): jobs 1, iops read 776 write 823
running null/001
fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 739450
randread: jobs 1, iops 787500
randrw: jobs 1, iops read 372956 write 372831
rw(512k): jobs 1, iops read 5798 write 5777
-------------
ucmd-not-touch-pdu
running loop/001
fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 66754
randread: jobs 1, iops 65032
randrw: jobs 1, iops read 32776 write 32792
rw(512k): jobs 1, iops read 772 write 818
running null/001
fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
randwrite: jobs 1, iops 725334
randread: jobs 1, iops 741105
randrw: jobs 1, iops read 360285 write 360047
rw(512k): jobs 1, iops read 5770 write 5748
Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
But it is worse than using task_work_add().
Regards,
Zhang
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-26 10:32 ` Ziyang Zhang
@ 2022-10-26 11:29 ` Ming Lei
2022-10-27 3:00 ` Ziyang Zhang
0 siblings, 1 reply; 12+ messages in thread
From: Ming Lei @ 2022-10-26 11:29 UTC (permalink / raw)
To: Ziyang Zhang; +Cc: linux-block, Jens Axboe
On Wed, Oct 26, 2022 at 06:32:46PM +0800, Ziyang Zhang wrote:
> On 2022/10/25 23:17, Ming Lei wrote:
> > On Tue, Oct 25, 2022 at 04:43:56PM +0800, Ziyang Zhang wrote:
> >> On 2022/10/25 15:46, Ziyang Zhang wrote:
> >>> On 2022/10/25 15:19, Ming Lei wrote:
> >>>> On Tue, Oct 25, 2022 at 11:15:57AM +0800, Ziyang Zhang wrote:
> >>>>> On 2022/10/24 21:20, Ming Lei wrote:
> >>>>>> Hello Ziyang,
> >>>>>>
> >>>>>> On Mon, Oct 24, 2022 at 05:48:51PM +0800, Ziyang Zhang wrote:
> >>>>>>> On 2022/10/23 17:38, Ming Lei wrote:
> >>>>>>>> task_work_add() is used for waking ubq daemon task with one batch
> >>>>>>>> of io requests/commands queued. However, task_work_add() isn't
> >>>>>>>> exported for module code, and it is still debatable if the symbol
> >>>>>>>> should be exported.
> >>>>>>>>
> >>>>>>>> Fortunately we still have io_uring_cmd_complete_in_task() which just
> >>>>>>>> can't handle batched wakeup for us.
> >>>>>>>>
> >>>>>>>> Add one one llist into ublk_queue and call io_uring_cmd_complete_in_task()
> >>>>>>>> via current command for running them via task work.
> >>>>>>>>
> >>>>>>>> This way cleans up current code a lot, meantime allow us to wakeup
> >>>>>>>> ubq daemon task after queueing batched requests/io commands.
> >>>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> Hi, Ming
> >>>>>>>
> >>>>>>> This patch works and I have run some tests to compare current version(ucmd)
> >>>>>>> with your patch(ucmd-batch).
> >>>>>>>
> >>>>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>>>
> >>>>>>> --------------------------------------------
> >>>>>>> ublk loop target, the backend is a file.
> >>>>>>> IOPS(k)
> >>>>>>>
> >>>>>>> type ucmd ucmd-batch
> >>>>>>> seq-read 54.7 54.2
> >>>>>>> rand-read 52.8 52.0
> >>>>>>>
> >>>>>>> --------------------------------------------
> >>>>>>> ublk null target
> >>>>>>> IOPS(k)
> >>>>>>>
> >>>>>>> type ucmd ucmd-batch
> >>>>>>> seq-read 257 257
> >>>>>>> rand-read 252 253
> >>>>>>>
> >>>>>>>
> >>>>>>> I find that io_req_task_work_add() puts task_work node into a llist
> >>>>>>> first, then it may call task_work_add() to run batched task_works. So do we really
> >>>>>>> need such llist in ublk_drv? I think io_uring has already considered task_work batch
> >>>>>>> optimization.
> >>>>>>>
> >>>>>>> BTW, task_work_add() in ublk_drv achieves
> >>>>>>> higher IOPS(about 5-10% on my machine) than io_uring_cmd_complete_in_task()
> >>>>>>> in ublk_drv.
> >>>>>>
> >>>>>> Yeah, that is same with my observation, and motivation of this patch is
> >>>>>> to get same performance with task_work_add by building ublk_drv as
> >>>>>> module. One win of task_work_add() is that we get exact batching info
> >>>>>> meantime only send TWA_SIGNAL_NO_IPI for whole batch, that is basically
> >>>>>> what the patch is doing, but needs help of the following ublksrv patch:
> >>>>>>
> >>>>>> https://github.com/ming1/ubdsrv/commit/dce6d1d222023c1641292713b311ced01e6dc548
> >>>>>>
> >>>>>> which sets IORING_SETUP_COOP_TASKRUN for ublksrv's uring, then
> >>>>>> io_uring_cmd_complete_in_task will notify via TWA_SIGNAL_NO_IPI, and 5+%
> >>>>>> IOPS boost is observed on loop/001 by putting image on SSD in my test
> >>>>>> VM.
> >>>>>
> >>>>> Hi Ming,
> >>>>>
> >>>>> I have added this ublksrv patch and run the above test again.
> >>>>> I have also run ublksrv test: loop/001. Please check them.
> >>>>>
> >>>>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >>>>> 64GB MEM, CentOS 8, kernel 6.0+
> >>>>>
> >>>>> --------
> >>>>> fio test
> >>>>>
> >>>>> iodepth=128 numjobs=1 direct=1 bs=4k
> >>>>>
> >>>>> ucmd: without your kernel patch. Run io_uring_cmd_complete_in_task()
> >>>>> for each blk-mq rq.
> >>>>>
> >>>>> ucmd-batch: with your kernel patch. Run io_uring_cmd_complete_in_task()
> >>>>> for the last blk-mq rq.
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk loop target, the backend is a file.
> >>>>>
> >>>>> IOPS(k)
> >>>>>
> >>>>> type ucmd ucmd-batch
> >>>>> seq-read 54.1 53.7
> >>>>> rand-read 52.0 52.0
> >>>>>
> >>>>> --------------------------------------------
> >>>>> ublk null target
> >>>>> IOPS(k)
> >>>>>
> >>>>> type ucmd ucmd-batch
> >>>>> seq-read 272 265
> >>>>> rand-read 262 260
> >>>>>
> >>>>> ------------
> >>>>> ublksrv test
> >>>>>
> >>>>> -------------
> >>>>> ucmd
> >>>>>
> >>>>> running loop/001
> >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>>> randwrite: jobs 1, iops 66737
> >>>>> randread: jobs 1, iops 64935
> >>>>> randrw: jobs 1, iops read 32694 write 32710
> >>>>> rw(512k): jobs 1, iops read 772 write 819
> >>>>>
> >>>>> -------------
> >>>>> ucmd-batch
> >>>>>
> >>>>> running loop/001
> >>>>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_F56a3), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >>>>> randwrite: jobs 1, iops 66720
> >>>>> randread: jobs 1, iops 65015
> >>>>> randrw: jobs 1, iops read 32743 write 32759
> >>>>> rw(512k): jobs 1, iops read 771 write 817
> >>>>>
> >>>>>
> >>>>> It seems that manually putting rqs into llist and calling
> >>>>> io_uring_cmd_complete_in_task() while handling the last rq does
> >>>>> not improve IOPS much.
> >>>>>
> >>>>> io_req_task_work_add() puts task_work node into a internal llist
> >>>>> first, then it may call task_work_add() to run batched task_works.
> >>>>> IMO, io_uring has already done such batch optimization and ublk_drv
> >>>>> does not need to add such llist.
> >>>>
> >>>> The difference is just how batching is handled, looks blk-mq's batch info
> >>>> doesn't matter any more. In my test, looks the perf improvement is mainly
> >>>> made by enabling IORING_SETUP_COOP_TASKRUN in ublksrv.
> >>>
> >>> I guess only IORING_SETUP_COOP_TASKRUN helps improve IOPS. The llist in
> >>> ublk_drv does not improve IOPS.
> >>>
> >>>>
> >>>> Can you check if enabling IORING_SETUP_COOP_TASKRUN only can reach
> >>>> same perf with task_work_add()(ublk_drv is builtin) when building
> >>>> ublk_drv as module?
> >>>>
> >>>
> >>> OK.
> >>>
> >>
> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >> 64GB MEM, CentOS 8, kernel 6.0+
> >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> >>
> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> >> tw: task_work_add(), ublk is built-in.
> >>
> >>
> >> --------
> >> fio test
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >>
> >> IOPS(k)
> >>
> >> type ucmd tw
> >> seq-read 54.1 53.8
> >> rand-read 52.0 52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type ucmd tw
> >> seq-read 272 286
> >> rand-read 262 278
> >>
> >>
> >> ------------
> >> ublksrv test
> >>
> >> -------------
> >> ucmd
> >>
> >> running loop/001
> >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 66737
> >> randread: jobs 1, iops 64935
> >> randrw: jobs 1, iops read 32694 write 32710
> >> rw(512k): jobs 1, iops read 772 write 819
> >>
> >> running null/001
> >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 715863
> >> randread: jobs 1, iops 758449
> >> randrw: jobs 1, iops read 357407 write 357183
> >> rw(512k): jobs 1, iops read 5895 write 5875
> >>
> >> -------------
> >> tw
> >>
> >> running loop/001
> >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 66856
> >> randread: jobs 1, iops 65015
> >> randrw: jobs 1, iops read 32751 write 32767
> >> rw(512k): jobs 1, iops read 776 write 823
> >>
> >> running null/001
> >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 739450
> >> randread: jobs 1, iops 787500
> >> randrw: jobs 1, iops read 372956 write 372831
> >> rw(512k): jobs 1, iops read 5798 write 5777
> >
> > Looks the gap isn't big between ucmd and tw when running null/001, in
> > which the fio io process should saturate the CPU. Probably we
> > should avoid to touch 'cmd'/'pdu'/'io' in ublk_queue_rq() since these
> > data should be cold at that time.
> >
> > Can you apply the following delta patch against the current patch(
> > "ublk_drv: don't call task_work_add for queueing io commands") and
> > compare with task_work_add()?
> >
> > From ecbbf6d10dbc63e279ce0b55c45da6721947f18d Mon Sep 17 00:00:00 2001
> > From: Ming Lei <ming.lei@redhat.com>
> > Date: Tue, 25 Oct 2022 11:01:25 -0400
> > Subject: [PATCH] ublk: follow up change
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> > drivers/block/ublk_drv.c | 36 ++++++++++++++++++++++--------------
> > 1 file changed, 22 insertions(+), 14 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 7963fba66dd1..18db337094c1 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -56,9 +56,12 @@
> > /* All UBLK_PARAM_TYPE_* should be included here */
> > #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
> >
> > +struct ublk_rq_data {
> > + struct llist_node node;
> > +};
> > +
> > struct ublk_uring_cmd_pdu {
> > struct request *req;
> > - struct llist_node node;
> > };
> >
> > /*
> > @@ -693,7 +696,8 @@ static inline void __ublk_rq_task_work(struct request *req)
> > *
> > * (2) current->flags & PF_EXITING.
> > */
> > - if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
> > + if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING
> > + || (io->flags & UBLK_IO_FLAG_ABORTED))) {
> > __ublk_abort_rq(ubq, req);
> > return;
>
> We cannot check io->flags & UBLK_IO_FLAG_ABORTED in io_uring task_work.
> We have to check io->flags & UBLK_IO_FLAG_ABORTED in ublk_queue_rq() because
> io_uring ctx may be freed at that time and ioucmd task_work is invalid(freed).
> Please See comment around it for more detail.
The comment is obsolete, because all inflight requests are drained
before io_uring is dead, please see __ublk_quiesce_dev().
>
> > }
> > @@ -757,11 +761,12 @@ static void ublk_rqs_task_work_cb(struct io_uring_cmd *cmd)
> > struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> > struct ublk_queue *ubq = pdu->req->mq_hctx->driver_data;
> > struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
> > + struct ublk_rq_data *data;
> >
> > __ublk_rq_task_work(pdu->req);
> >
> > - llist_for_each_entry(pdu, io_cmds, node)
> > - __ublk_rq_task_work(pdu->req);
> > + llist_for_each_entry(data, io_cmds, node)
> > + __ublk_rq_task_work(blk_mq_rq_from_pdu(data));
> > }
> >
> > static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> > @@ -769,9 +774,6 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> > {
> > struct ublk_queue *ubq = hctx->driver_data;
> > struct request *rq = bd->rq;
> > - struct ublk_io *io = &ubq->ios[rq->tag];
> > - struct io_uring_cmd *cmd = io->cmd;
> > - struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> > blk_status_t res;
> >
> > /* fill iod to slot in io cmd buffer */
> > @@ -805,14 +807,11 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> > * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
> > * guarantees that here is a re-issued request aborted previously.
> > */
> > - if (unlikely(ubq_daemon_is_dying(ubq) ||
> > - (io->flags & UBLK_IO_FLAG_ABORTED))) {
> > + if (unlikely(ubq_daemon_is_dying(ubq))) {
> > __ublk_abort_rq(ubq, rq);
> > return BLK_STS_OK;
> > }
> >
> > - pdu->req = rq;
> > -
> > /*
> > * Typical multiple producers and single consumer, it is just fine
> > * to use llist_add() in producer side and llist_del_all() in
> > @@ -821,10 +820,18 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
> > * The last command can't be added into list, otherwise it could
> > * be handled twice
> > */
> > - if (bd->last)
> > + if (bd->last) {
> > + struct ublk_io *io = &ubq->ios[rq->tag];
> > + struct io_uring_cmd *cmd = io->cmd;
> > + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
> > +
> > + pdu->req = rq;
> > io_uring_cmd_complete_in_task(cmd, ublk_rqs_task_work_cb);
> > - else
> > - llist_add(&pdu->node, &ubq->io_cmds);
> > + } else {
> > + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> > +
> > + llist_add(&data->node, &ubq->io_cmds);
> > + }
> >
> > return BLK_STS_OK;
> > }
> > @@ -1426,6 +1433,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
> > ub->tag_set.queue_depth = ub->dev_info.queue_depth;
> > ub->tag_set.numa_node = NUMA_NO_NODE;
> > ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
> > + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
> > ub->tag_set.driver_data = ub;
> > return blk_mq_alloc_tag_set(&ub->tag_set);
> > }
>
>
> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> 64GB MEM, CentOS 8, kernel 6.0+
> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>
> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
>
> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
>
> tw: task_work_add(), ublk is built-in.
>
>
> --------
> fio test
>
> iodepth=128 numjobs=1 direct=1 bs=4k
>
> --------------------------------------------
> ublk loop target, the backend is a file.
>
> IOPS(k)
>
> type ucmd tw ucmd-not-touch-pdu
> seq-read 54.1 53.8 53.6
> rand-read 52.0 52.0 52.0
>
> --------------------------------------------
> ublk null target
> IOPS(k)
>
> type ucmd tw ucmd-not-touch-pdu
> seq-read 272 286 275
> rand-read 262 278 269
>
>
> ------------
> ublksrv test
>
> -------------
> ucmd
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66737
> randread: jobs 1, iops 64935
> randrw: jobs 1, iops read 32694 write 32710
> rw(512k): jobs 1, iops read 772 write 819
>
> running null/001
> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 715863
> randread: jobs 1, iops 758449
> randrw: jobs 1, iops read 357407 write 357183
> rw(512k): jobs 1, iops read 5895 write 5875
>
> -------------
> tw
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66856
> randread: jobs 1, iops 65015
> randrw: jobs 1, iops read 32751 write 32767
> rw(512k): jobs 1, iops read 776 write 823
>
> running null/001
> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 739450
> randread: jobs 1, iops 787500
> randrw: jobs 1, iops read 372956 write 372831
> rw(512k): jobs 1, iops read 5798 write 5777
>
> -------------
> ucmd-not-touch-pdu
>
> running loop/001
> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 66754
> randread: jobs 1, iops 65032
> randrw: jobs 1, iops read 32776 write 32792
> rw(512k): jobs 1, iops read 772 write 818
>
> running null/001
> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> randwrite: jobs 1, iops 725334
> randread: jobs 1, iops 741105
> randrw: jobs 1, iops read 360285 write 360047
> rw(512k): jobs 1, iops read 5770 write 5748
>
> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
> But it is worse than using task_work_add().
Thanks for the test! It is better to not share ucmd between
ublk blk-mq io context and ubq daemon context, and we can
improve it when using io_uring_cmd_complete_in_task(), and I
have one patch that uses the batch handling approach in
io_uring_cmd_complete_in_task().
Another reason could be the extra __set_notify_signal() in
__io_req_task_work_add() via task_work_add(). When task_work_add()
is available, we just need to call __set_notify_signal() once
for the whole batch, but it can't be done in case of using
io_uring_cmd_complete_in_task().
Also the patch of 'use llist' is actually wrong since we have to
call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
that couldn't be easy because ucmd isn't available at that time.
I think we may have to live with task_work_add() until the perf
number of io_uring_cmd_complete_in_task() is improved to basically the same level.
thanks,
Ming
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-26 11:29 ` Ming Lei
@ 2022-10-27 3:00 ` Ziyang Zhang
2022-10-27 15:38 ` Ming Lei
0 siblings, 1 reply; 12+ messages in thread
From: Ziyang Zhang @ 2022-10-27 3:00 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block, Jens Axboe
On 2022/10/26 19:29, Ming Lei wrote:
[...]
>>
>> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
>> 64GB MEM, CentOS 8, kernel 6.0+
>> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
>>
>> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
>>
>> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
>>
>> tw: task_work_add(), ublk is built-in.
>>
>>
>> --------
>> fio test
>>
>> iodepth=128 numjobs=1 direct=1 bs=4k
>>
>> --------------------------------------------
>> ublk loop target, the backend is a file.
>>
>> IOPS(k)
>>
>> type ucmd tw ucmd-not-touch-pdu
>> seq-read 54.1 53.8 53.6
>> rand-read 52.0 52.0 52.0
>>
>> --------------------------------------------
>> ublk null target
>> IOPS(k)
>>
>> type ucmd tw ucmd-not-touch-pdu
>> seq-read 272 286 275
>> rand-read 262 278 269
>>
>>
>> ------------
>> ublksrv test
>>
>> -------------
>> ucmd
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66737
>> randread: jobs 1, iops 64935
>> randrw: jobs 1, iops read 32694 write 32710
>> rw(512k): jobs 1, iops read 772 write 819
>>
>> running null/001
>> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 715863
>> randread: jobs 1, iops 758449
>> randrw: jobs 1, iops read 357407 write 357183
>> rw(512k): jobs 1, iops read 5895 write 5875
>>
>> -------------
>> tw
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66856
>> randread: jobs 1, iops 65015
>> randrw: jobs 1, iops read 32751 write 32767
>> rw(512k): jobs 1, iops read 776 write 823
>>
>> running null/001
>> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 739450
>> randread: jobs 1, iops 787500
>> randrw: jobs 1, iops read 372956 write 372831
>> rw(512k): jobs 1, iops read 5798 write 5777
>>
>> -------------
>> ucmd-not-touch-pdu
>>
>> running loop/001
>> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 66754
>> randread: jobs 1, iops 65032
>> randrw: jobs 1, iops read 32776 write 32792
>> rw(512k): jobs 1, iops read 772 write 818
>>
>> running null/001
>> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
>> randwrite: jobs 1, iops 725334
>> randread: jobs 1, iops 741105
>> randrw: jobs 1, iops read 360285 write 360047
>> rw(512k): jobs 1, iops read 5770 write 5748
>>
>> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
>> But it is worse than using task_work_add().
>
> Thanks for the test! It is better to not share ucmd between
> ublk blk-mq io context and ubq daemon context, and we can
> improve it for using io_uring_cmd_complete_in_task(), and I
> have one patch by using the batch handing approach in
> io_uring_cmd_complete_in_task().
>
> Another reason could be the extra __set_notify_signal() in
> __io_req_task_work_add() via task_work_add(). When task_work_add()
> is available, we just need to call __set_notify_signal() once
> for the whole batch, but it can't be done in case of using
> io_uring_cmd_complete_in_task().
>
> Also the patch of 'use llist' is actually wrong since we have to
> call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
> that couldn't be easy because ucmd isn't available at that time.
Yes, you are correct.
>
> I think we may have to live with task_work_add() until the perf
> number is improved to same basically with io_uring_cmd_complete_in_task().
OK, we can keep task_work_add() && io_uring_cmd_complete_in_task().
BTW, from test:loop/001, I think with real backends, the performance
gap between them seems not too big.
Regards,
Zhang
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] ublk_drv: don't call task_work_add for queueing io commands
2022-10-27 3:00 ` Ziyang Zhang
@ 2022-10-27 15:38 ` Ming Lei
0 siblings, 0 replies; 12+ messages in thread
From: Ming Lei @ 2022-10-27 15:38 UTC (permalink / raw)
To: Ziyang Zhang; +Cc: linux-block, Jens Axboe
On Thu, Oct 27, 2022 at 11:00:44AM +0800, Ziyang Zhang wrote:
> On 2022/10/26 19:29, Ming Lei wrote:
>
> [...]
> >>
> >> Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz 16 cores
> >> 64GB MEM, CentOS 8, kernel 6.0+
> >> with IORING_SETUP_COOP_TASKRUN, without this kernel patch
> >>
> >> ucmd: io_uring_cmd_complete_in_task(), ublk_drv is a module
> >>
> >> ucmd-not-touch-pdu: use llist && do not touch 'cmd'/'pdu'/'io' in ublk_queue_rq()
> >>
> >> tw: task_work_add(), ublk is built-in.
> >>
> >>
> >> --------
> >> fio test
> >>
> >> iodepth=128 numjobs=1 direct=1 bs=4k
> >>
> >> --------------------------------------------
> >> ublk loop target, the backend is a file.
> >>
> >> IOPS(k)
> >>
> >> type ucmd tw ucmd-not-touch-pdu
> >> seq-read 54.1 53.8 53.6
> >> rand-read 52.0 52.0 52.0
> >>
> >> --------------------------------------------
> >> ublk null target
> >> IOPS(k)
> >>
> >> type ucmd tw ucmd-not-touch-pdu
> >> seq-read 272 286 275
> >> rand-read 262 278 269
> >>
> >>
> >> ------------
> >> ublksrv test
> >>
> >> -------------
> >> ucmd
> >>
> >> running loop/001
> >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_BZ85U), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 66737
> >> randread: jobs 1, iops 64935
> >> randrw: jobs 1, iops read 32694 write 32710
> >> rw(512k): jobs 1, iops read 772 write 819
> >>
> >> running null/001
> >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 715863
> >> randread: jobs 1, iops 758449
> >> randrw: jobs 1, iops read 357407 write 357183
> >> rw(512k): jobs 1, iops read 5895 write 5875
> >>
> >> -------------
> >> tw
> >>
> >> running loop/001
> >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_pvLTL), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 66856
> >> randread: jobs 1, iops 65015
> >> randrw: jobs 1, iops read 32751 write 32767
> >> rw(512k): jobs 1, iops read 776 write 823
> >>
> >> running null/001
> >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 739450
> >> randread: jobs 1, iops 787500
> >> randrw: jobs 1, iops read 372956 write 372831
> >> rw(512k): jobs 1, iops read 5798 write 5777
> >>
> >> -------------
> >> ucmd-not-touch-pdu
> >>
> >> running loop/001
> >> fio (ublk/loop( -f /root/work/ubdsrv/tests/tmp/ublk_loop_1G_oH0eG), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 66754
> >> randread: jobs 1, iops 65032
> >> randrw: jobs 1, iops read 32776 write 32792
> >> rw(512k): jobs 1, iops read 772 write 818
> >>
> >> running null/001
> >> fio (ublk/null(), libaio, bs 4k, dio, hw queues:1, uring_comp: 0, get_data: 0)...
> >> randwrite: jobs 1, iops 725334
> >> randread: jobs 1, iops 741105
> >> randrw: jobs 1, iops read 360285 write 360047
> >> rw(512k): jobs 1, iops read 5770 write 5748
> >>
> >> Not touching cmd/pdu/io in ublk_queue_rq() improves IOPS.
> >> But it is worse than using task_work_add().
> >
> > Thanks for the test! It is better to not share ucmd between
> > ublk blk-mq io context and ubq daemon context, and we can
> > improve it for using io_uring_cmd_complete_in_task(), and I
> > have one patch by using the batch handing approach in
> > io_uring_cmd_complete_in_task().
> >
> > Another reason could be the extra __set_notify_signal() in
> > __io_req_task_work_add() via task_work_add(). When task_work_add()
> > is available, we just need to call __set_notify_signal() once
> > for the whole batch, but it can't be done in case of using
> > io_uring_cmd_complete_in_task().
> >
> > Also the patch of 'use llist' is actually wrong since we have to
> > call io_uring_cmd_complete_in_task() once in ->commit_rqs(), but
> > that couldn't be easy because ucmd isn't available at that time.
>
> Yes, you are correct.
>
> >
> > I think we may have to live with task_work_add() until the perf
> > number is improved to same basically with io_uring_cmd_complete_in_task().
>
> OK, we can keep task_work_add() && io_uring_cmd_complete_in_task().
> BTW, from test:loop/001, I think with real backends, the performance
> gap between them seems not too big.
Actually, in a VM created on my laptop, the gap isn't small on ublk-loop:
1) ublk-null, single job, change nr_hw_queue to 2 for using none scheduler
[root@ktest-09 ublksrv]# make test T=null/001:null/004
make -C tests run T=null/001:null/004 R=10 D=tests/tmp/
make[1]: Entering directory '/root/git/ublksrv/tests'
./run_test.sh null/001:null/004 10 tests/tmp/
running null/001
fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)...
randwrite(4k): jobs 1, iops 1028774, cpu_util(29% 69%)
randread(4k): jobs 1, iops 958598, cpu_util(27% 72%)
randrw(4k): jobs 1, iops read 453382 write 453239, cpu_util(29% 70%)
rw(512k): jobs 1, iops read 5264 write 5242, cpu_util(7% 6%)
running null/004
fio (ublk/null(), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)...
randwrite(4k): jobs 1, iops 950755, cpu_util(30% 69%)
randread(4k): jobs 1, iops 855984, cpu_util(26% 73%)
randrw(4k): jobs 1, iops read 439369 write 439287, cpu_util(29% 70%)
rw(512k): jobs 1, iops read 5225 write 5202, cpu_util(7% 5%)
2) ublk-loop, single job, change nr_hw_queue to 2 for using none scheduler
[root@ktest-09 ublksrv]# make test T=loop/001:loop/004
make -C tests run T=loop/001:loop/004 R=10 D=tests/tmp/
make[1]: Entering directory '/root/git/ublksrv/tests'
./run_test.sh loop/001:loop/004 10 tests/tmp/
running loop/001
fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_STbdZ), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 0, get_data: 0)...
randwrite(4k): jobs 1, iops 163177, cpu_util(7% 20%)
randread(4k): jobs 1, iops 141584, cpu_util(6% 14%)
randrw(4k): jobs 1, iops read 75034 write 75148, cpu_util(7% 17%)
rw(512k): jobs 1, iops read 1574 write 1663, cpu_util(3% 4%)
running loop/004
fio (ublk/loop( -f /root/git/ublksrv/tests/tmp/ublk_loop_2G_AdrHl), libaio, dio, hw queues:2, io jobs: 1, uring_comp: 1, get_data: 0)...
randwrite(4k): jobs 1, iops 142353, cpu_util(7% 19%)
randread(4k): jobs 1, iops 133883, cpu_util(6% 14%)
randrw(4k): jobs 1, iops read 72581 write 72691, cpu_util(7% 18%)
rw(512k): jobs 1, iops read 929 write 981, cpu_util(2% 2%)
Thanks,
Ming
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-10-27 15:38 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-23 9:38 [PATCH] ublk_drv: don't call task_work_add for queueing io commands Ming Lei
2022-10-24 9:48 ` Ziyang Zhang
2022-10-24 13:20 ` Ming Lei
2022-10-25 3:15 ` Ziyang Zhang
2022-10-25 7:19 ` Ming Lei
2022-10-25 7:46 ` Ziyang Zhang
2022-10-25 8:43 ` Ziyang Zhang
2022-10-25 15:17 ` Ming Lei
2022-10-26 10:32 ` Ziyang Zhang
2022-10-26 11:29 ` Ming Lei
2022-10-27 3:00 ` Ziyang Zhang
2022-10-27 15:38 ` Ming Lei
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.