* [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 8:15 Bart Van Assche
From: Bart Van Assche @ 2010-08-02 8:15 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
Ralph Campbell
SRP I/O with small block sizes causes a high CPU load. Processing IB
completions in the context of a kernel thread instead of in interrupt context
makes it possible to process up to 25% more I/O operations per second. This
patch adds a kernel module parameter 'thread' that specifies whether to
process IB completions in interrupt context or in kernel thread context. Also,
the IB receive notification processing loop is rewritten as proposed earlier
by Ralph Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
measurement results below show, rewriting the IB receive notification
processing loop did not have a measurable impact on performance. Processing
IB receive notifications in thread context, however, does have a measurable
impact: workloads with I/O depth one are processed at most 10% slower and
workloads with larger I/O depths are processed up to 25% faster.
block size   number of    IOPS         IOPS       IOPS
in bytes     threads      without      with       with
($bs)        ($numjobs)   this patch   thread=n   thread=y
    512          1           25,400      25,400     23,100
    512        128          122,000     122,000    153,000
   4096          1           25,000      25,000     22,700
   4096        128          122,000     121,000    157,000
  65536          1           14,300      14,400     13,600
  65536          4           36,700      36,700     36,600
 524288          1            3,470       3,430      3,420
 524288          4            5,020       5,020      4,990
performance test used to gather the above results:
fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
--thread --numjobs=${numjobs} --loops=100 --group_reporting \
--gtod_reduce=1 --name=${dev} --filename=${dev}
other ib_srp kernel module parameters: srp_sg_tablesize=128
SRP target settings: storage type NULLIO; SCSI queue depth 128.
IB HCA type: QDR.
Signed-off-by: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
Cc: Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
Cc: David Dillow <dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org>
Cc: Ralph Campbell <ralph.campbell-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org>
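For reviewers who want the gist before reading the diff, the thread=y path
boils down to the pattern sketched below. This is only a condensed sketch:
the demo_* names are illustrative stand-ins for the srp_* functions in the
patch, and the setup code (kthread_run() in srp_create_target_ib()), the
unchanged interrupt-context path and all error handling are left out.

#include <linux/kthread.h>
#include <linux/wait.h>
#include <rdma/ib_verbs.h>

struct demo_port {
	wait_queue_head_t	wait_queue;
	struct task_struct	*thread;
	struct ib_cq		*recv_cq;
};

/* Stands in for srp_handle_recv(): per-completion work done in the thread. */
static void demo_handle_recv(struct demo_port *port, struct ib_wc *wc);

/* CQ event handler: do no work in interrupt context, only wake the thread. */
static void demo_notify_recv_thread(struct ib_cq *cq, void *port_ptr)
{
	struct demo_port *port = port_ptr;

	wake_up_interruptible(&port->wait_queue);
}

/*
 * Drain the CQ, then re-arm it. IB_CQ_REPORT_MISSED_EVENTS makes
 * ib_req_notify_cq() return > 0 if completions were queued while the CQ
 * was being re-armed, in which case we poll again instead of missing them.
 */
static void demo_drain_recv_cq(struct demo_port *port)
{
	struct ib_wc wc;

	do {
		while (ib_poll_cq(port->recv_cq, 1, &wc) > 0)
			demo_handle_recv(port, &wc);
	} while (ib_req_notify_cq(port->recv_cq, IB_CQ_NEXT_COMP |
				  IB_CQ_REPORT_MISSED_EVENTS) > 0);
}

/*
 * Kernel thread: the comma operator makes the wait condition drain the CQ
 * on every evaluation and only ends the wait (and the thread) once
 * kthread_should_stop() returns true.
 */
static int demo_compl_thread(void *port_ptr)
{
	struct demo_port *port = port_ptr;

	while (!kthread_should_stop())
		wait_event_interruptible(port->wait_queue,
			(demo_drain_recv_cq(port),
			 kthread_should_stop()));
	return 0;
}

The point of having the CQ callback do nothing but a wake-up is that all
per-completion work moves out of hard-IRQ context, which is what lets the
high-queue-depth workloads batch completions and gain throughput.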
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ed3f9eb..eebe870 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2009-2010 Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -38,6 +39,7 @@
#include <linux/parser.h>
#include <linux/random.h>
#include <linux/jiffies.h>
+#include <linux/kthread.h>
#include <asm/atomic.h>
@@ -66,6 +68,12 @@ module_param(srp_sg_tablesize, int, 0444);
MODULE_PARM_DESC(srp_sg_tablesize,
"Max number of gather/scatter entries per I/O (default is 12, max 255)");
+static bool thread;
+module_param(thread, bool, 0444);
+MODULE_PARM_DESC(thread,
+ "Whether to process IB completions in interrupt context (false) or"
+ " kernel thread context (true)");
+
static int topspin_workarounds = 1;
module_param(topspin_workarounds, int, 0444);
@@ -81,6 +89,8 @@ MODULE_PARM_DESC(mellanox_workarounds,
static void srp_add_one(struct ib_device *device);
static void srp_remove_one(struct ib_device *device);
static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_notify_recv_thread(struct ib_cq *cq, void *target_ptr);
+static int srp_compl_thread(void *target_ptr);
static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
@@ -229,7 +239,9 @@ static int srp_create_target_ib(struct srp_target_port *target)
return -ENOMEM;
target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
- srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
+ thread ? srp_notify_recv_thread
+ : srp_recv_completion,
+ NULL, target, SRP_RQ_SIZE, 0);
if (IS_ERR(target->recv_cq)) {
ret = PTR_ERR(target->recv_cq);
goto err;
@@ -242,7 +254,18 @@ static int srp_create_target_ib(struct srp_target_port *target)
goto err_recv_cq;
}
- ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
+ if (thread) {
+ init_waitqueue_head(&target->wait_queue);
+ target->thread = kthread_run(srp_compl_thread, target,
+ "ib_srp_compl");
+ if (IS_ERR(target->thread)) {
+ ret = PTR_ERR(target->thread);
+ goto err_send_cq;
+ }
+ } else {
+ target->thread = NULL;
+ ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
+ }
init_attr->event_handler = srp_qp_event;
init_attr->cap.max_send_wr = SRP_SQ_SIZE;
@@ -257,7 +280,7 @@ static int srp_create_target_ib(struct srp_target_port *target)
target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr);
if (IS_ERR(target->qp)) {
ret = PTR_ERR(target->qp);
- goto err_send_cq;
+ goto err_thread;
}
ret = srp_init_qp(target, target->qp);
@@ -270,6 +293,10 @@ static int srp_create_target_ib(struct srp_target_port *target)
err_qp:
ib_destroy_qp(target->qp);
+err_thread:
+ if (target->thread)
+ kthread_stop(target->thread);
+
err_send_cq:
ib_destroy_cq(target->send_cq);
@@ -286,6 +313,8 @@ static void srp_free_target_ib(struct srp_target_port *target)
int i;
ib_destroy_qp(target->qp);
+ if (target->thread)
+ kthread_stop(target->thread);
ib_destroy_cq(target->send_cq);
ib_destroy_cq(target->recv_cq);
@@ -917,23 +946,45 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
DMA_FROM_DEVICE);
}
+static void srp_notify_recv_thread(struct ib_cq *cq, void *target_ptr)
+{
+ struct srp_target_port *target = target_ptr;
+
+ wake_up_interruptible(&target->wait_queue);
+}
+
static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
{
struct srp_target_port *target = target_ptr;
struct ib_wc wc;
- ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- if (wc.status) {
- shost_printk(KERN_ERR, target->scsi_host,
- PFX "failed receive status %d\n",
- wc.status);
- target->qp_in_error = 1;
- break;
+ do {
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ if (wc.status) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "failed receive status %d\n",
+ wc.status);
+ target->qp_in_error = 1;
+ return;
+ }
+
+ srp_handle_recv(target, &wc);
}
+ } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP
+ | IB_CQ_REPORT_MISSED_EVENTS) > 0);
+}
+
+static int srp_compl_thread(void *target_ptr)
+{
+ struct srp_target_port *target = target_ptr;
- srp_handle_recv(target, &wc);
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(target->wait_queue,
+ (srp_recv_completion(target->recv_cq, target),
+ kthread_should_stop()));
}
+
+ return 0;
}
static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 5a80eac..5ceb4a4 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -129,6 +129,8 @@ struct srp_target_port {
struct ib_sa_query *path_query;
int path_query_id;
+ wait_queue_head_t wait_queue;
+ struct task_struct *thread;
struct ib_cm_id *cm_id;
struct ib_cq *recv_cq;
struct ib_cq *send_cq;
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 13:08 ` Vladislav Bolkhovitin
From: Vladislav Bolkhovitin @ 2010-08-02 13:08 UTC (permalink / raw)
To: Bart Van Assche
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
Ralph Campbell
Bart Van Assche, on 08/02/2010 12:15 PM wrote:
> SRP I/O with small block sizes causes a high CPU load. Processing IB
> completions on the context of a kernel thread instead of in interrupt context
> allows to process up to 25% more I/O operations per second. This patch does
> add a kernel parameter 'thread' that allows to specify whether to process IB
> completions in interrupt context or in kernel thread context. Also, the IB
> receive notification processing loop is rewritten as proposed earlier by Ralph
> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
> measurement results below show, rewriting the IB receive notification
> processing loop did not have a measurable impact on performance. Processing
> IB receive notifications in thread context however does have a measurable
> impact: workloads with I/O depth one are processed at most 10% slower and
> workloads with larger I/O depths are processed up to 25% faster.
>
> block size number of IOPS IOPS IOPS
> in bytes threads without with with
> ($bs) ($numjobs) this patch thread=n thread=y
> 512 1 25,400 25,400 23,100
> 512 128 122,000 122,000 153,000
> 4096 1 25,000 25,000 22,700
> 4096 128 122,000 121,000 157,000
> 65536 1 14,300 14,400 13,600
> 65536 4 36,700 36,700 36,600
> 524288 1 3,470 3,430 3,420
> 524288 4 5,020 5,020 4,990
>
> performance test used to gather the above results:
> fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
> --thread --numjobs=${numjobs} --loops=100 --group_reporting \
> --gtod_reduce=1 --name=${dev} --filename=${dev}
> other ib_srp kernel module parameters: srp_sg_tablesize=128
How about results of "dd Xflags=direct" in different modes, to find out
the lowest latency at which the driver can process 512-byte and 4K packets?
Sorry, I don't trust fio when it comes to precise latency measurements.
Vlad
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 15:57 ` Bart Van Assche
From: Bart Van Assche @ 2010-08-02 15:57 UTC (permalink / raw)
To: Vladislav Bolkhovitin
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
Ralph Campbell
On Mon, Aug 2, 2010 at 3:08 PM, Vladislav Bolkhovitin <vst-d+Crzxg7Rs0@public.gmane.org> wrote:
>
> Bart Van Assche, on 08/02/2010 12:15 PM wrote:
>>
>> SRP I/O with small block sizes causes a high CPU load. Processing IB
>> completions on the context of a kernel thread instead of in interrupt context
>> allows to process up to 25% more I/O operations per second. This patch does
>> add a kernel parameter 'thread' that allows to specify whether to process IB
>> completions in interrupt context or in kernel thread context. Also, the IB
>> receive notification processing loop is rewritten as proposed earlier by Ralph
>> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
>> measurement results below show, rewriting the IB receive notification
>> processing loop did not have a measurable impact on performance. Processing
>> IB receive notifications in thread context however does have a measurable
>> impact: workloads with I/O depth one are processed at most 10% slower and
>> workloads with larger I/O depths are processed up to 25% faster.
>>
>> block size number of IOPS IOPS IOPS
>> in bytes threads without with with
>> ($bs) ($numjobs) this patch thread=n thread=y
>> 512 1 25,400 25,400 23,100
>> 512 128 122,000 122,000 153,000
>> 4096 1 25,000 25,000 22,700
>> 4096 128 122,000 121,000 157,000
>> 65536 1 14,300 14,400 13,600
>> 65536 4 36,700 36,700 36,600
>> 524288 1 3,470 3,430 3,420
>> 524288 4 5,020 5,020 4,990
>>
>> performance test used to gather the above results:
>> fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
>> --thread --numjobs=${numjobs} --loops=100 --group_reporting \
>> --gtod_reduce=1 --name=${dev} --filename=${dev}
>> other ib_srp kernel module parameters: srp_sg_tablesize=128
>
> How about results of "dd Xflags=direct" in different modes to find out the lowest
> latency the driver can process 512 and 4K packets? Sorry, I don't trust fio, when
> it comes to precise latency measurements.
It would be interesting to compare such results, but unfortunately dd
does not provide a way to perform I/O from multiple threads
simultaneously. I have tried running multiple dd processes in parallel,
but that resulted in much lower IOPS than a comparable multithreaded
fio test.
Bart.
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 18:16 ` Vladislav Bolkhovitin
From: Vladislav Bolkhovitin @ 2010-08-02 18:16 UTC (permalink / raw)
To: Bart Van Assche
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
Ralph Campbell
Bart Van Assche, on 08/02/2010 07:57 PM wrote:
>>> SRP I/O with small block sizes causes a high CPU load. Processing IB
>>> completions on the context of a kernel thread instead of in interrupt context
>>> allows to process up to 25% more I/O operations per second. This patch does
>>> add a kernel parameter 'thread' that allows to specify whether to process IB
>>> completions in interrupt context or in kernel thread context. Also, the IB
>>> receive notification processing loop is rewritten as proposed earlier by Ralph
>>> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
>>> measurement results below show, rewriting the IB receive notification
>>> processing loop did not have a measurable impact on performance. Processing
>>> IB receive notifications in thread context however does have a measurable
>>> impact: workloads with I/O depth one are processed at most 10% slower and
>>> workloads with larger I/O depths are processed up to 25% faster.
>>>
>>> block size number of IOPS IOPS IOPS
>>> in bytes threads without with with
>>> ($bs) ($numjobs) this patch thread=n thread=y
>>> 512 1 25,400 25,400 23,100
>>> 512 128 122,000 122,000 153,000
>>> 4096 1 25,000 25,000 22,700
>>> 4096 128 122,000 121,000 157,000
>>> 65536 1 14,300 14,400 13,600
>>> 65536 4 36,700 36,700 36,600
>>> 524288 1 3,470 3,430 3,420
>>> 524288 4 5,020 5,020 4,990
>>>
>>> performance test used to gather the above results:
>>> fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
>>> --thread --numjobs=${numjobs} --loops=100 --group_reporting \
>>> --gtod_reduce=1 --name=${dev} --filename=${dev}
>>> other ib_srp kernel module parameters: srp_sg_tablesize=128
>>
>> How about results of "dd Xflags=direct" in different modes to find out the lowest
>> latency the driver can process 512 and 4K packets? Sorry, I don't trust fio, when
>> it comes to precise latency measurements.
>
> It would be interesting to compare such results, but unfortunately, dd
> does not provide a way to perform I/O from multiple threads
> simultaneously. I have tried to run multiple dd processes in parallel,
> but that resulted in much lower IOPS results than a comparable
> multithreaded fio test.
I'm interested in seeing how much your changes affected processing latency,
i.e. in measuring execution latency before and after the changes. You can't
do that with several threads, because latency = 1/bandwidth only if you
always have exactly one command in flight at a time. So all those
sophisticated measurements can't substitute for a plain old:
dd if=/dev/sdX of=/dev/null bs=512 iflag=direct
and
dd if=/dev/zero of=/dev/sdX bs=512 oflag=direct
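As a rough sanity check using the queue-depth-one numbers already posted:
with only one command in flight, latency ~= 1/IOPS, so 25,400 IOPS for
512-byte reads is about 39 microseconds per command and 23,100 IOPS is
about 43 microseconds, i.e. on the order of 4 microseconds of added
per-command latency with thread=y. A dd run like the above would show
whether that estimate holds up.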
Vlad
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 18:36 ` David Dillow
From: David Dillow @ 2010-08-02 18:36 UTC (permalink / raw)
To: Vladislav Bolkhovitin
Cc: Bart Van Assche, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
Roland Dreier, Ralph Campbell
On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
> Bart Van Assche, on 08/02/2010 07:57 PM wrote:
> >>>
> >>> block size number of IOPS IOPS IOPS
> >>> in bytes threads without with with
> >>> ($bs) ($numjobs) this patch thread=n thread=y
> >>> 512 1 25,400 25,400 23,100
> >>> 512 128 122,000 122,000 153,000
> >>> 4096 1 25,000 25,000 22,700
> >>> 4096 128 122,000 121,000 157,000
> >>> 65536 1 14,300 14,400 13,600
> >>> 65536 4 36,700 36,700 36,600
> >>> 524288 1 3,470 3,430 3,420
> >>> 524288 4 5,020 5,020 4,990
> I'm interested to see how much your changes affected processing latency,
> i.e. to measure execution latency before and after changes. You can't do
> that with several threads, because latency = 1/bandwidth only if you
> always have only one command at time. So, all those sophisticated
> measurements can't substitute a plane old:
If my assumption that --numjobs=1 puts fio into a single-threaded mode
is correct, it seems that using this patch hurts individual command
latency, at least in a gross sense. The table quoted above shows a ~9%
hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
requests, and ~1.4% for 512 KB requests. It seems to win with lots of
outstanding requests at small block sizes, but still seems to hurt
performance at larger request sizes, though those were tested with
smaller thread counts.
I've not reviewed the patch yet, but that's how I read the table above.
I'm assuming latency is hurt by the need to schedule the kernel thread,
but the batching helps increase the IOPS for low request sizes.
Bart, you could also try xdd as a benchmark tool.
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 18:40 ` Bart Van Assche
From: Bart Van Assche @ 2010-08-02 18:40 UTC (permalink / raw)
To: David Dillow
Cc: Vladislav Bolkhovitin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
Roland Dreier, Ralph Campbell
On Mon, Aug 2, 2010 at 8:36 PM, David Dillow <dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org> wrote:
>
> On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
> > Bart Van Assche, on 08/02/2010 07:57 PM wrote:
> > >>>
> > >>> block size number of IOPS IOPS IOPS
> > >>> in bytes threads without with with
> > >>> ($bs) ($numjobs) this patch thread=n thread=y
> > >>> 512 1 25,400 25,400 23,100
> > >>> 512 128 122,000 122,000 153,000
> > >>> 4096 1 25,000 25,000 22,700
> > >>> 4096 128 122,000 121,000 157,000
> > >>> 65536 1 14,300 14,400 13,600
> > >>> 65536 4 36,700 36,700 36,600
> > >>> 524288 1 3,470 3,430 3,420
> > >>> 524288 4 5,020 5,020 4,990
>
> > I'm interested to see how much your changes affected processing latency,
> > i.e. to measure execution latency before and after changes. You can't do
> > that with several threads, because latency = 1/bandwidth only if you
> > always have only one command at time. So, all those sophisticated
> > measurements can't substitute a plane old:
>
> If my assumption that --numjobs=1 puts fio into a single-threaded mode
> is correct, it seems that using this patch hurts individual command
> latency, at least in a gross sense. The table listed above shows a ~9%
> hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
> requests, and ~1.4% for 512 KB requests. It seems to win @ lots of
> requests and small block sizes, but still seems to hurt performance at
> larger request sizes, though it seems they were tested with smaller
> thread counts.
>
> I've not reviewed the patch yet, but that's how I read the table above.
> I'm assuming latency is hurt by the need to schedule the kernel thread,
> but the batching helps increase the IOPS for low request sizes.
Please note that the user has to enable thread=y mode explicitly. The
default mode is thread=n, and in that mode neither latency nor
throughput is affected by this patch.
> Bart, you could also try xdd as a benchmark tool.
I'm familiar with xdd. However, I consider fio both more powerful
and easier to use than xdd.
Bart.
* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02 19:07 ` Vladislav Bolkhovitin
From: Vladislav Bolkhovitin @ 2010-08-02 19:07 UTC (permalink / raw)
To: Bart Van Assche
Cc: David Dillow, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier,
Ralph Campbell
Bart Van Assche, on 08/02/2010 10:40 PM wrote:
> On Mon, Aug 2, 2010 at 8:36 PM, David Dillow<dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org> wrote:
>>
>> On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
>>> Bart Van Assche, on 08/02/2010 07:57 PM wrote:
>>>>>>
>>>>>> block size number of IOPS IOPS IOPS
>>>>>> in bytes threads without with with
>>>>>> ($bs) ($numjobs) this patch thread=n thread=y
>>>>>> 512 1 25,400 25,400 23,100
>>>>>> 512 128 122,000 122,000 153,000
>>>>>> 4096 1 25,000 25,000 22,700
>>>>>> 4096 128 122,000 121,000 157,000
>>>>>> 65536 1 14,300 14,400 13,600
>>>>>> 65536 4 36,700 36,700 36,600
>>>>>> 524288 1 3,470 3,430 3,420
>>>>>> 524288 4 5,020 5,020 4,990
>>
>>> I'm interested to see how much your changes affected processing latency,
>>> i.e. to measure execution latency before and after changes. You can't do
>>> that with several threads, because latency = 1/bandwidth only if you
>>> always have only one command at time. So, all those sophisticated
>>> measurements can't substitute a plane old:
>>
>> If my assumption that --numjobs=1 puts fio into a single-threaded mode
>> is correct, it seems that using this patch hurts individual command
>> latency, at least in a gross sense. The table listed above shows a ~9%
>> hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
>> requests, and ~1.4% for 512 KB requests. It seems to win @ lots of
>> requests and small block sizes, but still seems to hurt performance at
>> larger request sizes, though it seems they were tested with smaller
>> thread counts.
>>
>> I've not reviewed the patch yet, but that's how I read the table above.
>> I'm assuming latency is hurt by the need to schedule the kernel thread,
>> but the batching helps increase the IOPS for low request sizes.
>
> Please note that the user has to enable mode thread=y explicitly. The
> default mode is thread=n and in that mode neither latency nor
> throughput is affected by this patch.
>
>> Bart, you could also try xdd as a benchmark tool.
>
> I'm familiar with xdd. However, I consider fio both as more powerful
> and easier to user than xdd.
Bart, you simply can't measure your link/processing latency with it in a
trustworthy manner. In my experience, it's too heavyweight to measure
such small quantities, i.e. its internal overhead is >= the measured value.
In scientific terms, that means an instrumental error of tens to hundreds
of percent.
Vlad