* [PATCH] IB/srp: use multiple CPU cores more effectively
@ 2010-08-02  8:15 Bart Van Assche
       [not found] ` <201008021015.40472.bvanassche-HInyCGIudOg@public.gmane.org>
From: Bart Van Assche @ 2010-08-02  8:15 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
	Ralph Campbell

SRP I/O with small block sizes causes a high CPU load. Processing IB
completions in the context of a kernel thread instead of in interrupt context
makes it possible to process up to 25% more I/O operations per second. This
patch adds a kernel module parameter 'thread' that specifies whether IB
completions are processed in interrupt context or in kernel thread context.
Also, the IB receive notification processing loop is rewritten as proposed
earlier by Ralph Campbell (see also https://patchwork.kernel.org/patch/89426/).
As the measurement results below show, rewriting the IB receive notification
processing loop did not have a measurable impact on performance. Processing
IB receive notifications in thread context, however, does have a measurable
impact: workloads with I/O depth one are processed at most 10% slower and
workloads with larger I/O depths are processed up to 25% faster.

block size  number of    IOPS        IOPS      IOPS
 in bytes    threads     without     with      with
  ($bs)     ($numjobs)  this patch  thread=n  thread=y
   512           1        25,400      25,400    23,100
   512         128       122,000     122,000   153,000
  4096           1        25,000      25,000    22,700
  4096         128       122,000     121,000   157,000
 65536           1        14,300      14,400    13,600
 65536           4        36,700      36,700    36,600
524288           1         3,470       3,430     3,420
524288           4         5,020       5,020     4,990

performance test used to gather the above results:
  fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
      --thread --numjobs=${numjobs} --loops=100 --group_reporting \
      --gtod_reduce=1 --name=${dev} --filename=${dev}
other ib_srp kernel module parameters: srp_sg_tablesize=128
SRP target settings: storage type NULLIO; SCSI queue depth 128.
IB HCA type: QDR.
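enabling the thread=y mode requires setting the parameter at module load
time, e.g. (illustrative invocation, combining it with the srp_sg_tablesize
setting listed above):
  modprobe ib_srp thread=y srp_sg_tablesize=128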

Signed-off-by: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
Cc: Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
Cc: David Dillow <dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org>
Cc: Ralph Campbell <ralph.campbell-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org>

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index ed3f9eb..eebe870 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2009-2010 Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -38,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/random.h>
 #include <linux/jiffies.h>
+#include <linux/kthread.h>
 
 #include <asm/atomic.h>
 
@@ -66,6 +68,12 @@ module_param(srp_sg_tablesize, int, 0444);
 MODULE_PARM_DESC(srp_sg_tablesize,
 		 "Max number of gather/scatter entries per I/O (default is 12, max 255)");
 
+static bool thread;
+module_param(thread, bool, 0444);
+MODULE_PARM_DESC(thread,
+		 "Whether to process IB completions in interrupt context (false) or"
+		 " kernel thread context (true)");
+
 static int topspin_workarounds = 1;
 
 module_param(topspin_workarounds, int, 0444);
@@ -81,6 +89,8 @@ MODULE_PARM_DESC(mellanox_workarounds,
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
 static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_notify_recv_thread(struct ib_cq *cq, void *target_ptr);
+static int srp_compl_thread(void *target_ptr);
 static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
@@ -229,7 +239,9 @@ static int srp_create_target_ib(struct srp_target_port *target)
 		return -ENOMEM;
 
 	target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-				       srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
+				       thread ? srp_notify_recv_thread
+				       : srp_recv_completion,
+				       NULL, target, SRP_RQ_SIZE, 0);
 	if (IS_ERR(target->recv_cq)) {
 		ret = PTR_ERR(target->recv_cq);
 		goto err;
@@ -242,7 +254,18 @@ static int srp_create_target_ib(struct srp_target_port *target)
 		goto err_recv_cq;
 	}
 
-	ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
+	if (thread) {
+		init_waitqueue_head(&target->wait_queue);
+		target->thread = kthread_run(srp_compl_thread, target,
+					     "ib_srp_compl");
+		if (IS_ERR(target->thread)) {
+			ret = PTR_ERR(target->thread);
+			goto err_send_cq;
+		}
+	} else {
+		target->thread = NULL;
+		ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
+	}
 
 	init_attr->event_handler       = srp_qp_event;
 	init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
@@ -257,7 +280,7 @@ static int srp_create_target_ib(struct srp_target_port *target)
 	target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr);
 	if (IS_ERR(target->qp)) {
 		ret = PTR_ERR(target->qp);
-		goto err_send_cq;
+		goto err_thread;
 	}
 
 	ret = srp_init_qp(target, target->qp);
@@ -270,6 +293,10 @@ static int srp_create_target_ib(struct srp_target_port *target)
 err_qp:
 	ib_destroy_qp(target->qp);
 
+err_thread:
+	if (target->thread)
+		kthread_stop(target->thread);
+
 err_send_cq:
 	ib_destroy_cq(target->send_cq);
 
@@ -286,6 +313,8 @@ static void srp_free_target_ib(struct srp_target_port *target)
 	int i;
 
 	ib_destroy_qp(target->qp);
+	if (target->thread)
+		kthread_stop(target->thread);
 	ib_destroy_cq(target->send_cq);
 	ib_destroy_cq(target->recv_cq);
 
@@ -917,23 +946,45 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 				      DMA_FROM_DEVICE);
 }
 
+static void srp_notify_recv_thread(struct ib_cq *cq, void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
+
+	wake_up_interruptible(&target->wait_queue);
+}
+
 static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
 {
 	struct srp_target_port *target = target_ptr;
 	struct ib_wc wc;
 
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		if (wc.status) {
-			shost_printk(KERN_ERR, target->scsi_host,
-				     PFX "failed receive status %d\n",
-				     wc.status);
-			target->qp_in_error = 1;
-			break;
+	do {
+		while (ib_poll_cq(cq, 1, &wc) > 0) {
+			if (wc.status) {
+				shost_printk(KERN_ERR, target->scsi_host,
+					     PFX "failed receive status %d\n",
+					     wc.status);
+				target->qp_in_error = 1;
+				return;
+			}
+
+			srp_handle_recv(target, &wc);
 		}
+	} while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP
+				      | IB_CQ_REPORT_MISSED_EVENTS) > 0);
+}
+
+static int srp_compl_thread(void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
 
-		srp_handle_recv(target, &wc);
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(target->wait_queue,
+			(srp_recv_completion(target->recv_cq, target),
+			 kthread_should_stop()));
 	}
+
+	return 0;
 }
 
 static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 5a80eac..5ceb4a4 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -129,6 +129,8 @@ struct srp_target_port {
 	struct ib_sa_query     *path_query;
 	int			path_query_id;
 
+	wait_queue_head_t       wait_queue;
+	struct task_struct     *thread;
 	struct ib_cm_id	       *cm_id;
 	struct ib_cq	       *recv_cq;
 	struct ib_cq	       *send_cq;

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found] ` <201008021015.40472.bvanassche-HInyCGIudOg@public.gmane.org>
@ 2010-08-02 13:08   ` Vladislav Bolkhovitin
       [not found]     ` <4C56C336.4040009-d+Crzxg7Rs0@public.gmane.org>
From: Vladislav Bolkhovitin @ 2010-08-02 13:08 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
	Ralph Campbell

Bart Van Assche, on 08/02/2010 12:15 PM wrote:
> SRP I/O with small block sizes causes a high CPU load. Processing IB
> completions on the context of a kernel thread instead of in interrupt context
> allows to process up to 25% more I/O operations per second. This patch does
> add a kernel parameter 'thread' that allows to specify whether to process IB
> completions in interrupt context or in kernel thread context. Also, the IB
> receive notification processing loop is rewritten as proposed earlier by Ralph
> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
> measurement results below show, rewriting the IB receive notification
> processing loop did not have a measurable impact on performance. Processing
> IB receive notifications in thread context however does have a measurable
> impact: workloads with I/O depth one are processed at most 10% slower and
> workloads with larger I/O depths are processed up to 25% faster.
>
> block size  number of    IOPS        IOPS      IOPS
>   in bytes    threads     without     with      with
>    ($bs)     ($numjobs)  this patch  thread=n  thread=y
>     512           1        25,400      25,400    23,100
>     512         128       122,000     122,000   153,000
>    4096           1        25,000      25,000    22,700
>    4096         128       122,000     121,000   157,000
>   65536           1        14,300      14,400    13,600
>   65536           4        36,700      36,700    36,600
> 524288           1         3,470       3,430     3,420
> 524288           4         5,020       5,020     4,990
>
> performance test used to gather the above results:
>    fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
>        --thread --numjobs=${numjobs} --loops=100 --group_reporting \
>        --gtod_reduce=1 --name=${dev} --filename=${dev}
> other ib_srp kernel module parameters: srp_sg_tablesize=128

How about results of "dd Xflags=direct" in different modes to find out the
lowest latency at which the driver can process 512-byte and 4 KB packets?
Sorry, I don't trust fio when it comes to precise latency measurements.

Vlad

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found]     ` <4C56C336.4040009-d+Crzxg7Rs0@public.gmane.org>
@ 2010-08-02 15:57       ` Bart Van Assche
       [not found]         ` <AANLkTinBTv5SZJ_H9C15CWZ5hYGFe38840zy78+N-wbO-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
From: Bart Van Assche @ 2010-08-02 15:57 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
	Ralph Campbell

On Mon, Aug 2, 2010 at 3:08 PM, Vladislav Bolkhovitin <vst-d+Crzxg7Rs0@public.gmane.org> wrote:
>
> Bart Van Assche, on 08/02/2010 12:15 PM wrote:
>>
>> SRP I/O with small block sizes causes a high CPU load. Processing IB
>> completions on the context of a kernel thread instead of in interrupt context
>> allows to process up to 25% more I/O operations per second. This patch does
>> add a kernel parameter 'thread' that allows to specify whether to process IB
>> completions in interrupt context or in kernel thread context. Also, the IB
>> receive notification processing loop is rewritten as proposed earlier by Ralph
>> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
>> measurement results below show, rewriting the IB receive notification
>> processing loop did not have a measurable impact on performance. Processing
>> IB receive notifications in thread context however does have a measurable
>> impact: workloads with I/O depth one are processed at most 10% slower and
>> workloads with larger I/O depths are processed up to 25% faster.
>>
>> block size  number of    IOPS        IOPS      IOPS
>>  in bytes    threads     without     with      with
>>   ($bs)     ($numjobs)  this patch  thread=n  thread=y
>>    512           1        25,400      25,400    23,100
>>    512         128       122,000     122,000   153,000
>>   4096           1        25,000      25,000    22,700
>>   4096         128       122,000     121,000   157,000
>>  65536           1        14,300      14,400    13,600
>>  65536           4        36,700      36,700    36,600
>> 524288           1         3,470       3,430     3,420
>> 524288           4         5,020       5,020     4,990
>>
>> performance test used to gather the above results:
>>   fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
>>       --thread --numjobs=${numjobs} --loops=100 --group_reporting \
>>       --gtod_reduce=1 --name=${dev} --filename=${dev}
>> other ib_srp kernel module parameters: srp_sg_tablesize=128
>
> How about results of "dd Xflags=direct" in different modes to find out the lowest
> latency the driver can process 512 and 4K packets? Sorry, I don't trust fio, when
> it comes to precise latency measurements.

It would be interesting to compare such results, but unfortunately dd
does not provide a way to perform I/O from multiple threads
simultaneously. I have tried running multiple dd processes in parallel,
but that resulted in much lower IOPS than a comparable multithreaded
fio test.

Bart.

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found]         ` <AANLkTinBTv5SZJ_H9C15CWZ5hYGFe38840zy78+N-wbO-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-08-02 18:16           ` Vladislav Bolkhovitin
       [not found]             ` <4C570B7F.2010306-d+Crzxg7Rs0@public.gmane.org>
From: Vladislav Bolkhovitin @ 2010-08-02 18:16 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier, David Dillow,
	Ralph Campbell

Bart Van Assche, on 08/02/2010 07:57 PM wrote:
>>> SRP I/O with small block sizes causes a high CPU load. Processing IB
>>> completions on the context of a kernel thread instead of in interrupt context
>>> allows to process up to 25% more I/O operations per second. This patch does
>>> add a kernel parameter 'thread' that allows to specify whether to process IB
>>> completions in interrupt context or in kernel thread context. Also, the IB
>>> receive notification processing loop is rewritten as proposed earlier by Ralph
>>> Campbell (see also https://patchwork.kernel.org/patch/89426/). As the
>>> measurement results below show, rewriting the IB receive notification
>>> processing loop did not have a measurable impact on performance. Processing
>>> IB receive notifications in thread context however does have a measurable
>>> impact: workloads with I/O depth one are processed at most 10% slower and
>>> workloads with larger I/O depths are processed up to 25% faster.
>>>
>>> block size  number of    IOPS        IOPS      IOPS
>>>   in bytes    threads     without     with      with
>>>    ($bs)     ($numjobs)  this patch  thread=n  thread=y
>>>     512           1        25,400      25,400    23,100
>>>     512         128       122,000     122,000   153,000
>>>    4096           1        25,000      25,000    22,700
>>>    4096         128       122,000     121,000   157,000
>>>   65536           1        14,300      14,400    13,600
>>>   65536           4        36,700      36,700    36,600
>>> 524288           1         3,470       3,430     3,420
>>> 524288           4         5,020       5,020     4,990
>>>
>>> performance test used to gather the above results:
>>>    fio --bs=${bs} --ioengine=sg --buffered=0 --size=128M --rw=read \
>>>        --thread --numjobs=${numjobs} --loops=100 --group_reporting \
>>>        --gtod_reduce=1 --name=${dev} --filename=${dev}
>>> other ib_srp kernel module parameters: srp_sg_tablesize=128
>>
>> How about results of "dd Xflags=direct" in different modes to find out the lowest
>> latency the driver can process 512 and 4K packets? Sorry, I don't trust fio, when
>> it comes to precise latency measurements.
>
> It would be interesting to compare such results, but unfortunately, dd
> does not provide a way to perform I/O from multiple threads
> simultaneously. I have tried to run multiple dd processes in parallel,
> but that resulted in much lower IOPS results than a comparable
> multithreaded fio test.

I'm interested to see how much your changes affected processing latency, 
i.e. to measure execution latency before and after the changes. You can't do 
that with several threads, because latency = 1/bandwidth only holds if you 
always have just one command in flight at a time. So, all those sophisticated 
measurements can't substitute for a plain old:

dd if=/dev/sdX of=/dev/null bs=512 iflag=direct
and
dd if=/dev/zero of=/dev/sdX bs=512 oflag=direct
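
For example (the request count below is arbitrary), the average per-command
latency follows directly from the elapsed time dd reports:

dd if=/dev/sdX of=/dev/null bs=512 count=100000 iflag=direct

i.e. average latency = elapsed time / 100000.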

Vlad

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found]             ` <4C570B7F.2010306-d+Crzxg7Rs0@public.gmane.org>
@ 2010-08-02 18:36               ` David Dillow
       [not found]                 ` <1280774209.2451.10.camel-FqX9LgGZnHWDB2HL1qBt2PIbXMQ5te18@public.gmane.org>
From: David Dillow @ 2010-08-02 18:36 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: Bart Van Assche, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Roland Dreier, Ralph Campbell

On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
> Bart Van Assche, on 08/02/2010 07:57 PM wrote:
> >>>
> >>> block size  number of    IOPS        IOPS      IOPS
> >>>   in bytes    threads     without     with      with
> >>>    ($bs)     ($numjobs)  this patch  thread=n  thread=y
> >>>     512           1        25,400      25,400    23,100
> >>>     512         128       122,000     122,000   153,000
> >>>    4096           1        25,000      25,000    22,700
> >>>    4096         128       122,000     121,000   157,000
> >>>   65536           1        14,300      14,400    13,600
> >>>   65536           4        36,700      36,700    36,600
> >>> 524288           1         3,470       3,430     3,420
> >>> 524288           4         5,020       5,020     4,990

> I'm interested to see how much your changes affected processing latency, 
> i.e. to measure execution latency before and after changes. You can't do 
> that with several threads, because latency = 1/bandwidth only if you 
> always have only one command at time. So, all those sophisticated 
> measurements can't substitute a plane old:

If my assumption that --numjobs=1 puts fio into a single-threaded mode
is correct, it seems that using this patch hurts individual command
latency, at least in a gross sense. The table listed above shows a ~9%
hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
requests, and ~1.4% for 512 KB requests. It seems to win at high request
counts and small block sizes, but it still seems to hurt performance at
larger request sizes, though those were tested with smaller thread
counts.

I've not reviewed the patch yet, but that's how I read the table above.
I'm assuming latency is hurt by the need to schedule the kernel thread,
but the batching helps increase the IOPS for small request sizes.

Bart, you could also try xdd as a benchmark tool.

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found]                 ` <1280774209.2451.10.camel-FqX9LgGZnHWDB2HL1qBt2PIbXMQ5te18@public.gmane.org>
@ 2010-08-02 18:40                   ` Bart Van Assche
       [not found]                     ` <AANLkTikYEvQfbWGLMZGZ_c+ggy0hAkiS9RAsBmGVKDDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
From: Bart Van Assche @ 2010-08-02 18:40 UTC (permalink / raw)
  To: David Dillow
  Cc: Vladislav Bolkhovitin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Roland Dreier, Ralph Campbell

On Mon, Aug 2, 2010 at 8:36 PM, David Dillow <dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org> wrote:
>
> On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
> > Bart Van Assche, on 08/02/2010 07:57 PM wrote:
> > >>>
> > >>> block size  number of    IOPS        IOPS      IOPS
> > >>>   in bytes    threads     without     with      with
> > >>>    ($bs)     ($numjobs)  this patch  thread=n  thread=y
> > >>>     512           1        25,400      25,400    23,100
> > >>>     512         128       122,000     122,000   153,000
> > >>>    4096           1        25,000      25,000    22,700
> > >>>    4096         128       122,000     121,000   157,000
> > >>>   65536           1        14,300      14,400    13,600
> > >>>   65536           4        36,700      36,700    36,600
> > >>> 524288           1         3,470       3,430     3,420
> > >>> 524288           4         5,020       5,020     4,990
>
> > I'm interested to see how much your changes affected processing latency,
> > i.e. to measure execution latency before and after changes. You can't do
> > that with several threads, because latency = 1/bandwidth only if you
> > always have only one command at time. So, all those sophisticated
> > measurements can't substitute a plane old:
>
> If my assumption that --numjobs=1 puts fio into a single-threaded mode
> is correct, it seems that using this patch hurts individual command
> latency, at least in a gross sense. The table listed above shows a ~9%
> hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
> requests, and ~1.4% for 512 KB requests. It seems to win @ lots of
> requests and small block sizes, but still seems to hurt performance at
> larger request sizes, though it seems they were tested with smaller
> thread counts.
>
> I've not reviewed the patch yet, but that's how I read the table above.
> I'm assuming latency is hurt by the need to schedule the kernel thread,
> but the batching helps increase the IOPS for low request sizes.

Please note that the user has to enable the thread=y mode explicitly. The
default is thread=n, and in that mode neither latency nor throughput is
affected by this patch.
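
For what it's worth, since the parameter is declared with permission 0444,
the currently active mode can be read back via sysfs:

  cat /sys/module/ib_srp/parameters/thread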

> Bart, you could also try xdd as a benchmark tool.

I'm familiar with xdd. However, I consider fio both more powerful and
easier to use than xdd.

Bart.

* Re: [PATCH] IB/srp: use multiple CPU cores more effectively
       [not found]                     ` <AANLkTikYEvQfbWGLMZGZ_c+ggy0hAkiS9RAsBmGVKDDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-08-02 19:07                       ` Vladislav Bolkhovitin
From: Vladislav Bolkhovitin @ 2010-08-02 19:07 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: David Dillow, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Roland Dreier,
	Ralph Campbell

Bart Van Assche, on 08/02/2010 10:40 PM wrote:
> On Mon, Aug 2, 2010 at 8:36 PM, David Dillow<dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org>  wrote:
>>
>> On Mon, 2010-08-02 at 22:16 +0400, Vladislav Bolkhovitin wrote:
>>> Bart Van Assche, on 08/02/2010 07:57 PM wrote:
>>>>>>
>>>>>> block size  number of    IOPS        IOPS      IOPS
>>>>>>    in bytes    threads     without     with      with
>>>>>>     ($bs)     ($numjobs)  this patch  thread=n  thread=y
>>>>>>      512           1        25,400      25,400    23,100
>>>>>>      512         128       122,000     122,000   153,000
>>>>>>     4096           1        25,000      25,000    22,700
>>>>>>     4096         128       122,000     121,000   157,000
>>>>>>    65536           1        14,300      14,400    13,600
>>>>>>    65536           4        36,700      36,700    36,600
>>>>>> 524288           1         3,470       3,430     3,420
>>>>>> 524288           4         5,020       5,020     4,990
>>
>>> I'm interested to see how much your changes affected processing latency,
>>> i.e. to measure execution latency before and after changes. You can't do
>>> that with several threads, because latency = 1/bandwidth only if you
>>> always have only one command at time. So, all those sophisticated
>>> measurements can't substitute a plane old:
>>
>> If my assumption that --numjobs=1 puts fio into a single-threaded mode
>> is correct, it seems that using this patch hurts individual command
>> latency, at least in a gross sense. The table listed above shows a ~9%
>> hit for single-threaded 0.5 KB and 4 KB requests, ~4.8% for 64 KB
>> requests, and ~1.4% for 512 KB requests. It seems to win @ lots of
>> requests and small block sizes, but still seems to hurt performance at
>> larger request sizes, though it seems they were tested with smaller
>> thread counts.
>>
>> I've not reviewed the patch yet, but that's how I read the table above.
>> I'm assuming latency is hurt by the need to schedule the kernel thread,
>> but the batching helps increase the IOPS for low request sizes.
>
> Please note that the user has to enable mode thread=y explicitly. The
> default mode is thread=n and in that mode neither latency nor
> throughput is affected by this patch.
>
>> Bart, you could also try xdd as a benchmark tool.
>
> I'm familiar with xdd. However, I consider fio both as more powerful
> and easier to user than xdd.

Bart, you simply can't measure your link/processing latency with it in a 
trustworthy manner. In my experience, it's too heavyweight to measure 
such small quantities, i.e. its internal overhead is >= the measured value. 
In scientific terms it means that you have an instrumental error of tens 
to hundreds of percent.

Vlad
