All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 7/7] dm-crypt: sort writes
@ 2015-02-13 13:27 Mikulas Patocka
  2015-02-13 21:01 ` Mike Snitzer
  2015-02-20  9:38 ` Performance testing of related dm-crypt patches Ondrej Kozina
  0 siblings, 2 replies; 4+ messages in thread
From: Mikulas Patocka @ 2015-02-13 13:27 UTC (permalink / raw)
  To: Mike Snitzer, Milan Broz, Ondrej Kozina, Alasdair G. Kergon; +Cc: dm-devel

Write requests are sorted in a red-black tree structure and are submitted
in the sorted order.

In theory the sorting should be performed by the underlying disk scheduler,
however, in practice the disk scheduler accepts and sorts only 128 requests.
In order to sort more requests, we need to implement our own sorting.

In testing, it was shown that this patch slightly increases performance in
some situations.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-crypt.c |   50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

Index: linux-3.19/drivers/md/dm-crypt.c
===================================================================
--- linux-3.19.orig/drivers/md/dm-crypt.c	2015-02-12 18:40:56.000000000 +0100
+++ linux-3.19/drivers/md/dm-crypt.c	2015-02-12 18:41:32.000000000 +0100
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
+#include <linux/rbtree.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
 #include <crypto/hash.h>
@@ -60,7 +61,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 
-	struct list_head list;
+	struct rb_node rb_node;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -133,7 +134,7 @@ struct crypt_config {
 
 	struct task_struct *write_thread;
 	wait_queue_head_t write_thread_wait;
-	struct list_head write_thread_list;
+	struct rb_root write_tree;
 
 	char *cipher;
 	char *cipher_string;
@@ -1172,7 +1173,7 @@ static int dmcrypt_write(void *data)
 {
 	struct crypt_config *cc = data;
 	while (1) {
-		struct list_head local_list;
+		struct rb_root write_tree;
 		struct blk_plug plug;
 
 		DECLARE_WAITQUEUE(wait, current);
@@ -1180,7 +1181,7 @@ static int dmcrypt_write(void *data)
 		spin_lock_irq(&cc->write_thread_wait.lock);
 continue_locked:
 
-		if (!list_empty(&cc->write_thread_list))
+		if (!RB_EMPTY_ROOT(&cc->write_tree))
 			goto pop_from_list;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
@@ -1202,20 +1203,23 @@ continue_locked:
 		goto continue_locked;
 
 pop_from_list:
-		local_list = cc->write_thread_list;
-		local_list.next->prev = &local_list;
-		local_list.prev->next = &local_list;
-		INIT_LIST_HEAD(&cc->write_thread_list);
-
+		write_tree = cc->write_tree;
+		cc->write_tree = RB_ROOT;
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
+		BUG_ON(rb_parent(write_tree.rb_node));
+
+		/*
+		 * Note: we cannot walk the tree here with rb_next because
+		 * the structures may be freed when kcryptd_io_write is called.
+		 */
 		blk_start_plug(&plug);
 		do {
-			struct dm_crypt_io *io = container_of(local_list.next,
-						struct dm_crypt_io, list);
-			list_del(&io->list);
+			struct dm_crypt_io *io = rb_entry(rb_first(&write_tree),
+						struct dm_crypt_io, rb_node);
+			rb_erase(&io->rb_node, &write_tree);
 			kcryptd_io_write(io);
-		} while (!list_empty(&local_list));
+		} while (!RB_EMPTY_ROOT(&write_tree));
 		blk_finish_plug(&plug);
 	}
 	return 0;
@@ -1226,6 +1230,8 @@ static void kcryptd_crypt_write_io_submi
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
 	unsigned long flags;
+	sector_t sector;
+	struct rb_node **p, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1245,7 +1251,21 @@ static void kcryptd_crypt_write_io_submi
 	}
 
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
-	list_add_tail(&io->list, &cc->write_thread_list);
+	p = &cc->write_tree.rb_node;
+	parent = NULL;
+	sector = io->sector;
+	while (*p) {
+		parent = *p;
+#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
+		if (sector < io_node->sector)
+			p = &io_node->rb_node.rb_left;
+		else
+			p = &io_node->rb_node.rb_right;
+#undef io_node
+	}
+	rb_link_node(&io->rb_node, parent, p);
+	rb_insert_color(&io->rb_node, &cc->write_tree);
+
 	wake_up_locked(&cc->write_thread_wait);
 	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
@@ -1827,7 +1847,7 @@ static int crypt_ctr(struct dm_target *t
 	}
 
 	init_waitqueue_head(&cc->write_thread_wait);
-	INIT_LIST_HEAD(&cc->write_thread_list);
+	cc->write_tree = RB_ROOT;
 
 	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
 	if (IS_ERR(cc->write_thread)) {

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 7/7] dm-crypt: sort writes
  2015-02-13 13:27 [PATCH 7/7] dm-crypt: sort writes Mikulas Patocka
@ 2015-02-13 21:01 ` Mike Snitzer
  2015-02-20  9:38 ` Performance testing of related dm-crypt patches Ondrej Kozina
  1 sibling, 0 replies; 4+ messages in thread
From: Mike Snitzer @ 2015-02-13 21:01 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Ondrej Kozina, dm-devel, Alasdair G. Kergon, Milan Broz

On Fri, Feb 13 2015 at  8:27P -0500,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> Write requests are sorted in a red-black tree structure and are submitted
> in the sorted order.
> 
> In theory the sorting should be performed by the underlying disk scheduler,
> however, in practice the disk scheduler accepts and sorts only 128 requests.
> In order to sort more requests, we need to implement our own sorting.
> 
> In testing, it was shown that this patch slightly increases performance in
> some situations
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

FYI, I've folded this patch in to cleanup rb_tree node access:

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 954ba1f..e6a1460 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1175,9 +1175,13 @@ static void kcryptd_io_write(struct dm_crypt_io *io)
 	generic_make_request(clone);
 }
 
+#define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node)
+
 static int dmcrypt_write(void *data)
 {
 	struct crypt_config *cc = data;
+	struct dm_crypt_io *io;
+
 	while (1) {
 		struct rb_root write_tree;
 		struct blk_plug plug;
@@ -1221,8 +1225,7 @@ pop_from_list:
 		 */
 		blk_start_plug(&plug);
 		do {
-			struct dm_crypt_io *io = rb_entry(rb_first(&write_tree),
-						struct dm_crypt_io, rb_node);
+			io = crypt_io_from_node(rb_first(&write_tree));
 			rb_erase(&io->rb_node, &write_tree);
 			kcryptd_io_write(io);
 		} while (!RB_EMPTY_ROOT(&write_tree));
@@ -1237,7 +1240,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	struct crypt_config *cc = io->cc;
 	unsigned long flags;
 	sector_t sector;
-	struct rb_node **p, *parent;
+	struct rb_node **rbp, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1252,19 +1255,17 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
-	p = &cc->write_tree.rb_node;
+	rbp = &cc->write_tree.rb_node;
 	parent = NULL;
 	sector = io->sector;
-	while (*p) {
-		parent = *p;
-#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
-		if (sector < io_node->sector)
-			p = &io_node->rb_node.rb_left;
+	while (*rbp) {
+		parent = *rbp;
+		if (sector < crypt_io_from_node(parent)->sector)
+			rbp = &(*rbp)->rb_left;
 		else
-			p = &io_node->rb_node.rb_right;
-#undef io_node
+			rbp = &(*rbp)->rb_right;
 	}
-	rb_link_node(&io->rb_node, parent, p);
+	rb_link_node(&io->rb_node, parent, rbp);
 	rb_insert_color(&io->rb_node, &cc->write_tree);
 
 	wake_up_locked(&cc->write_thread_wait);

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Performance testing of related dm-crypt patches
  2015-02-13 13:27 [PATCH 7/7] dm-crypt: sort writes Mikulas Patocka
  2015-02-13 21:01 ` Mike Snitzer
@ 2015-02-20  9:38 ` Ondrej Kozina
  2015-02-20 14:43   ` Mike Snitzer
  1 sibling, 1 reply; 4+ messages in thread
From: Ondrej Kozina @ 2015-02-20  9:38 UTC (permalink / raw)
  To: device-mapper development
  Cc: Mike Snitzer, Mikulas Patocka, Alasdair G. Kergon, Milan Broz

Hi,

the mail will be quite a big one so for better navigation I'm adding 
contents:

[1] Short resume of performance results
[2] Descriptions of test systems
[3] Detailed tests description
[4] Description of dm-crypt modules involved in testing
[5] dm-zero based test results
[6] spin drive based results
[7] spin drive based results (heavy load)
--------------------------------------------------------------------

[1] Short resume of performance results
---------------------------------------

Results for dm-crypt target mapped over dm-zero one (testing pure 
performance of dm-crypt only) show that unbounding the workqueue
is vastly beneficial for very fast devices. Offloading the requests to 
separate thread (before sorting the requests) has some cost (~10% 
compared to after the unbound workqueue patch applied) but it's not 
anything that would kill the performance seriously. Also results show 
that (CPU) price for sorting the requests before submitting to lower 
layer is negligible. Note that with dm-zero backend no I/O scheduler 
steps in.

With spin drives it's not so straightforward, but in summary there're 
still nice performance gains visible. Especially with larger block sizes 
(and deeper queues) the sorting patch improves the performance 
significantly and sometimes matches the performance of raw block device!

Unfortunately there are examples of workloads where even unbounding the 
queue or subsequent offloading of requests to separate thread can hurt 
performance so this is why we decided to introduce 2 switches in 
dm-crypt target constructor. More detailed explanation in [6] and [7].

[2] Descriptions of test systems
--------------------------------

numa_1 : single socket Intel system with 6 cores CPU and hyper-threading 
enabled (12 logical cores), 12GiB ram

numa_2 : two socket Intel system with 2x8 cores with HT enabled (32 
logical cores), 128 GiB ram

numa_4 : 4 socket AMD system with 4x4 cores no HT (16 logical cores), 
8GiB ram

numa_8 : 8 socket Intel system with 8x10 cores and HT enabled (160 
logical cores), 1 TiB ram

- All systems had additional storage attached so that spin drives were 
not shared with the system (with rootfs, swap, whatever)

- CPU throttling was disabled: especially all sleep states (except 
c-state 0) and turbo modes (if available)

- read/write caching disabled on spin drives

- test OS was RHEL7 with upstream kernel and custom dm-crypt patches 
(more on that in section [4])

[3] Detailed tests description
------------------------------

tested cipher passed to dm-crypt target: aes-xts-plain64

Tests were performing async sequental writes using fio and libaio 
library. Each test scenario ran repeatedly (5 to 10 iterations per each 
scenario) to rule out measurements error as much as possible or to 
detect some results for particular job were highly volatile (there were 
some)

Tests were based on two backends for dm-crypt mapping: spin drive or 
dm-zero target for measuring pure dm-crypt performance.

I used three basic scenarios:
"disk": single fio process writing sequentially dm-crypt mapped over 
spin drive (starting with device's origin)

"zero": single fio process writing sequentially dm-crypt mapped over dm-zero

"disk_heavy_load": sequential writes issued from multiple fio processes 
each process set bound to different CPU sockets writing to spin drive 
(under dm-crypt mapping). The device is divided uniformly between all 
sockets (and thus also all fio processes).

example of disk_heavy_load test with 3 fio processes per socket:
CPU0 (meant whole socket, not single core)
f0 f1 f2 (set of three individual fio processes bound to CPU0)
r0 (device region (linear segment) written by f0)

     CPU0--------CPU1-------CPU2
            |          |          |
  f0 f1 f2  | f3 f4 f5 | f6 f7 f8 |
   |  |  |  |  |  |  | |  |  |  | |
  r0 r1 r2  | r3 r4 r5 | r6 r7 r8 |

Result tables are composed from multiple lines that looks like following:

D iodepth=256, 32k, mode: write: 698461.10 14795.64 2.12 %
-    -----     ---                -----      -----   ----
|      |        |                   |          |      |
|      |        |                   |          |      v
|      |        |                   |          |  standard deviation
|      |        |                   |          v
|      |        |                   |     average deviation (KiB/s)
|      |        |                   v
|      |        |       sum of bandwidth all fio's (KiB/s)
|      |        v
|      v     block size
| max I/O queue depth
|
v
dm-crypt module name (see following section)

[4] Description of dm-crypt modules involved in testing
-------------------------------------------------------

Each line in results tables is prefixed with single letter meaning 
different dm-crypt module was involved in testing.

'_' stands for raw block device (used only within one "disk" test)

'A' stands for upstream kernel

'D' stands for following patches:
- dm crypt: remove unused io_pool and _crypt_io_pool
- dm crypt: avoid deadlock in mempools
- dm crypt: don't allocate pages for a partial request

'E' stands for following patch:
    dm crypt: use unbound workqueue for request processing (the option 
'same_cpu_crypt' turned off)

'F' stands for following patches:
- dm crypt: offload writes to thread
- dm crypt: add 'submit_from_crypt_cpus' option (but turned off)

'G' stands for following patch:
- dm crypt: sort writes ('submit_from_crypt_cpus' turned off)

[5] dm-zero based test results
------------------------------

"zero" test on single socket system: 
http://okozina.fedorapeople.org/dm-crypt-for-3.20/zero/numa_1/stats

"zero" test on 8 socket system: 
http://okozina.fedorapeople.org/dm-crypt-for-3.20/zero/numa_8/stats

full test results including fio job files and logs:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/zero/numa_1/test_zero_aio.tar.xz
http://okozina.fedorapeople.org/dm-crypt-for-3.20/zero/numa_8/test_zero_aio.tar.xz

[6] spin drive based results
----------------------------

"disk" test single socket system with cfq scheduler: 
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk/numa_1/stats
full test results including fio job files and logs:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk/numa_1/test_disk_aio.tar.xz

Usually, there's noticeable performance improvement starting with patch 
E in iodepth=8 and reasonably set bsize (4KiB and larger), but as you 
can see there're a few examples where offloading (and sorting) hurts the 
performance (iodepth=32, various block sizes).

With iodepth=256 there're some examples where unbounding the workqueue 
without offloading to single thread can hurt the performance 
(bsize=16KiB and 32KiB)

But in most cases we can say dm-crypt performance is pretty close to raw 
block device now.

[7] spin drive based results (heavy load)
-----------------------------------------

These tests were most complex. Tested both cfq and deadline schedulers, 
setting different nr_request parameter for device's scheduler queue.

Tests were spawning 1, 5 or 8 fio processes per CPU socket (8, 40 or 64 
processes in case of numa_8) in a system and performed i/o on same count 
of non-overlapping disk regions.

subdir /numj_1/ means: single process per cpu socket, /numj_5/: 5 
processes...

Unfortunately, there're workloads where unbounding the workqueue shows 
performance drop and subsequent offloading to single thread makes it 
even worse. (see 8 socket system, cfq, numj_1: 
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/cfq/nr_req_128/numj_1/stats).

Similar observations in 2 socket system, cfq, numj_1 
:http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/cfq/nr_req_128/numj_1/stats.

On both 2 socket and 8 socket system this observation fades away with 
adding more fio processes per socket.

Only 4 socket system (not so up to date AMD CPUs w/o HT) didn't show 
such pattern.

Generally with higher load, deeper ioqueues and larger block sizes, the 
sorting which takes place in the offload thread proves to do its job well.

*cfq* scheduler, nr_request=128:

2 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/cfq/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/cfq/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/cfq/nr_req_128/numj_8/stats

4 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/cfq/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/cfq/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/cfq/nr_req_128/numj_8/stats

8 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/cfq/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/cfq/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/cfq/nr_req_128/numj_8/stats

*deadline* scheduler, nr_request=128:

2 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/deadline/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/deadline/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/deadline/nr_req_128/numj_8/stats

4 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/deadline/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/deadline/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/deadline/nr_req_128/numj_8/stats

8 sockets system:
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/deadline/nr_req_128/numj_1/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/deadline/nr_req_128/numj_5/stats
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/deadline/nr_req_128/numj_8/stats

full test results including fio job files and logs (beware of archive 
unpacked has about 500MiBs):
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/test_disk_heavy_load.tar.xz
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_4/test_disk_heavy_load.tar.xz
http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/test_disk_heavy_load.tar.xz

Ondrej

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Performance testing of related dm-crypt patches
  2015-02-20  9:38 ` Performance testing of related dm-crypt patches Ondrej Kozina
@ 2015-02-20 14:43   ` Mike Snitzer
  0 siblings, 0 replies; 4+ messages in thread
From: Mike Snitzer @ 2015-02-20 14:43 UTC (permalink / raw)
  To: Ondrej Kozina
  Cc: device-mapper development, Mikulas Patocka, Alasdair G. Kergon,
	Milan Broz

On Fri, Feb 20 2015 at  4:38am -0500,
Ondrej Kozina <okozina@redhat.com> wrote:

> Hi,
> 
> the mail will be quite a big one so for better navigation I'm adding
> contents:
> 
> [1] Short resume of performance results
> [2] Descriptions of test systems
> [3] Detailed tests description
> [4] Description of dm-crypt modules involved in testing
> [5] dm-zero based test results
> [6] spin drive based results
> [7] spin drive based results (heavy load)
> --------------------------------------------------------------------
> 
> [1] Short resume of performance results
> ---------------------------------------
> 
> Results for dm-crypt target mapped over dm-zero one (testing pure
> performance of dm-crypt only) show that unbounding the workqueue
> is vastly beneficial for very fast devices. Offloading the requests
> to separate thread (before sorting the requests) has some cost (~10%
> compared to after the unbound workqueue patch applied) but it's not
> anything that would kill the performance seriously. Also results
> show that (CPU) price for sorting the requests before submitting to
> lower layer is negligible. Note that with dm-zero backend no I/O
> scheduler steps in.
> 
> With spin drives it's not so straightforward, but in summary
> there're still nice performance gains visible. Especially with
> larger block sizes (and deeper queues) the sorting patch improves
> the performance significantly and sometimes matches the performance
> of raw block device!
> 
> Unfortunately there are examples of workloads where even unbounding
> the queue or subsequent offloading of requests to separate thread
> can hurt performance so this is why we decided to introduce 2
> switches in dm-crypt target constructor. More detailed explanation
> in [6] and [7].

Overall the good definitely outweighs any bad though.  Thanks a lot for
all your work on this testing.
 
> [6] spin drive based results
> ----------------------------
> 
> "disk" test single socket system with cfq scheduler:
> http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk/numa_1/stats
> full test results including fio job files and logs:
> http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk/numa_1/test_disk_aio.tar.xz
> 
> Usually, there's noticeable performance improvement starting with
> patch E in iodepth=8 and reasonably set bsize (4KiB and larger), but
> as you can seen there're few examples where offloading (and sorting)
> hurts the performance (iodepth=32, various block sizes).
> 
> With iodepth=256 there're some examples where unbounding the
> workqueue without offloading to single thread can hurt the
> performance (bsize=16KiB and 32KiB)
> 
> But in most cases we can say dm-crypt performance is pretty close to
> raw block device now.

Yes, though there definitely seems something pathologically wrong with
the cases you pointed out.  But in 99% of all cases the end result of
the new changes is better than existing dm-crypt (G vs A).

> [7] spin drive based results (heavy load)
> -----------------------------------------
> 
> These tests were most complex. Tested both cfq and deadline
> schedulers, setting different nr_request parameter for device's
> scheduler queue.
> 
> Tests were spawning 1, 5 or 8 fio processes per CPU socket (8, 40 or
> 64 processes in case of numa_8) in a system and performed i/o on
> same count of non-overlapping disk regions.
> 
> subdir /numj_1/ means: single process per cpu socket, /numj_5/: 5
> processes...
> 
> Unfortunately, there're workloads where unbounding the workqueue
> shows performance drop and subsequent offloading to single thread
> makes it even worse. (see 8 socket system, cfq, numj_1: http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_8/cfq/nr_req_128/numj_1/stats).
> 
> Similar observations in 2 socket system, cfq, numj_1 :http://okozina.fedorapeople.org/dm-crypt-for-3.20/disk_heavy_load/numa_2/cfq/nr_req_128/numj_1/stats.
> 
> On both 2 socket and 8 socket system this observation fades away
> with adding more fio processes per socket.
> 
> Only 4 socket system (not so up to date AMD CPUs w/o HT) didn't show
> such pattern.
> 
> Generally with higher load, deeper ioqueues and larger block sizes,
> the sorting which takes place in offload thread proves to do it's
> job good.

It is interesting to note that deadline pretty consistently outperforms
CFQ.  Not too surprising considering all the extra logic that CFQ has.
But it is nice to see that with CFQ the new changes, targeting helping
CFQ, seem to be helping (to overcome CFQ's IO context constraints).

In the end I'm inclined to "ship it!".  I'll prep the pull for Linus
now.

Again, thanks for all your testing!

Mike

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-02-20 14:43 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-13 13:27 [PATCH 7/7] dm-crypt: sort writes Mikulas Patocka
2015-02-13 21:01 ` Mike Snitzer
2015-02-20  9:38 ` Performance testing of related dm-crypt patches Ondrej Kozina
2015-02-20 14:43   ` Mike Snitzer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.