All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/9] dm crypt: improve cpu scalability
@ 2014-03-28 20:11 Mike Snitzer
  2014-03-28 20:11 ` [PATCH 1/9] dm crypt: fix cpu hotplug crash by removing per-cpu structure Mike Snitzer
                   ` (8 more replies)
  0 siblings, 9 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

With these changes we hope to improve dm-crypt performance for a
wide-range of dm-crypt use-cases and configurations.  Extensive
testing is planned to identify the performance difference(s) as
compared to stock v3.14-rc8.  We'll follow up on this thread as results
become available.

These changes are staged in linux-dm.git 'for-next' -- that is
primarily for linux-next test coverage purposes.  We'll make a call on
whether any of these patches are included in 3.15 when we have the
benefit of performance comparison results.

Mikulas Patocka (9):
  dm crypt: fix cpu hotplug crash by removing per-cpu structure
  block: use kmalloc alignment for bio slab
  dm crypt: use per-bio data
  dm crypt: use unbound workqueue for request processing
  dm crypt: don't allocate pages for a partial request
  dm crypt: avoid deadlock in mempools
  dm crypt: remove io_pool
  dm crypt: offload writes to thread
  dm crypt: sort writes

 drivers/md/dm-crypt.c | 413 +++++++++++++++++++++++++-------------------------
 fs/bio.c              |   3 +-
 2 files changed, 209 insertions(+), 207 deletions(-)

-- 
1.8.1.4

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/9] dm crypt: fix cpu hotplug crash by removing per-cpu structure
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 2/9] block: use kmalloc alignment for bio slab Mike Snitzer
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

The DM crypt target used per-cpu structures to hold pointers to a
ablkcipher_request structure.  The code assumed that the work item keeps
executing on a single CPU, so it didn't use synchronization when
accessing this structure.

If a CPU is disabled by writing 0 to /sys/devices/system/cpu/cpu*/online,
the work item could be moved to another CPU.  This causes dm-crypt
crashes, like the following, because the code starts using an incorrect
ablkcipher_request:

 smpboot: CPU 7 is now offline
 BUG: unable to handle kernel NULL pointer dereference at 0000000000000130
 IP: [<ffffffffa1862b3d>] crypt_convert+0x12d/0x3c0 [dm_crypt]
 ...
 Call Trace:
  [<ffffffffa1864415>] ? kcryptd_crypt+0x305/0x470 [dm_crypt]
  [<ffffffff81062060>] ? finish_task_switch+0x40/0xc0
  [<ffffffff81052a28>] ? process_one_work+0x168/0x470
  [<ffffffff8105366b>] ? worker_thread+0x10b/0x390
  [<ffffffff81053560>] ? manage_workers.isra.26+0x290/0x290
  [<ffffffff81058d9f>] ? kthread+0xaf/0xc0
  [<ffffffff81058cf0>] ? kthread_create_on_node+0x120/0x120
  [<ffffffff813464ac>] ? ret_from_fork+0x7c/0xb0
  [<ffffffff81058cf0>] ? kthread_create_on_node+0x120/0x120

Fix this bug by removing the per-cpu definition.  The structure
ablkcipher_request is accessed via a pointer from convert_context.
Consequently, if the work item is rescheduled to a different CPU, the
thread still uses the same ablkcipher_request.

This change may undermine performance improvements intended by commit
c0297721 ("dm crypt: scale to multiple cpus") on select hardware.  In
practice no performance difference was observed on recent hardware.  But
regardless, correctness is more important than performance.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-crypt.c | 61 ++++++++++-----------------------------------------
 1 file changed, 12 insertions(+), 49 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 784695d..53b2132 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,6 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
 	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
+	struct ablkcipher_request *req;
 };
 
 /*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
 
 /*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
  */
 struct crypt_config {
 	struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
-	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
 	struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	if (!ctx->req)
+		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
+	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 /*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->cc_pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
 		/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc,
 			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
 
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
+	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 
 	return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
+	if (io->ctx.req)
+		mempool_free(io->ctx.req, cc->req_pool);
 	mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-		}
-
 	crypt_free_tfms(cc);
 
 	if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/9] block: use kmalloc alignment for bio slab
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
  2014-03-28 20:11 ` [PATCH 1/9] dm crypt: fix cpu hotplug crash by removing per-cpu structure Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 3/9] dm crypt: use per-bio data Mike Snitzer
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Various subsystems can ask the bio subsystem to create a bio slab cache
with some free space before the bio.  This free space can be used for any
purpose.  Device mapper uses this per-bio-data feature to place some
target-specific and device-mapper specific data before the bio, so that
the target-specific data doesn't have to be allocated separately.

This per-bio-data mechanism is used in place of kmalloc, so we need the
allocated slab to have the same memory alignment as memory allocated
with kmalloc.

Change bio_find_or_create_slab() so that it uses ARCH_KMALLOC_MINALIGN
alignment when creating the slab cache.  This is needed so that dm-crypt
can use per-bio-data for encryption - the crypto subsystem assumes this
data will have the same alignment as kmalloc'ed memory.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 fs/bio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b..371b646 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -112,7 +112,8 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 	bslab = &bio_slabs[entry];
 
 	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+	slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
+				 SLAB_HWCACHE_ALIGN, NULL);
 	if (!slab)
 		goto out_unlock;
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 3/9] dm crypt: use per-bio data
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
  2014-03-28 20:11 ` [PATCH 1/9] dm crypt: fix cpu hotplug crash by removing per-cpu structure Mike Snitzer
  2014-03-28 20:11 ` [PATCH 2/9] block: use kmalloc alignment for bio slab Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 4/9] dm crypt: use unbound workqueue for request processing Mike Snitzer
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Change dm-crypt so that it uses auxiliary data allocated with the bio.

Dm-crypt requires two allocations per request - struct dm_crypt_io and
struct ablkcipher_request (with other data appended to it).  It
previously only used mempool allocations.

Some requests may require more dm_crypt_ios and ablkcipher_requests,
however most requests need just one of each of these two structures to
complete.

This patch changes it so that the first dm_crypt_io and ablkcipher_request
are allocated with the bio (using target per_bio_data_size option).  If
the request needs additional values, they are allocated from the mempool.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53b2132..2221d12 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -59,7 +59,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 	struct dm_crypt_io *base_io;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
 	struct convert_context *ctx;
@@ -162,6 +162,8 @@ struct crypt_config {
 	 */
 	unsigned int dmreq_start;
 
+	unsigned int per_bio_data_size;
+
 	unsigned long flags;
 	unsigned int key_size;
 	unsigned int key_parts;      /* independent parts in key buffer */
@@ -895,6 +897,14 @@ static void crypt_alloc_req(struct crypt_config *cc,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
+static void crypt_free_req(struct crypt_config *cc,
+			   struct ablkcipher_request *req, struct bio *base_bio)
+{
+	struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size);
+	if ((struct ablkcipher_request *)(io + 1) != req)
+		mempool_free(req, cc->req_pool);
+}
+
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
@@ -1008,12 +1018,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
 	}
 }
 
-static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
-					  struct bio *bio, sector_t sector)
+static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
+			  struct bio *bio, sector_t sector)
 {
-	struct dm_crypt_io *io;
-
-	io = mempool_alloc(cc->io_pool, GFP_NOIO);
 	io->cc = cc;
 	io->base_bio = bio;
 	io->sector = sector;
@@ -1021,8 +1028,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
-
-	return io;
 }
 
 static void crypt_inc_pending(struct dm_crypt_io *io)
@@ -1046,8 +1051,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 		return;
 
 	if (io->ctx.req)
-		mempool_free(io->ctx.req, cc->req_pool);
-	mempool_free(io, cc->io_pool);
+		crypt_free_req(cc, io->ctx.req, base_bio);
+	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
+		mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
 		bio_endio(base_bio, error);
@@ -1255,8 +1261,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 		 * between fragments, so switch to a new dm_crypt_io structure.
 		 */
 		if (unlikely(!crypt_finished && remaining)) {
-			new_io = crypt_io_alloc(io->cc, io->base_bio,
-						sector);
+			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
+			crypt_io_init(new_io, io->cc, io->base_bio, sector);
 			crypt_inc_pending(new_io);
 			crypt_convert_init(cc, &new_io->ctx, NULL,
 					   io->base_bio, sector);
@@ -1325,7 +1331,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (error < 0)
 		io->error = -EIO;
 
-	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
+	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
 		return;
@@ -1728,6 +1734,10 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	cc->per_bio_data_size = ti->per_bio_data_size =
+				sizeof(struct dm_crypt_io) + cc->dmreq_start +
+				sizeof(struct dm_crypt_request) + cc->iv_size;
+
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
@@ -1824,7 +1834,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 
-	io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io = dm_per_bio_data(bio, cc->per_bio_data_size);
+	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));
+	io->ctx.req = (struct ablkcipher_request *)(io + 1);
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 4/9] dm crypt: use unbound workqueue for request processing
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (2 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 3/9] dm crypt: use per-bio data Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 5/9] dm crypt: don't allocate pages for a partial request Mike Snitzer
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Use unbound workqueue so that work is automatically balanced between
available CPUs.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 2221d12..a57b02a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1800,7 +1800,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	cc->crypt_queue = alloc_workqueue("kcryptd",
-					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+					  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 5/9] dm crypt: don't allocate pages for a partial request
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (3 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 4/9] dm crypt: use unbound workqueue for request processing Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 6/9] dm crypt: avoid deadlock in mempools Mike Snitzer
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Change crypt_alloc_buffer so that it only ever allocates pages for a
full request.

This change enables further simplification and the removal of one refcount
in the next patches.

Note: the next commit ("dm-crypt: avoid deadlock in mempools") is needed
to fix a theoretical deadlock.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 133 ++++++++++----------------------------------------
 1 file changed, 27 insertions(+), 106 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index a57b02a..fe92764 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -58,7 +58,6 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
-	struct dm_crypt_io *base_io;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -172,7 +171,6 @@ struct crypt_config {
 };
 
 #define MIN_IOS        16
-#define MIN_POOL_PAGES 32
 
 static struct kmem_cache *_crypt_io_pool;
 
@@ -951,14 +949,13 @@ static int crypt_convert(struct crypt_config *cc,
 	return 0;
 }
 
+static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
+
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
- * May return a smaller bio when running out of pages, indicated by
- * *out_of_pages set to 1.
  */
-static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
-				      unsigned *out_of_pages)
+static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
@@ -972,37 +969,23 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
 		return NULL;
 
 	clone_init(io, clone);
-	*out_of_pages = 0;
 
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
-		if (!page) {
-			*out_of_pages = 1;
-			break;
-		}
-
-		/*
-		 * If additional pages cannot be allocated without waiting,
-		 * return a partially-allocated bio.  The caller will then try
-		 * to allocate more bios while submitting this partial bio.
-		 */
-		gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
 
 		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
 		if (!bio_add_page(clone, page, len, 0)) {
+			DMERR("bio_add_page failed for page %d: the underlying device has stricter limits than dm-crypt target", i);
 			mempool_free(page, cc->page_pool);
-			break;
+			crypt_free_buffer_pages(cc, clone);
+			bio_put(clone);
+			return NULL;
 		}
 
 		size -= len;
 	}
 
-	if (!clone->bi_iter.bi_size) {
-		bio_put(clone);
-		return NULL;
-	}
-
 	return clone;
 }
 
@@ -1025,7 +1008,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
 	io->base_bio = bio;
 	io->sector = sector;
 	io->error = 0;
-	io->base_io = NULL;
 	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 }
@@ -1038,13 +1020,11 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
 /*
  * One of the bios was finished. Check for completion of
  * the whole request and correctly clean up the buffer.
- * If base_io is set, wait for the last fragment to complete.
  */
 static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	struct dm_crypt_io *base_io = io->base_io;
 	int error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
@@ -1055,13 +1035,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
 		mempool_free(io, cc->io_pool);
 
-	if (likely(!base_io))
-		bio_endio(base_bio, error);
-	else {
-		if (error && !base_io->error)
-			base_io->error = error;
-		crypt_dec_pending(base_io);
-	}
+	bio_endio(base_bio, error);
 }
 
 /*
@@ -1197,10 +1171,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
-	struct dm_crypt_io *new_io;
 	int crypt_finished;
-	unsigned out_of_pages = 0;
-	unsigned remaining = io->base_bio->bi_iter.bi_size;
 	sector_t sector = io->sector;
 	int r;
 
@@ -1210,80 +1181,30 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	crypt_inc_pending(io);
 	crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector);
 
-	/*
-	 * The allocated buffers can be smaller than the whole bio,
-	 * so repeat the whole process until all the data can be handled.
-	 */
-	while (remaining) {
-		clone = crypt_alloc_buffer(io, remaining, &out_of_pages);
-		if (unlikely(!clone)) {
-			io->error = -ENOMEM;
-			break;
-		}
-
-		io->ctx.bio_out = clone;
-		io->ctx.iter_out = clone->bi_iter;
-
-		remaining -= clone->bi_iter.bi_size;
-		sector += bio_sectors(clone);
-
-		crypt_inc_pending(io);
-
-		r = crypt_convert(cc, &io->ctx);
-		if (r < 0)
-			io->error = -EIO;
-
-		crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
-
-		/* Encryption was already finished, submit io now */
-		if (crypt_finished) {
-			kcryptd_crypt_write_io_submit(io, 0);
-
-			/*
-			 * If there was an error, do not try next fragments.
-			 * For async, error is processed in async handler.
-			 */
-			if (unlikely(r < 0))
-				break;
+	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
+	if (unlikely(!clone)) {
+		io->error = -EIO;
+		goto dec;
+	}
 
-			io->sector = sector;
-		}
+	io->ctx.bio_out = clone;
+	io->ctx.iter_out = clone->bi_iter;
 
-		/*
-		 * Out of memory -> run queues
-		 * But don't wait if split was due to the io size restriction
-		 */
-		if (unlikely(out_of_pages))
-			congestion_wait(BLK_RW_ASYNC, HZ/100);
+	sector += bio_sectors(clone);
 
-		/*
-		 * With async crypto it is unsafe to share the crypto context
-		 * between fragments, so switch to a new dm_crypt_io structure.
-		 */
-		if (unlikely(!crypt_finished && remaining)) {
-			new_io = mempool_alloc(cc->io_pool, GFP_NOIO);
-			crypt_io_init(new_io, io->cc, io->base_bio, sector);
-			crypt_inc_pending(new_io);
-			crypt_convert_init(cc, &new_io->ctx, NULL,
-					   io->base_bio, sector);
-			new_io->ctx.iter_in = io->ctx.iter_in;
-
-			/*
-			 * Fragments after the first use the base_io
-			 * pending count.
-			 */
-			if (!io->base_io)
-				new_io->base_io = io;
-			else {
-				new_io->base_io = io->base_io;
-				crypt_inc_pending(io->base_io);
-				crypt_dec_pending(io);
-			}
+	crypt_inc_pending(io);
+	r = crypt_convert(cc, &io->ctx);
+	if (r)
+		io->error = -EIO;
+	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
-			io = new_io;
-		}
+	/* Encryption was already finished, submit io now */
+	if (crypt_finished) {
+		kcryptd_crypt_write_io_submit(io, 0);
+		io->sector = sector;
 	}
 
+dec:
 	crypt_dec_pending(io);
 }
 
@@ -1738,7 +1659,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 				sizeof(struct dm_crypt_io) + cc->dmreq_start +
 				sizeof(struct dm_crypt_request) + cc->iv_size;
 
-	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+	cc->page_pool = mempool_create_page_pool(BIO_MAX_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = "Cannot allocate page mempool";
 		goto bad;
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 6/9] dm crypt: avoid deadlock in mempools
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (4 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 5/9] dm crypt: don't allocate pages for a partial request Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 7/9] dm crypt: remove io_pool Mike Snitzer
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Fix a theoretical deadlock introduced in the previous commit ("dm crypt:
don't allocate pages for a partial request").

The function crypt_alloc_buffer may be called concurrently.  If we allocate
from the mempool concurrently, there is a possibility of deadlock.  For
example, if we have mempool of 256 pages, two processes, each wanting
256, pages allocate from the mempool concurrently, it may deadlock in a
situation where both processes have allocated 128 pages and the mempool
is exhausted.

In order to avoid such a scenario, we allocate the pages under a mutex.

In order to not degrade performance with excessive locking, we try
non-blocking allocations without a mutex first and if it fails, we
fallback to a blocking allocation with a mutex.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fe92764..d7d7023 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -124,6 +124,7 @@ struct crypt_config {
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
+	struct mutex bio_alloc_lock;
 
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
@@ -954,24 +955,46 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
 /*
  * Generate a new unfragmented bio with the given size
  * This should never violate the device limitations
+ *
+ * This function may be called concurrently. If we allocate from the mempool
+ * concurrently, there is a possibility of deadlock. For example, if we have
+ * mempool of 256 pages, two processes, each wanting 256, pages allocate from
+ * the mempool concurrently, it may deadlock in a situation where both processes
+ * have allocated 128 pages and the mempool is exhausted.
+ *
+ * In order to avoid this scenarios, we allocate the pages under a mutex.
+ *
+ * In order to not degrade performance with excessive locking, we try
+ * non-blocking allocations without a mutex first and if it fails, we fallback
+ * to a blocking allocation with a mutex.
  */
 static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *clone;
 	unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
+	gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM;
 	unsigned i, len;
 	struct page *page;
 
+retry:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_lock(&cc->bio_alloc_lock);
+
 	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
 	if (!clone)
-		return NULL;
+		goto return_clone;
 
 	clone_init(io, clone);
 
 	for (i = 0; i < nr_iovecs; i++) {
 		page = mempool_alloc(cc->page_pool, gfp_mask);
+		if (!page) {
+			crypt_free_buffer_pages(cc, clone);
+			bio_put(clone);
+			gfp_mask |= __GFP_WAIT;
+			goto retry;
+		}
 
 		len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
 
@@ -980,12 +1003,17 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size)
 			mempool_free(page, cc->page_pool);
 			crypt_free_buffer_pages(cc, clone);
 			bio_put(clone);
-			return NULL;
+			clone = NULL;
+			goto return_clone;
 		}
 
 		size -= len;
 	}
 
+return_clone:
+	if (unlikely(gfp_mask & __GFP_WAIT))
+		mutex_unlock(&cc->bio_alloc_lock);
+
 	return clone;
 }
 
@@ -1671,6 +1699,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	mutex_init(&cc->bio_alloc_lock);
+
 	ret = -EINVAL;
 	if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
 		ti->error = "Invalid iv_offset sector";
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 7/9] dm crypt: remove io_pool
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (5 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 6/9] dm crypt: avoid deadlock in mempools Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-28 20:11 ` [PATCH 8/9] dm crypt: offload writes to thread Mike Snitzer
  2014-03-28 20:11 ` [PATCH 9/9] dm crypt: sort writes Mike Snitzer
  8 siblings, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Remove io_pool and _crypt_io_pool because they are unused.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d7d7023..8f91531 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -120,7 +120,6 @@ struct crypt_config {
 	 * pool for per bio private data, crypto requests and
 	 * encryption requeusts/buffer pages
 	 */
-	mempool_t *io_pool;
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
@@ -173,8 +172,6 @@ struct crypt_config {
 
 #define MIN_IOS        16
 
-static struct kmem_cache *_crypt_io_pool;
-
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
@@ -1060,8 +1057,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 
 	if (io->ctx.req)
 		crypt_free_req(cc, io->ctx.req, base_bio);
-	if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size))
-		mempool_free(io, cc->io_pool);
 
 	bio_endio(base_bio, error);
 }
@@ -1449,8 +1444,6 @@ static void crypt_dtr(struct dm_target *ti)
 		mempool_destroy(cc->page_pool);
 	if (cc->req_pool)
 		mempool_destroy(cc->req_pool);
-	if (cc->io_pool)
-		mempool_destroy(cc->io_pool);
 
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
@@ -1663,19 +1656,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (ret < 0)
 		goto bad;
 
-	ret = -ENOMEM;
-	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
-	if (!cc->io_pool) {
-		ti->error = "Cannot allocate crypt io mempool";
-		goto bad;
-	}
-
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
 	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
 	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
+	ret = -ENOMEM;
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
 			sizeof(struct dm_crypt_request) + cc->iv_size);
 	if (!cc->req_pool) {
@@ -1937,14 +1924,9 @@ static int __init dm_crypt_init(void)
 {
 	int r;
 
-	_crypt_io_pool = KMEM_CACHE(dm_crypt_io, 0);
-	if (!_crypt_io_pool)
-		return -ENOMEM;
-
 	r = dm_register_target(&crypt_target);
 	if (r < 0) {
 		DMERR("register failed %d", r);
-		kmem_cache_destroy(_crypt_io_pool);
 	}
 
 	return r;
@@ -1953,7 +1935,6 @@ static int __init dm_crypt_init(void)
 static void __exit dm_crypt_exit(void)
 {
 	dm_unregister_target(&crypt_target);
-	kmem_cache_destroy(_crypt_io_pool);
 }
 
 module_init(dm_crypt_init);
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 8/9] dm crypt: offload writes to thread
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (6 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 7/9] dm crypt: remove io_pool Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-04-01 16:27   ` Ondrej Kozina
  2014-03-28 20:11 ` [PATCH 9/9] dm crypt: sort writes Mike Snitzer
  8 siblings, 1 reply; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Submitting write bios directly in the encryption thread caused serious
performance degradation.  On a multiprocessor machine, encryption requests
finish in a different order than they were submitted.  Consequently, write
requests would be submitted in a different order and it could cause severe
performance degradation.

Move the submission of write requests to a separate thread so that the
requests can be sorted before submitting.  Sorting is implemented in the
next commit ("dm crypt: sort writes").

Note: it is required that a previous commit ("dm crypt: don't allocate
pages for a partial request") be applied before applying this patch.
Otherwise, this commit could introduce a crash.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 120 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 97 insertions(+), 23 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8f91531..8bbe3d5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
@@ -58,6 +59,8 @@ struct dm_crypt_io {
 	atomic_t io_pending;
 	int error;
 	sector_t sector;
+
+	struct list_head list;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -128,6 +131,10 @@ struct crypt_config {
 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;
 
+	struct task_struct *write_thread;
+	wait_queue_head_t write_thread_wait;
+	struct list_head write_thread_list;
+
 	char *cipher;
 	char *cipher_string;
 
@@ -1140,37 +1147,89 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 	return 0;
 }
 
+static void kcryptd_io_read_work(struct work_struct *work)
+{
+	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+
+	crypt_inc_pending(io);
+	if (kcryptd_io_read(io, GFP_NOIO))
+		io->error = -ENOMEM;
+	crypt_dec_pending(io);
+}
+
+static void kcryptd_queue_read(struct dm_crypt_io *io)
+{
+	struct crypt_config *cc = io->cc;
+
+	INIT_WORK(&io->work, kcryptd_io_read_work);
+	queue_work(cc->io_queue, &io->work);
+}
+
 static void kcryptd_io_write(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
+
 	generic_make_request(clone);
 }
 
-static void kcryptd_io(struct work_struct *work)
+static int dmcrypt_write(void *data)
 {
-	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
+	struct crypt_config *cc = data;
+	while (1) {
+		struct list_head local_list;
+		struct blk_plug plug;
 
-	if (bio_data_dir(io->base_bio) == READ) {
-		crypt_inc_pending(io);
-		if (kcryptd_io_read(io, GFP_NOIO))
-			io->error = -ENOMEM;
-		crypt_dec_pending(io);
-	} else
-		kcryptd_io_write(io);
-}
+		DECLARE_WAITQUEUE(wait, current);
 
-static void kcryptd_queue_io(struct dm_crypt_io *io)
-{
-	struct crypt_config *cc = io->cc;
+		spin_lock_irq(&cc->write_thread_wait.lock);
+continue_locked:
 
-	INIT_WORK(&io->work, kcryptd_io);
-	queue_work(cc->io_queue, &io->work);
+		if (!list_empty(&cc->write_thread_list))
+			goto pop_from_list;
+
+		__set_current_state(TASK_INTERRUPTIBLE);
+		__add_wait_queue(&cc->write_thread_wait, &wait);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		if (unlikely(kthread_should_stop())) {
+			set_task_state(current, TASK_RUNNING);
+			remove_wait_queue(&cc->write_thread_wait, &wait);
+			break;
+		}
+
+		schedule();
+
+		set_task_state(current, TASK_RUNNING);
+		spin_lock_irq(&cc->write_thread_wait.lock);
+		__remove_wait_queue(&cc->write_thread_wait, &wait);
+		goto continue_locked;
+
+pop_from_list:
+		local_list = cc->write_thread_list;
+		local_list.next->prev = &local_list;
+		local_list.prev->next = &local_list;
+		INIT_LIST_HEAD(&cc->write_thread_list);
+
+		spin_unlock_irq(&cc->write_thread_wait.lock);
+
+		blk_start_plug(&plug);
+		do {
+			struct dm_crypt_io *io = container_of(local_list.next,
+						struct dm_crypt_io, list);
+			list_del(&io->list);
+			kcryptd_io_write(io);
+		} while (!list_empty(&local_list));
+		blk_finish_plug(&plug);
+	}
+	return 0;
 }
 
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io)
 {
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
+	unsigned long flags;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1184,10 +1243,10 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
-	if (async)
-		kcryptd_queue_io(io);
-	else
-		generic_make_request(clone);
+	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
+	list_add_tail(&io->list, &cc->write_thread_list);
+	wake_up_locked(&cc->write_thread_wait);
+	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
 
 static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
@@ -1223,7 +1282,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 
 	/* Encryption was already finished, submit io now */
 	if (crypt_finished) {
-		kcryptd_crypt_write_io_submit(io, 0);
+		kcryptd_crypt_write_io_submit(io);
 		io->sector = sector;
 	}
 
@@ -1283,7 +1342,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_crypt_read_done(io);
 	else
-		kcryptd_crypt_write_io_submit(io, 1);
+		kcryptd_crypt_write_io_submit(io);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
@@ -1430,6 +1489,9 @@ static void crypt_dtr(struct dm_target *ti)
 	if (!cc)
 		return;
 
+	if (cc->write_thread)
+		kthread_stop(cc->write_thread);
+
 	if (cc->io_queue)
 		destroy_workqueue(cc->io_queue);
 	if (cc->crypt_queue)
@@ -1744,6 +1806,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	init_waitqueue_head(&cc->write_thread_wait);
+	INIT_LIST_HEAD(&cc->write_thread_list);
+
+	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+	if (IS_ERR(cc->write_thread)) {
+		ret = PTR_ERR(cc->write_thread);
+		cc->write_thread = NULL;
+		ti->error = "Couldn't spawn write thread";
+		goto bad;
+	}
+	wake_up_process(cc->write_thread);
+
 	ti->num_flush_bios = 1;
 	ti->discard_zeroes_data_unsupported = true;
 
@@ -1778,7 +1852,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 
 	if (bio_data_dir(io->base_bio) == READ) {
 		if (kcryptd_io_read(io, GFP_NOWAIT))
-			kcryptd_queue_io(io);
+			kcryptd_queue_read(io);
 	} else
 		kcryptd_queue_crypt(io);
 
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 9/9] dm crypt: sort writes
  2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
                   ` (7 preceding siblings ...)
  2014-03-28 20:11 ` [PATCH 8/9] dm crypt: offload writes to thread Mike Snitzer
@ 2014-03-28 20:11 ` Mike Snitzer
  2014-03-29  8:11   ` Milan Broz
  8 siblings, 1 reply; 27+ messages in thread
From: Mike Snitzer @ 2014-03-28 20:11 UTC (permalink / raw)
  To: dm-devel

From: Mikulas Patocka <mpatocka@redhat.com>

Write requests are sorted in a red-black tree structure and are submitted
in the sorted order.

In theory the sorting should be performed by the underlying disk scheduler,
however, in practice the disk scheduler accepts and sorts only 128 requests.
In order to sort more requests, we need to implement our own sorting.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8bbe3d5..3f55b25 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -22,6 +22,7 @@
 #include <linux/backing-dev.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
+#include <linux/rbtree.h>
 #include <asm/page.h>
 #include <asm/unaligned.h>
 #include <crypto/hash.h>
@@ -60,7 +61,7 @@ struct dm_crypt_io {
 	int error;
 	sector_t sector;
 
-	struct list_head list;
+	struct rb_node rb_node;
 } CRYPTO_MINALIGN_ATTR;
 
 struct dm_crypt_request {
@@ -133,7 +134,7 @@ struct crypt_config {
 
 	struct task_struct *write_thread;
 	wait_queue_head_t write_thread_wait;
-	struct list_head write_thread_list;
+	struct rb_root write_tree;
 
 	char *cipher;
 	char *cipher_string;
@@ -1176,7 +1177,7 @@ static int dmcrypt_write(void *data)
 {
 	struct crypt_config *cc = data;
 	while (1) {
-		struct list_head local_list;
+		struct rb_root write_tree;
 		struct blk_plug plug;
 
 		DECLARE_WAITQUEUE(wait, current);
@@ -1184,7 +1185,7 @@ static int dmcrypt_write(void *data)
 		spin_lock_irq(&cc->write_thread_wait.lock);
 continue_locked:
 
-		if (!list_empty(&cc->write_thread_list))
+		if (!RB_EMPTY_ROOT(&cc->write_tree))
 			goto pop_from_list;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
@@ -1206,20 +1207,23 @@ continue_locked:
 		goto continue_locked;
 
 pop_from_list:
-		local_list = cc->write_thread_list;
-		local_list.next->prev = &local_list;
-		local_list.prev->next = &local_list;
-		INIT_LIST_HEAD(&cc->write_thread_list);
-
+		write_tree = cc->write_tree;
+		cc->write_tree = RB_ROOT;
 		spin_unlock_irq(&cc->write_thread_wait.lock);
 
+		BUG_ON(rb_parent(write_tree.rb_node));
+
+		/*
+		 * Note: we cannot walk the tree here with rb_next because
+		 * the structures may be freed when kcryptd_io_write is called.
+		 */
 		blk_start_plug(&plug);
 		do {
-			struct dm_crypt_io *io = container_of(local_list.next,
-						struct dm_crypt_io, list);
-			list_del(&io->list);
+			struct dm_crypt_io *io = rb_entry(rb_first(&write_tree),
+						struct dm_crypt_io, rb_node);
+			rb_erase(&io->rb_node, &write_tree);
 			kcryptd_io_write(io);
-		} while (!list_empty(&local_list));
+		} while (!RB_EMPTY_ROOT(&write_tree));
 		blk_finish_plug(&plug);
 	}
 	return 0;
@@ -1230,6 +1234,8 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io)
 	struct bio *clone = io->ctx.bio_out;
 	struct crypt_config *cc = io->cc;
 	unsigned long flags;
+	sector_t sector;
+	struct rb_node **p, *parent;
 
 	if (unlikely(io->error < 0)) {
 		crypt_free_buffer_pages(cc, clone);
@@ -1244,7 +1250,21 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io)
 	clone->bi_iter.bi_sector = cc->start + io->sector;
 
 	spin_lock_irqsave(&cc->write_thread_wait.lock, flags);
-	list_add_tail(&io->list, &cc->write_thread_list);
+	p = &cc->write_tree.rb_node;
+	parent = NULL;
+	sector = io->sector;
+	while (*p) {
+		parent = *p;
+#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
+		if (sector < io_node->sector)
+			p = &io_node->rb_node.rb_left;
+		else
+			p = &io_node->rb_node.rb_right;
+#undef io_node
+	}
+	rb_link_node(&io->rb_node, parent, p);
+	rb_insert_color(&io->rb_node, &cc->write_tree);
+
 	wake_up_locked(&cc->write_thread_wait);
 	spin_unlock_irqrestore(&cc->write_thread_wait.lock, flags);
 }
@@ -1807,7 +1827,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	init_waitqueue_head(&cc->write_thread_wait);
-	INIT_LIST_HEAD(&cc->write_thread_list);
+	cc->write_tree = RB_ROOT;
 
 	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
 	if (IS_ERR(cc->write_thread)) {
-- 
1.8.1.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-03-28 20:11 ` [PATCH 9/9] dm crypt: sort writes Mike Snitzer
@ 2014-03-29  8:11   ` Milan Broz
  2014-03-31 12:39     ` Mike Snitzer
  0 siblings, 1 reply; 27+ messages in thread
From: Milan Broz @ 2014-03-29  8:11 UTC (permalink / raw)
  To: device-mapper development

On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> From: Mikulas Patocka <mpatocka@redhat.com>
> 
> Write requests are sorted in a red-black tree structure and are submitted
> in the sorted order.
> 
> In theory the sorting should be performed by the underlying disk scheduler,
> however, in practice the disk scheduler accepts and sorts only 128 requests.
> In order to sort more requests, we need to implement our own sorting.

Hi,

I think it would be nice to mention why simply increasing queue nr_request
doesn't help. It is definitely not limited to 128, it is only default value.

(I just wonder how this helps for SSDs where I think it depends what's fw
doing with io requests anyway).

It would be nice to have sysfs switch to disable sorting in dmcrypt but that
can be added later.

Anyway, thanks for rebasing these patches!

...
> +#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
> +		if (sector < io_node->sector)
> +			p = &io_node->rb_node.rb_left;
> +		else
> +			p = &io_node->rb_node.rb_right;
> +#undef io_node

Btw, could this be switched to inline function instead of define,
or it is only me who thinks #define here is ugly? :)

Thanks,
Milan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-03-29  8:11   ` Milan Broz
@ 2014-03-31 12:39     ` Mike Snitzer
  2014-03-31 23:37       ` Akira Hayakawa
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Snitzer @ 2014-03-31 12:39 UTC (permalink / raw)
  To: Milan Broz; +Cc: device-mapper development

On Sat, Mar 29 2014 at  4:11am -0400,
Milan Broz <gmazyland@gmail.com> wrote:

> On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> > From: Mikulas Patocka <mpatocka@redhat.com>
> > 
> > Write requests are sorted in a red-black tree structure and are submitted
> > in the sorted order.
> > 
> > In theory the sorting should be performed by the underlying disk scheduler,
> > however, in practice the disk scheduler accepts and sorts only 128 requests.
> > In order to sort more requests, we need to implement our own sorting.
> 
> Hi,
> 
> I think it would be nice to mention why simply increasing queue nr_request
> doesn't help. It is definitely not limited to 128, it is only default value.
> 
> (I just wonder how this helps for SSDs where I think it depends what's fw
> doing with io requests anyway).

Obviously it is less of a concern on SSD but I think we'll find it still is
beneficial.

> It would be nice to have sysfs switch to disable sorting in dmcrypt but that
> can be added later.

Not sure sysfs is the right place to put this.  Would prefer to use a DM
message to toggle it.  And possibly default to off if the backing
storage is non-rotational.

> Anyway, thanks for rebasing these patches!
> 
> ...
> > +#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
> > +		if (sector < io_node->sector)
> > +			p = &io_node->rb_node.rb_left;
> > +		else
> > +			p = &io_node->rb_node.rb_right;
> > +#undef io_node
> 
> Btw, could this be switched to inline function instead of define,
> or it is only me who thinks #define here is ugly? :)

I wasn't a big fan of the #define .. #undef but I didn't see the need to
change it if Mikulas felt more comfortable with it this way.

I recently wrote some rbtree code for dm-thinp, it also uses a #define
but outside of the function body (above the function), see:
https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next&id=a43260f01a3b6f5ef33d0abc86e9b0a92096cd84

We could change this code in a similar way.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-03-31 12:39     ` Mike Snitzer
@ 2014-03-31 23:37       ` Akira Hayakawa
  2014-04-01  1:01         ` Mike Snitzer
  0 siblings, 1 reply; 27+ messages in thread
From: Akira Hayakawa @ 2014-03-31 23:37 UTC (permalink / raw)
  To: dm-devel

Hi,

This is just a comment.

I like this sorting and I would like to apply this sorting code also to dm-writeboost.
https://github.com/akiradeveloper/dm-writeboost

Hope that improves the writeback.
dm-writeboost writes back multiple (e.g. 4KB * 1000) data in cache device at once
hoping that the scheduler sorts them implicitly.
Explicitly sorting them possibly makes writeback more efficient.

Thanks for the code.

--
Akira

On Mon, 31 Mar 2014 08:39:40 -0400
Mike Snitzer <snitzer@redhat.com> wrote:

> On Sat, Mar 29 2014 at  4:11am -0400,
> Milan Broz <gmazyland@gmail.com> wrote:
> 
> > On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> > > From: Mikulas Patocka <mpatocka@redhat.com>
> > > 
> > > Write requests are sorted in a red-black tree structure and are submitted
> > > in the sorted order.
> > > 
> > > In theory the sorting should be performed by the underlying disk scheduler,
> > > however, in practice the disk scheduler accepts and sorts only 128 requests.
> > > In order to sort more requests, we need to implement our own sorting.
> > 
> > Hi,
> > 
> > I think it would be nice to mention why simply increasing queue nr_request
> > doesn't help. It is definitely not limited to 128, it is only default value.
> > 
> > (I just wonder how this helps for SSDs where I think it depends what's fw
> > doing with io requests anyway).
> 
> Obviously it is less of a concern on SSD but I think we'll find it still is
> beneficial.
> 
> > It would be nice to have sysfs switch to disable sorting in dmcrypt but that
> > can be added later.
> 
> Not sure sysfs is the right place to put this.  Would prefer to use a DM
> message to toggle it.  And possibly default to off if the backing
> storage is non-rotational.
> 
> > Anyway, thanks for rebasing these patches!
> > 
> > ...
> > > +#define io_node rb_entry(parent, struct dm_crypt_io, rb_node)
> > > +		if (sector < io_node->sector)
> > > +			p = &io_node->rb_node.rb_left;
> > > +		else
> > > +			p = &io_node->rb_node.rb_right;
> > > +#undef io_node
> > 
> > Btw, could this be switched to inline function instead of define,
> > or it is only me who thinks #define here is ugly? :)
> 
> I wasn't a big fan of the #define .. #undef but I didn't see the need to
> change it if Mikulas felt more comfortable with it this way.
> 
> I recently wrote some rbtree code for dm-thinp, it also uses a #define
> but outside of the function body (above the function), see:
> https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next&id=a43260f01a3b6f5ef33d0abc86e9b0a92096cd84
> 
> We could change this code in a similar way.
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-03-31 23:37       ` Akira Hayakawa
@ 2014-04-01  1:01         ` Mike Snitzer
  2014-04-01 17:35           ` Milan Broz
  2014-04-01 23:21           ` Akira Hayakawa
  0 siblings, 2 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-04-01  1:01 UTC (permalink / raw)
  To: Akira Hayakawa; +Cc: dm-devel

On Mon, Mar 31 2014 at  7:37pm -0400,
Akira Hayakawa <hayakawa@valinux.co.jp> wrote:

> Hi,
> 
> This is just a comment.
> 
> I like this sorting and I would like to apply this sorting code also to dm-writeboost.
> https://github.com/akiradeveloper/dm-writeboost
> 
> Hope that improves the writeback.
> dm-writeboost writes back multiple (e.g. 4KB * 1000) data in cache device at once
> hoping that the scheduler sorts them implicitly.
> Explicitly sorting them possibly makes writeback more efficient.
> 
> Thanks for the code.

OK.  FYI, I cleaned the code up a little today based on Milan's dislike
for the #define within the function, see:
https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next&id=039e943277604c3f4f6b4ee803b8793627d53d36

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-03-28 20:11 ` [PATCH 8/9] dm crypt: offload writes to thread Mike Snitzer
@ 2014-04-01 16:27   ` Ondrej Kozina
  2014-04-01 16:32     ` Mike Snitzer
  2014-04-01 18:01     ` Mikulas Patocka
  0 siblings, 2 replies; 27+ messages in thread
From: Ondrej Kozina @ 2014-04-01 16:27 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: device-mapper development, Mikulas Patocka

On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> From: Mikulas Patocka <mpatocka@redhat.com>
>
> Submitting write bios directly in the encryption thread caused serious
> performance degradation.  On a multiprocessor machine, encryption requests
> finish in a different order than they were submitted.  Consequently, write
> requests would be submitted in a different order and it could cause severe
> performance degradation.
 > (...)

Hi,

originally I planned to post the results of performance testing but 
unfortunately I crashed the test machine several times with the <in_subject> 
patch applied and onward:

The test set up looks following:

the base is kernel-3.14-rc8 with applied "[PATCH 2/9] block: use kmalloc 
alignment for bio slab"

On top of base I compiled various dm-crypt modules:

<no_patch>		= raw 3.14-rc8 dm-crypt module
<no_percpu>		= "[PATCH 1/9]"
<per_bio_data>		= <no_percpu> + "[PATCH 3/9]"
<unbound>		= <per_bio_data> + "[PATCH 4/9]"
<dont_allocate_wfix>	= <unbound> + "[PATCH 5/9]" + "[PATCH 6/9]"
<remove_io_pool>	= <dont_allocate_wfix> + "[PATCH 7/9]"
<offload>		= <remove_io_pool> + "[PATCH 8/9]"
<sort>			= <offload> + "[PATCH 9/9]"

All the modules above went through tests that are available here 
(originally written by Milan Broz; I have made only a few modifications):

http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz

All tests passed on single device tests with "test_disk" script

Testing md raid5 backing device ("test_md" script) fails in test 
2/dmcrypt-all-verify.sh. The failure occurs after it pushes data through 
dm-crypt device using "null" cipher:

dmsetup create crypt --table "0 </dev/md0 bsize> crypt 
cipher_null-ecb-null - 0 /dev/md0 0"

steps to reproduce is to load module with <offload> or <sort> dm-crypt 
modules:

create md raid5 over 3 devices:

1) mdadm -C -l 5 -n 3 -c 64 --assume-clean /dev/md0 /dev/sd[xyz]
2) run 2/dmcrypt-all-verify.sh

if CONFIG_DEBUG_VM is enabled, it ended with:

XFS (dm-3): Mounting Filesystem
[ 1278.005956] XFS (dm-3): Ending clean mount
[ 1346.330472] device-mapper: crypt: bio_add_page failed for page 10: 
the underlying device has stricter limits t
[ 1475.049400] page:ffffea0004272600 count:0 mapcount:0 mapping: 
   (null) index:0x2
[ 1475.057475] page flags: 0x2fffff80000000()
[ 1475.061619] ------------[ cut here ]------------
[ 1475.062313] kernel BUG at include/linux/mm.h:307!
[ 1475.062313] invalid opcode: 0000 [#1] SMP
[ 1475.062313] Modules linked in: crypto_null dm_crypt(F) raid456 
async_raid6_recov async_memcpy async_pq raid6_]
[ 1475.062313] CPU: 2 PID: 24851 Comm: dmsetup Tainted: GF 
3.14.0-rc8 #3
[ 1475.062313] Hardware name: Dell Computer Corporation PowerEdge 
2800/0C8306, BIOS A07 04/25/2008
[ 1475.062313] task: ffff8801197cea80 ti: ffff8801189a6000 task.ti: 
ffff8801189a6000
[ 1475.062313] RIP: 0010:[<ffffffff81167748>]  [<ffffffff81167748>] 
__free_pages+0x68/0x70
[ 1475.062313] RSP: 0018:ffff8801189a7c08  EFLAGS: 00010246
[ 1475.062313] RAX: 0000000000000000 RBX: ffffea0004272600 RCX: 
0000000000000000
[ 1475.062313] RDX: 0000000000000000 RSI: ffff88011fc8e6c8 RDI: 
000000000109c980
[ 1475.062313] RBP: ffff8801189a7c18 R08: 0000000000000086 R09: 
0000000000000425
[ 1475.062313] R10: 0000000000000424 R11: 0000000000000003 R12: 
ffff880118afcc70
[ 1475.062313] R13: ffff880118afcc68 R14: ffff8800d712b400 R15: 
0000000000000000
[ 1475.062313] FS:  00007f73b2f9f880(0000) GS:ffff88011fc80000(0000) 
knlGS:0000000000000000
[ 1475.062313] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1475.062313] CR2: 00007f238669b008 CR3: 0000000119288000 CR4: 
00000000000007e0
[ 1475.062313] Stack:
[ 1475.062313]  ffff880118afcc60 ffff880118afcc70 ffff8801189a7c28 
ffffffff811617ae
[ 1475.062313]  ffff8801189a7c50 ffffffff81161815 ffff8800d7129a00 
ffffc90010753040
[ 1475.062313]  ffff8800d712b400 ffff8801189a7c70 ffffffffa054101d 
0000000000000000
[ 1475.062313] Call Trace:
[ 1475.062313]  [<ffffffff811617ae>] mempool_free_pages+0xe/0x10
[ 1475.062313]  [<ffffffff81161815>] mempool_destroy+0x35/0x60
[ 1475.062313]  [<ffffffffa054101d>] crypt_dtr+0x7d/0xe0 [dm_crypt]
[ 1475.062313]  [<ffffffffa0004c38>] dm_table_destroy+0x68/0xf0 [dm_mod]
[ 1475.062313]  [<ffffffffa00020d4>] __dm_destroy+0xf4/0x260 [dm_mod]
[ 1475.062313]  [<ffffffffa0002fa3>] dm_destroy+0x13/0x20 [dm_mod]
[ 1475.062313]  [<ffffffffa0008a3e>] dev_remove+0x11e/0x180 [dm_mod]
[ 1475.062313]  [<ffffffffa0008920>] ? dev_suspend+0x250/0x250 [dm_mod]
[ 1475.062313]  [<ffffffffa0009115>] ctl_ioctl+0x255/0x500 [dm_mod]
[ 1475.062313]  [<ffffffffa00093d3>] dm_ctl_ioctl+0x13/0x20 [dm_mod]
[ 1475.062313]  [<ffffffff811e9ac0>] do_vfs_ioctl+0x2e0/0x4a0
[ 1475.062313]  [<ffffffff8127fe96>] ? file_has_perm+0xa6/0xb0
[ 1475.062313]  [<ffffffff811e9d01>] SyS_ioctl+0x81/0xa0
[ 1475.062313]  [<ffffffff81639369>] system_call_fastpath+0x16/0x1b
[ 1475.062313] Code: de 44 89 e6 48 89 df e8 f7 f2 ff ff 5b 41 5c 5d c3 
66 90 48 89 df 31 f6 e8 c6 fd ff ff 5b 4
[ 1475.062313] RIP  [<ffffffff81167748>] __free_pages+0x68/0x70
[ 1475.062313]  RSP <ffff8801189a7c08>
[ 1475.377375] ---[ end trace 9960ec06a734b827 ]---
[ 1475.382019] Kernel panic - not syncing: Fatal exception
[ 1475.383013] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation 
range: 0xffffffff80000000-0xffffffff9fffff)
[ 1475.383013] drm_kms_helper: panic occurred, switching back to text 
console
[ 1475.404333] ------------[ cut here ]------------
[ 1475.405330] WARNING: CPU: 2 PID: 24851 at arch/x86/kernel/smp.c:124 
native_smp_send_reschedule+0x5d/0x60()
[ 1475.405330] Modules linked in: crypto_null dm_crypt(F) raid456 
async_raid6_recov async_memcpy async_pq raid6_]
[ 1475.405330] CPU: 2 PID: 24851 Comm: dmsetup Tainted: GF     D 
3.14.0-rc8 #3
[ 1475.405330] Hardware name: Dell Computer Corporation PowerEdge 
2800/0C8306, BIOS A07 04/25/2008
[ 1475.405330]  0000000000000000 0000000021407093 ffff88011fc83d90 
ffffffff816281f1
[ 1475.405330]  0000000000000000 ffff88011fc83dc8 ffffffff8106fc7d 
0000000000000000
[ 1475.405330]  ffff88011fc146c0 0000000000000002 0000000000000002 
000000000000e5c8
[ 1475.405330] Call Trace:
[ 1475.405330]  <IRQ>  [<ffffffff816281f1>] dump_stack+0x45/0x56
[ 1475.405330]  [<ffffffff8106fc7d>] warn_slowpath_common+0x7d/0xa0
[ 1475.405330]  [<ffffffff8106fdaa>] warn_slowpath_null+0x1a/0x20
[ 1475.405330]  [<ffffffff81047add>] native_smp_send_reschedule+0x5d/0x60
[ 1475.405330]  [<ffffffff810b4634>] trigger_load_balance+0x144/0x1b0
[ 1475.405330]  [<ffffffff810a519f>] scheduler_tick+0x9f/0xe0
[ 1475.405330]  [<ffffffff8107ef00>] update_process_times+0x60/0x70
[ 1475.405330]  [<ffffffff810e2375>] tick_sched_handle.isra.17+0x25/0x60
[ 1475.405330]  [<ffffffff810e23f1>] tick_sched_timer+0x41/0x60
[ 1475.405330]  [<ffffffff810984f7>] __run_hrtimer+0x77/0x1d0
[ 1475.405330]  [<ffffffff810e23b0>] ? tick_sched_handle.isra.17+0x60/0x60
[ 1475.405330]  [<ffffffff81098d37>] hrtimer_interrupt+0xf7/0x240
[ 1475.405330]  [<ffffffff8104ab07>] local_apic_timer_interrupt+0x37/0x60
[ 1475.405330]  [<ffffffff8163b66f>] smp_apic_timer_interrupt+0x3f/0x60
[ 1475.405330]  [<ffffffff81639fdd>] apic_timer_interrupt+0x6d/0x80
[ 1475.405330]  <EOI>  [<ffffffff81621b45>] ? panic+0x1a6/0x1e7
[ 1475.405330]  [<ffffffff8163120b>] oops_end+0x12b/0x150
[ 1475.405330]  [<ffffffff810192fb>] die+0x4b/0x70
[ 1475.405330]  [<ffffffff816308e0>] do_trap+0x60/0x170
[ 1475.405330]  [<ffffffff810161c4>] do_invalid_op+0xb4/0x130
[ 1475.405330]  [<ffffffff81167748>] ? __free_pages+0x68/0x70
[ 1475.405330]  [<ffffffff810cabdd>] ? vprintk_emit+0x1bd/0x510
[ 1475.405330]  [<ffffffff8163aa9e>] invalid_op+0x1e/0x30
[ 1475.405330]  [<ffffffff81167748>] ? __free_pages+0x68/0x70
[ 1475.405330]  [<ffffffff81167748>] ? __free_pages+0x68/0x70
[ 1475.405330]  [<ffffffff811617ae>] mempool_free_pages+0xe/0x10
[ 1475.405330]  [<ffffffff81161815>] mempool_destroy+0x35/0x60
[ 1475.405330]  [<ffffffffa054101d>] crypt_dtr+0x7d/0xe0 [dm_crypt]
[ 1475.405330]  [<ffffffffa0004c38>] dm_table_destroy+0x68/0xf0 [dm_mod]
[ 1475.405330]  [<ffffffffa00020d4>] __dm_destroy+0xf4/0x260 [dm_mod]
[ 1475.405330]  [<ffffffffa0002fa3>] dm_destroy+0x13/0x20 [dm_mod]
[ 1475.405330]  [<ffffffffa0008a3e>] dev_remove+0x11e/0x180 [dm_mod]
[ 1475.405330]  [<ffffffffa0008920>] ? dev_suspend+0x250/0x250 [dm_mod]
[ 1475.405330]  [<ffffffffa0009115>] ctl_ioctl+0x255/0x500 [dm_mod]
[ 1475.405330]  [<ffffffffa00093d3>] dm_ctl_ioctl+0x13/0x20 [dm_mod]
[ 1475.405330]  [<ffffffff811e9ac0>] do_vfs_ioctl+0x2e0/0x4a0
[ 1475.405330]  [<ffffffff8127fe96>] ? file_has_perm+0xa6/0xb0
[ 1475.405330]  [<ffffffff811e9d01>] SyS_ioctl+0x81/0xa0
[ 1475.405330]  [<ffffffff81639369>] system_call_fastpath+0x16/0x1b
[ 1475.405330] ---[ end trace 9960ec06a734b828 ]---

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 16:27   ` Ondrej Kozina
@ 2014-04-01 16:32     ` Mike Snitzer
  2014-04-01 18:15       ` Mikulas Patocka
  2014-04-01 18:01     ` Mikulas Patocka
  1 sibling, 1 reply; 27+ messages in thread
From: Mike Snitzer @ 2014-04-01 16:32 UTC (permalink / raw)
  To: Ondrej Kozina; +Cc: device-mapper development, Mikulas Patocka

On Tue, Apr 01 2014 at 12:27pm -0400,
Ondrej Kozina <okozina@redhat.com> wrote:

> On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> >From: Mikulas Patocka <mpatocka@redhat.com>
> >
> >Submitting write bios directly in the encryption thread caused serious
> >performance degradation.  On a multiprocessor machine, encryption requests
> >finish in a different order than they were submitted.  Consequently, write
> >requests would be submitted in a different order and it could cause severe
> >performance degradation.
> > (...)
> 
> Hi,
> 
> originally I planed to post result of performance testing but
> unfortunately I crashed the test machine several times with
> <in_subject> patch applied and onward:
> 
> The test set up looks following:
> 
> the base is kernel-3.14-rc8 with applied "[PATCH 2/9] block: use
> kmalloc alignment for bio slab"
> 
> On top of base I compiled various dm-crypt modules:
> 
> <no_patch>		= raw 3.14-rc8 dm-crypt module
> <no_percpu>		= "[PATCH 1/9]"
> <per_bio_data>		= <no_percpu> + "[PATCH 3/9]"
> <unbound>		= <per_bio_data> + "[PATCH 4/9]"
> <dont_allocate_wfix>	= <unbound> + "[PATCH 5/9]" + "[PATCH 6/9]"
> <remove_io_pool>	= <dont_allocate_wfix> + "[PATCH 7/9]"
> <offload>		= <remove_io_pool> + "[PATCH 8/9]"
> <sort>			= <offload> + "[PATCH 9/9]"

Wonder if it's worthwhile to rebase to final v3.14... could this
bio_add_page/mm crash be a function of something that was fixed late in the
rc cycle?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-01  1:01         ` Mike Snitzer
@ 2014-04-01 17:35           ` Milan Broz
  2014-04-01 20:15             ` Alasdair G Kergon
  2014-04-01 23:21           ` Akira Hayakawa
  1 sibling, 1 reply; 27+ messages in thread
From: Milan Broz @ 2014-04-01 17:35 UTC (permalink / raw)
  To: device-mapper development

On 04/01/2014 03:01 AM, Mike Snitzer wrote:
> OK.  FYI, I cleaned the code up a little today based on Milan's dislike
> for the #define within the function, see:
> https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next&id=039e943277604c3f4f6b4ee803b8793627d53d36

Thanks.

Btw if you don't like sysfs option for disabling this, it can be optional table parameter,
there is already discard support implemented this way.

(I could add this myself later... just need to find at least few hours
of free time:)

Milan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 16:27   ` Ondrej Kozina
  2014-04-01 16:32     ` Mike Snitzer
@ 2014-04-01 18:01     ` Mikulas Patocka
  2014-04-01 19:08       ` Milan Broz
  1 sibling, 1 reply; 27+ messages in thread
From: Mikulas Patocka @ 2014-04-01 18:01 UTC (permalink / raw)
  To: Ondrej Kozina; +Cc: Mike Snitzer, device-mapper development

[-- Attachment #1: Type: TEXT/PLAIN, Size: 7078 bytes --]



On Tue, 1 Apr 2014, Ondrej Kozina wrote:

> On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> > From: Mikulas Patocka <mpatocka@redhat.com>
> > 
> > Submitting write bios directly in the encryption thread caused serious
> > performance degradation.  On a multiprocessor machine, encryption requests
> > finish in a different order than they were submitted.  Consequently, write
> > requests would be submitted in a different order and it could cause severe
> > performance degradation.
> > (...)
> 
> Hi,
> 
> originally I planed to post result of performance testing but unfortunately I
> crashed the test machine several times with <in_subject> patch applied and
> onward:
> 
> The test set up looks following:
> 
> the base is kernel-3.14-rc8 with applied "[PATCH 2/9] block: use kmalloc
> alignment for bio slab"
> 
> On top of base I compiled various dm-crypt modules:
> 
> <no_patch>		= raw 3.14-rc8 dm-crypt module
> <no_percpu>		= "[PATCH 1/9]"
> <per_bio_data>		= <no_percpu> + "[PATCH 3/9]"
> <unbound>		= <per_bio_data> + "[PATCH 4/9]"
> <dont_allocate_wfix>	= <unbound> + "[PATCH 5/9]" + "[PATCH 6/9]"
> <remove_io_pool>	= <dont_allocate_wfix> + "[PATCH 7/9]"
> <offload>		= <remove_io_pool> + "[PATCH 8/9]"
> <sort>			= <offload> + "[PATCH 9/9]"
> 
> All the modules above went through tests that are available here (originally
> write by Milan Broz, I have made only few modifications):
> 
> http://okozina.fedorapeople.org/dmcrypt-tests.tar.xz
> 
> All tests passed on single device tests with "test_disk" script
> 
> Testing md raid5 backing device ("test_md" script) fails in test
> 2/dmcrypt-all-verify.sh. The failure occurs after it pushes data through
> dm-crypt device using "null" cipher:
> 
> dmsetup create crypt --table "0 </dev/md0 bsize> crypt cipher_null-ecb-null -
> 0 /dev/md0 0"
> 
> steps to reproduce is to load module with <offload> or <sort> dm-crypt
> modules:
> 
> create md raid5 over 3 devices:
> 
> 1) mdadm -C -l 5 -n 3 -c 64 --assume-clean /dev/md0 /dev/sd[xyz]
> 2) run 2/dmcrypt-all-verify.sh

I tried to run "./dmcrypt-all-verify.sh --dev /dev/md0 --testdir 
/mnt/test" and the result was this.

That script is somehow buggy because it attempts to copy files that I have 
in /usr/src to the test filesystem. It fills up the test filesystem and 
fails. I don't know why it is trying to copy files not related to the test.

cp: nelze vytvořit obyčejný soubor ,,/mnt/test/tst/usr/src/patch-3.5.xz": 
Na zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/rpm" nelze vytvořit: Na zařízení není 
volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.5.2.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/cryptsetup-1.6.3.tar.bz2": Na zařízení není volné 
místo
cp: nelze vytvořit obyčejný soubor ,,/mnt/test/tst/usr/src/patch-3.12.xz": 
Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/dm-raid5-release.patch": Na zařízení není volné 
místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.14-rc6" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-2.6.39" nelze vytvořit: Na 
zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/openssh-6.1p1.tar.gz": Na zařízení není volné 
místo
cp: adresář ,,/mnt/test/tst/usr/src/LVM2.2.02.98-test" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.10" nelze vytvořit: Na 
zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.14-rc4.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.14-rc8.xz": Na zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/gcc-4.7.3-build64" nelze vytvořit: Na 
zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/binutils-2.23.2.tar.bz2": Na zařízení není volné 
místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.2.7" nelze vytvořit: Na 
zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/fio-2.0.15.tar.bzip.bz2": Na zařízení není volné 
místo
cp: adresář ,,/mnt/test/tst/usr/src/fio-2.0.15" nelze vytvořit: Na 
zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/gcc-4.7.3.tar.bz2": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor ,,/mnt/test/tst/usr/src/patch-3.9.xz": 
Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor ,,/mnt/test/tst/usr/src/lvm.tar.gz": Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/gcc-4.6.3-build-arm" nelze vytvořit: 
Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.9.2.xz": Na zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.14" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.9.3-no-quota" nelze vytvořit: 
Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.9.3-4.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.9.5-6.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.9.7-8.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.10.4.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.10.8.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.10.8-9.xz": Na zařízení není volné místo
cp: nelze vytvořit obyčejný soubor 
,,/mnt/test/tst/usr/src/patch-3.11.3.xz": Na zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.3" nelze vytvořit: Na zařízení 
není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.5.4" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.7.2" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.9.4" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/src/linux-3.9.8" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/local" nelze vytvořit: Na zařízení není 
volné místo
cp: adresář ,,/mnt/test/tst/usr/hppa64-linux-gnu" nelze vytvořit: Na 
zařízení není volné místo
cp: adresář ,,/mnt/test/tst/usr/X11R6" nelze vytvořit: Na zařízení není 
volné místo
cp: adresář ,,/mnt/test/tst/usr/hppa64-linux" nelze vytvořit: Na zařízení 
není volné místo
cp: adresář ,,/mnt/test/tst/usr/man" nelze vytvořit: Na zařízení není 
volné místo
cp: nelze vytvořit obyčejný soubor ,,/mnt/test/tst/usr/patch-3.9-rc7.xz": 
Na zařízení není volné místo
## 0 STACKTRACE() called from ./dmcrypt-all-verify.sh:116
## 1 c_usr_verify() called from ./dmcrypt-all-verify.sh:167
## 2 run_test() called from ./dmcrypt-all-verify.sh:256

Mikulas

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 16:32     ` Mike Snitzer
@ 2014-04-01 18:15       ` Mikulas Patocka
  2014-04-01 18:23         ` Mike Snitzer
  2014-04-02  6:55         ` Ondrej Kozina
  0 siblings, 2 replies; 27+ messages in thread
From: Mikulas Patocka @ 2014-04-01 18:15 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: Ondrej Kozina, device-mapper development



On Tue, 1 Apr 2014, Mike Snitzer wrote:

> On Tue, Apr 01 2014 at 12:27pm -0400,
> Ondrej Kozina <okozina@redhat.com> wrote:
> 
> > On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> > >From: Mikulas Patocka <mpatocka@redhat.com>
> > >
> > >Submitting write bios directly in the encryption thread caused serious
> > >performance degradation.  On a multiprocessor machine, encryption requests
> > >finish in a different order than they were submitted.  Consequently, write
> > >requests would be submitted in a different order and it could cause severe
> > >performance degradation.
> > > (...)
> > 
> > Hi,
> > 
> > originally I planed to post result of performance testing but
> > unfortunately I crashed the test machine several times with
> > <in_subject> patch applied and onward:
> > 
> > The test set up looks following:
> > 
> > the base is kernel-3.14-rc8 with applied "[PATCH 2/9] block: use
> > kmalloc alignment for bio slab"
> > 
> > On top of base I compiled various dm-crypt modules:
> > 
> > <no_patch>		= raw 3.14-rc8 dm-crypt module
> > <no_percpu>		= "[PATCH 1/9]"
> > <per_bio_data>		= <no_percpu> + "[PATCH 3/9]"
> > <unbound>		= <per_bio_data> + "[PATCH 4/9]"
> > <dont_allocate_wfix>	= <unbound> + "[PATCH 5/9]" + "[PATCH 6/9]"
> > <remove_io_pool>	= <dont_allocate_wfix> + "[PATCH 7/9]"
> > <offload>		= <remove_io_pool> + "[PATCH 8/9]"
> > <sort>			= <offload> + "[PATCH 9/9]"
> 
> Wonder if it worthwhile to rebase to final v3.14... could this
> bio_add_page/mm crash be a function of something since fixed late in the
> rc cycle?

... or try it without XFS. XFS corrupts memory on I/O error 
(https://bugzilla.redhat.com/show_bug.cgi?id=924301), so it may be that.

Can the bug be reproduced if you modify the test to use ext2, ext3 or ext4 
instead of xfs?

Mikulas

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 18:15       ` Mikulas Patocka
@ 2014-04-01 18:23         ` Mike Snitzer
  2014-04-02  6:55         ` Ondrej Kozina
  1 sibling, 0 replies; 27+ messages in thread
From: Mike Snitzer @ 2014-04-01 18:23 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Ondrej Kozina, device-mapper development

On Tue, Apr 01 2014 at  2:15pm -0400,
Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Tue, 1 Apr 2014, Mike Snitzer wrote:
> 
> > On Tue, Apr 01 2014 at 12:27pm -0400,
> > Ondrej Kozina <okozina@redhat.com> wrote:
> > 
> > > On 03/28/2014 09:11 PM, Mike Snitzer wrote:
> > > >From: Mikulas Patocka <mpatocka@redhat.com>
> > > >
> > > >Submitting write bios directly in the encryption thread caused serious
> > > >performance degradation.  On a multiprocessor machine, encryption requests
> > > >finish in a different order than they were submitted.  Consequently, write
> > > >requests would be submitted in a different order and it could cause severe
> > > >performance degradation.
> > > > (...)
> > > 
> > > Hi,
> > > 
> > > originally I planed to post result of performance testing but
> > > unfortunately I crashed the test machine several times with
> > > <in_subject> patch applied and onward:
> > > 
> > > The test set up looks following:
> > > 
> > > the base is kernel-3.14-rc8 with applied "[PATCH 2/9] block: use
> > > kmalloc alignment for bio slab"
> > > 
> > > On top of base I compiled various dm-crypt modules:
> > > 
> > > <no_patch>		= raw 3.14-rc8 dm-crypt module
> > > <no_percpu>		= "[PATCH 1/9]"
> > > <per_bio_data>		= <no_percpu> + "[PATCH 3/9]"
> > > <unbound>		= <per_bio_data> + "[PATCH 4/9]"
> > > <dont_allocate_wfix>	= <unbound> + "[PATCH 5/9]" + "[PATCH 6/9]"
> > > <remove_io_pool>	= <dont_allocate_wfix> + "[PATCH 7/9]"
> > > <offload>		= <remove_io_pool> + "[PATCH 8/9]"
> > > <sort>			= <offload> + "[PATCH 9/9]"
> > 
> > Wonder if it worthwhile to rebase to final v3.14... could this
> > bio_add_page/mm crash be a function of something since fixed late in the
> > rc cycle?
> 
> ... or try it without XFS. XFS corrupts memory on I/O error 
> (https://bugzilla.redhat.com/show_bug.cgi?id=924301), so it may be that.
> 
> Can the bug be reproduced if you modify the test to use ext2, ext3 or ext4 
> instead of xfs?

Or apply the patch that fixes the XFS use after free:
http://marc.info/?l=linux-xfs&m=139545654311177&q=raw

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 18:01     ` Mikulas Patocka
@ 2014-04-01 19:08       ` Milan Broz
  0 siblings, 0 replies; 27+ messages in thread
From: Milan Broz @ 2014-04-01 19:08 UTC (permalink / raw)
  To: device-mapper development, Ondrej Kozina; +Cc: Mike Snitzer

On 04/01/2014 08:01 PM, Mikulas Patocka wrote:
>> 1) mdadm -C -l 5 -n 3 -c 64 --assume-clean /dev/md0 /dev/sd[xyz]
>> 2) run 2/dmcrypt-all-verify.sh
> 
> I tried to run "./dmcrypt-all-verify.sh --dev /dev/md0 --testdir 
> /mnt/test" and the result was this.
> 
> That script is somehow buggy because it attempts to copy files that I have 
> in /usr/src to the test filesystem. It fills up the test filesystem and 
> fails. I don't know why is it trying to copy files not related to the test.

Hi Mikulas,

if it is the test I wrote years ago, it was in principle a hack;
I needed some filesystem with a lot of files of different sizes.

So I simply packed /usr and checked hash of all files before
and after unpacking to test dmcrypt device.

IMHO there should be some prepared archive which is just unpacked,
not that it uses live /usr.
(Obviously it needs a large enough device :-)

But anyway, despite being quite a stupid test, it has already proved
to be very useful (I remember it discovered bugs in various patches
several times).

And IIRC it used several fs types and several encryption types
(including null encryption which removes crypto layer influence,
testing pure dmcrypt bio handling).

Milan

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-01 17:35           ` Milan Broz
@ 2014-04-01 20:15             ` Alasdair G Kergon
  0 siblings, 0 replies; 27+ messages in thread
From: Alasdair G Kergon @ 2014-04-01 20:15 UTC (permalink / raw)
  To: Milan Broz; +Cc: device-mapper development

On Tue, Apr 01, 2014 at 07:35:28PM +0200, Milan Broz wrote:
> Btw if you don't like sysfs option for disabling this, it can be optional table parameter,
> there is already discard support implemented this way.
> (I could add this myself later... just need to find at least few hours
> of free time:)
 
But apart from using that option for benchmarking purposes, is there any reason
anyone would need to turn this off?

Do we know of any circumstances where the sorting causes a non-negligible
performance hit?

Alasdair

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-01  1:01         ` Mike Snitzer
  2014-04-01 17:35           ` Milan Broz
@ 2014-04-01 23:21           ` Akira Hayakawa
  2014-04-02  3:19             ` Akira Hayakawa
  1 sibling, 1 reply; 27+ messages in thread
From: Akira Hayakawa @ 2014-04-01 23:21 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: masami.hiramatsu.pt, dm-devel

> OK.  FYI, I cleaned the code up a little today based on Milan's dislike
> for the #define within the function, see:
> https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next&id=039e943277604c3f4f6b4ee803b8793627d53d36

Thanks Mike, I did it last night (feature/sorted-writeback branch)
https://github.com/akiradeveloper/dm-writeboost/tree/feature/sorted-writeback/src

The codes in dm-crypt and dm-thin really helped.

Confirmed actually sorted.

Outline:
- In dm-writeboost.h
-- struct migrate_io is defined
- In dm-writeboost-daemon.c
-- prepare_migrate_ios() adds all migrate I/Os into the tree (add_migrate_io() is called for each)
-- submit_migrate_ios() takes them out one by one in sorted order of pair (address, id) (compare_migrate_io() is the comparator)

--
Akira

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-01 23:21           ` Akira Hayakawa
@ 2014-04-02  3:19             ` Akira Hayakawa
  2014-04-02  3:38               ` Mike Snitzer
  0 siblings, 1 reply; 27+ messages in thread
From: Akira Hayakawa @ 2014-04-02  3:19 UTC (permalink / raw)
  To: dm-devel; +Cc: masami.hiramatsu.pt

But, I myself found that what I really need in my case is not the
RB tree but a simple list sorting (list_sort()).

Will do sorting anyway.

Enabling/disabling sorting in my case is really simple if I use list_sort()
and will do. The default should be ON because I also don't see any reason
to turn it off except benchmarking reason.

--
Akira

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-02  3:19             ` Akira Hayakawa
@ 2014-04-02  3:38               ` Mike Snitzer
  2014-04-02  4:18                 ` Akira Hayakawa
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Snitzer @ 2014-04-02  3:38 UTC (permalink / raw)
  To: Akira Hayakawa; +Cc: masami.hiramatsu.pt, dm-devel

On Tue, Apr 01 2014 at 11:19pm -0400,
Akira Hayakawa <hayakawa@valinux.co.jp> wrote:

> But, I myself found that what I really need in my case is not the
> RB tree but a simple list sorting (list_sort()).
> 
> Will do sorting anyway.
> 
> Enabling/disabling sorting in my case is really simple if I use list_sort()
> and will do. The default should be ON because I also don't see any reason
> to turn it off except benchmarking reason.

list_sort() uses merge sort, which has O(nlog(n)) complexity;
list_sort() also suffers from "list passed to list_sort() too long for
efficiency".  But in practice I'm not sure how long a list needs to be
to hit that case.

Whereas an rb-tree has O(log(n)) complexity and is efficient for
traversal, it also doesn't have the length limits.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 9/9] dm crypt: sort writes
  2014-04-02  3:38               ` Mike Snitzer
@ 2014-04-02  4:18                 ` Akira Hayakawa
  0 siblings, 0 replies; 27+ messages in thread
From: Akira Hayakawa @ 2014-04-02  4:18 UTC (permalink / raw)
  To: dm-devel

> list_sort() uses merge sort, which has O(nlog(n)) complexity;
> list_sort() also suffers from "list passed to list_sort() too long for
> efficiency".  But in practice I'm not sure how long a list needs to be
> to hit that case.
> 
> Whereas an rb-tree has O(log(n)) complexity and is efficient for
> traversal, it also doesn't have the length limits.

I see. OK, I will remain on rb-tree.

Btw, if I want to disable sorting
1. Always attach new node to the left
2. Eliminate rb_insert_color()
is enough?

-- 
Akira

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/9] dm crypt: offload writes to thread
  2014-04-01 18:15       ` Mikulas Patocka
  2014-04-01 18:23         ` Mike Snitzer
@ 2014-04-02  6:55         ` Ondrej Kozina
  1 sibling, 0 replies; 27+ messages in thread
From: Ondrej Kozina @ 2014-04-02  6:55 UTC (permalink / raw)
  To: device-mapper development, Mike Snitzer; +Cc: Mikulas Patocka

On 04/01/2014 08:15 PM, Mikulas Patocka wrote:
>
> ... or try it without XFS. XFS corrupts memory on I/O error
> (https://bugzilla.redhat.com/show_bug.cgi?id=924301), so it may be that.
>
> Can the bug be reproduced if you modify the test to use ext2, ext3 or ext4
> instead of xfs?

It can, I crashed the machine also with ext4, testing the <sort> 
dm-crypt module:

[51218.996332] EXT4-fs (dm-3): mounted filesystem with ordered data 
mode. Opts: barrier=1
[51256.820147] device-mapper: crypt: bio_add_page failed for page 14: 
the underlying device has stricter limits than dm-crypt target
[51256.831928] EXT4-fs warning (device dm-3): ext4_end_bio:317: I/O 
error writing to inode 1443581 (offset 8388608 size 1380352 starting 
block 112320)
[51256.845215] Buffer I/O error on device dm-3, logical block 112320
[51256.851450] Buffer I/O error on device dm-3, logical block 112321
[51256.857640] Buffer I/O error on device dm-3, logical block 112322
[51256.863845] Buffer I/O error on device dm-3, logical block 112323
[51256.870039] Buffer I/O error on device dm-3, logical block 112324
[51256.876197] Buffer I/O error on device dm-3, logical block 112325
[51256.882382] Buffer I/O error on device dm-3, logical block 112326
[51256.888528] Buffer I/O error on device dm-3, logical block 112327
[51256.894732] Buffer I/O error on device dm-3, logical block 112328
[51256.900909] Buffer I/O error on device dm-3, logical block 112329
[51307.584415] device-mapper: crypt: bio_add_page failed for page 8: the 
underlying device has stricter limits than dm-crypt target
[51307.596042] EXT4-fs warning (device dm-3): ext4_end_bio:317: I/O 
error writing to inode 1839894 (offset 0 size 8388608 starting block 194304)
[51307.599959] device-mapper: crypt: bio_add_page failed for page 3: the 
underlying device has stricter limits than dm-crypt target
[51307.599985] EXT4-fs warning (device dm-3): ext4_end_bio:317: I/O 
error writing to inode 1839894 (offset 8388608 size 6574080 starting 
block 196832)
[51307.599988] buffer_io_error: 6 callbacks suppressed
[51307.599991] Buffer I/O error on device dm-3, logical block 196832
[51307.600023] Buffer I/O error on device dm-3, logical block 196833
[51307.600026] Buffer I/O error on device dm-3, logical block 196834
[51307.600028] Buffer I/O error on device dm-3, logical block 196835
[51307.600031] Buffer I/O error on device dm-3, logical block 196836
[51307.600034] Buffer I/O error on device dm-3, logical block 196837
[51307.600036] Buffer I/O error on device dm-3, logical block 196838
[51307.600039] Buffer I/O error on device dm-3, logical block 196839
[51307.600041] Buffer I/O error on device dm-3, logical block 196840
[51307.600044] Buffer I/O error on device dm-3, logical block 196841
[51307.600088] device-mapper: crypt: bio_add_page failed for page 2: the 
underlying device has stricter limits than dm-crypt target
[51307.600096] EXT4-fs warning (device dm-3): ext4_end_bio:317: I/O 
error writing to inode 1839894 (offset 8388608 size 6574080 starting 
block 196848)
[51307.600144] device-mapper: crypt: bio_add_page failed for page 1: the 
underlying device has stricter limits than dm-crypt target
[51307.600149] EXT4-fs warning (device dm-3): ext4_end_bio:317: I/O 
error writing to inode 1839894 (offset 8388608 size 6574080 starting 
block 196864)
[51307.748969] device-mapper: crypt: bio_add_page failed for page 1: the 
underlying device has stricter limits than dm-crypt target
[51307.760717] page:ffffea000447a040 count:0 mapcount:0 mapping: 
   (null) index:0x2
[51307.768791] page flags: 0x2fffff80000000()
[51307.773039] ------------[ cut here ]------------
[51307.774020] kernel BUG at include/linux/mm.h:307!
[51307.774020] invalid opcode: 0000 [#1] SMP
[51307.774020] Modules linked in: dm_crypt(F) ext4 mbcache jbd2 
crypto_null cfg80211 sg rfkill iTCO_wdt iTCO_vendor_support ppdev 
raid456 async_raid6_recov async_memcpy async_pq dcdbas raid6_pq 
async_xor xor async_tx pcspkr lpc_ich serio_raw mfd_core nfsd e1000 
e752x_edac edac_core ipmi_si ipmi_msghandler shpchp parport_pc video 
parport auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sd_mod crc_t10dif 
crct10dif_common radeon sr_mod cdrom ata_generic pata_acpi i2c_algo_bit 
drm_kms_helper ttm drm ata_piix libata mptspi scsi_transport_spi 
mptscsih mptbase i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod 
[last unloaded: dm_crypt]
[51307.812279] CPU: 1 PID: 296 Comm: kworker/u16:2 Tainted: GF 
   3.14.0-rc8 #3
[51307.812279] Hardware name: Dell Computer Corporation PowerEdge 
2800/0C8306, BIOS A07 04/25/2008
[51307.812279] Workqueue: kcryptd kcryptd_crypt [dm_crypt]
[51307.812279] task: ffff880034e76a80 ti: ffff880035058000 task.ti: 
ffff880035058000
[51307.812279] RIP: 0010:[<ffffffff81167748>]  [<ffffffff81167748>] 
__free_pages+0x68/0x70
[51307.812279] RSP: 0018:ffff880035059d08  EFLAGS: 00010246
[51307.812279] RAX: 0000000000000000 RBX: ffffea000447a040 RCX: 
0000000000000000
[51307.812279] RDX: 0000000000000000 RSI: ffff88011fc4e6c8 RDI: 
000000000111e810
[51307.812279] RBP: ffff880035059d18 R08: 0000000000000086 R09: 
0000000000000415
[51307.812279] R10: 0000000000000414 R11: 0000000000000003 R12: 
ffffea000447a040
[51307.812279] R13: ffff880090911870 R14: ffff8800c2dfdb18 R15: 
0000000000000000
[51307.812279] FS:  0000000000000000(0000) GS:ffff88011fc40000(0000) 
knlGS:0000000000000000
[51307.812279] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[51307.812279] CR2: 00007fed2e62e000 CR3: 000000000b472000 CR4: 
00000000000007e0
[51307.812279] Stack:
[51307.812279]  ffff8800da671720 ffffea000447a040 ffff880035059d28 
ffffffff811617ae
[51307.812279]  ffff880035059d50 ffffffff81161a29 ffff880090911920 
0000000000000002
[51307.812279]  ffff880090911870 ffff880035059d80 ffffffffa04dec48 
ffff880090911800
[51307.812279] Call Trace:
[51307.812279]  [<ffffffff811617ae>] mempool_free_pages+0xe/0x10
[51307.812279]  [<ffffffff81161a29>] mempool_free+0x49/0x90
[51307.812279]  [<ffffffffa04dec48>] 
crypt_free_buffer_pages.isra.12+0x48/0x70 [dm_crypt]
[51307.812279]  [<ffffffffa04e0874>] kcryptd_crypt+0x224/0x3d0 [dm_crypt]
[51307.812279]  [<ffffffff8108d1eb>] process_one_work+0x17b/0x460
[51307.812279]  [<ffffffff8108dfbb>] worker_thread+0x11b/0x400
[51307.812279]  [<ffffffff8108dea0>] ? rescuer_thread+0x400/0x400
[51307.812279]  [<ffffffff81095001>] kthread+0xe1/0x100
[51307.812279]  [<ffffffff81094f20>] ? kthread_create_on_node+0x1a0/0x1a0
[51307.812279]  [<ffffffff816392bc>] ret_from_fork+0x7c/0xb0
[51307.812279]  [<ffffffff81094f20>] ? kthread_create_on_node+0x1a0/0x1a0
[51307.812279] Code: de 44 89 e6 48 89 df e8 f7 f2 ff ff 5b 41 5c 5d c3 
66 90 48 89 df 31 f6 e8 c6 fd ff ff 5b 41 5c 5d c3 31 d2 31 f6 e8 48 e5 
ff ff <0f> 0b 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb
[51307.812279] RIP  [<ffffffff81167748>] __free_pages+0x68/0x70
[51307.812279]  RSP <ffff880035059d08>
[51308.080891] ---[ end trace 12e540e796d165e2 ]---

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2014-04-02  6:55 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-28 20:11 [PATCH 0/9] dm crypt: improve cpu scalability Mike Snitzer
2014-03-28 20:11 ` [PATCH 1/9] dm crypt: fix cpu hotplug crash by removing per-cpu structure Mike Snitzer
2014-03-28 20:11 ` [PATCH 2/9] block: use kmalloc alignment for bio slab Mike Snitzer
2014-03-28 20:11 ` [PATCH 3/9] dm crypt: use per-bio data Mike Snitzer
2014-03-28 20:11 ` [PATCH 4/9] dm crypt: use unbound workqueue for request processing Mike Snitzer
2014-03-28 20:11 ` [PATCH 5/9] dm crypt: don't allocate pages for a partial request Mike Snitzer
2014-03-28 20:11 ` [PATCH 6/9] dm crypt: avoid deadlock in mempools Mike Snitzer
2014-03-28 20:11 ` [PATCH 7/9] dm crypt: remove io_pool Mike Snitzer
2014-03-28 20:11 ` [PATCH 8/9] dm crypt: offload writes to thread Mike Snitzer
2014-04-01 16:27   ` Ondrej Kozina
2014-04-01 16:32     ` Mike Snitzer
2014-04-01 18:15       ` Mikulas Patocka
2014-04-01 18:23         ` Mike Snitzer
2014-04-02  6:55         ` Ondrej Kozina
2014-04-01 18:01     ` Mikulas Patocka
2014-04-01 19:08       ` Milan Broz
2014-03-28 20:11 ` [PATCH 9/9] dm crypt: sort writes Mike Snitzer
2014-03-29  8:11   ` Milan Broz
2014-03-31 12:39     ` Mike Snitzer
2014-03-31 23:37       ` Akira Hayakawa
2014-04-01  1:01         ` Mike Snitzer
2014-04-01 17:35           ` Milan Broz
2014-04-01 20:15             ` Alasdair G Kergon
2014-04-01 23:21           ` Akira Hayakawa
2014-04-02  3:19             ` Akira Hayakawa
2014-04-02  3:38               ` Mike Snitzer
2014-04-02  4:18                 ` Akira Hayakawa

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.