* [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Andi Kleen @ 2010-05-31 16:04 UTC (permalink / raw)
  To: herbert, dm-devel, linux-kernel, agk, ak

DM-CRYPT: Scale to multiple CPUs

Currently dm-crypt does all encryption work per dmcrypt mapping in a
single workqueue. This does not scale well when multiple CPUs
are submitting IO at a high rate. The single CPU running the single
thread cannot keep up with the encryption and encrypted IO performance
tanks.

This patch changes the crypto workqueue to be per CPU. This means
that as long as the IO submitters (or the interrupt target CPUs
for reads) run on different CPUs, the encryption work will also
run in parallel.

To avoid a bottleneck on the IO worker I also changed those to be
per CPU threads.

There is still some shared data, so I suspect some bouncing
cache lines. But I haven't done a detailed study on that yet.

All the threads are global, shared by all dm-crypt mappings, rather
than per mapping. That avoids a thread explosion on systems with a
large number of CPUs and a larger number of dm-crypt mappings. The
code takes care to avoid problems with nested mappings.
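
In outline, the per-CPU pattern looks like this (a simplified sketch of
what the diff below implements; error handling and the ESSIV state are
omitted):

	/* one instance per CPU, shared by all dm-crypt mappings */
	struct crypt_cpu {
		struct ablkcipher_request *req;
		struct crypto_ablkcipher *tfm;
	};

	/* constructor: cc->cpu = alloc_percpu(struct crypt_cpu); */

	/* IO path: pin the current CPU and use its private state */
	int cpu = get_cpu();
	struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
	/* ... use cs->tfm and cs->req ... */
	put_cpu();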
    
Signed-off-by: Andi Kleen <ak@linux.intel.com>

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 3bdbb61..fb95896 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,6 +18,7 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
+#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -77,11 +78,15 @@ struct crypt_iv_operations {
 };
 
 struct iv_essiv_private {
-	struct crypto_cipher *tfm;
 	struct crypto_hash *hash_tfm;
 	u8 *salt;
 };
 
+/* Duplicated per CPU state for cipher */
+struct iv_essiv_private_cpu {
+	struct crypto_cipher *tfm;
+};
+
 struct iv_benbi_private {
 	int shift;
 };
@@ -91,6 +96,18 @@ struct iv_benbi_private {
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+
+/* Duplicated per CPU state for cipher */
+struct crypt_cpu {
+	struct ablkcipher_request *req;
+	struct crypto_ablkcipher *tfm;
+	struct iv_essiv_private_cpu ie;
+};
+
+/*
+ * The fields in here must be read only after initialization,
+ * changing state should be in crypt_cpu.
+ */
 struct crypt_config {
 	struct dm_dev *dev;
 	sector_t start;
@@ -103,10 +120,6 @@ struct crypt_config {
 	mempool_t *req_pool;
 	mempool_t *page_pool;
 	struct bio_set *bs;
-
-	struct workqueue_struct *io_queue;
-	struct workqueue_struct *crypt_queue;
-
 	/*
 	 * crypto related data
 	 */
@@ -120,6 +133,12 @@ struct crypt_config {
 	unsigned int iv_size;
 
 	/*
+	 * Duplicated per cpu state. Access through
+	 * per_cpu_ptr() only.
+	 */
+	struct crypt_cpu *cpu;
+
+	/*
 	 * Layout of each crypto request:
 	 *
 	 *   struct ablkcipher_request
@@ -133,25 +152,44 @@ struct crypt_config {
 	 * correctly aligned.
 	 */
 	unsigned int dmreq_start;
-	struct ablkcipher_request *req;
 
 	char cipher[CRYPTO_MAX_ALG_NAME];
 	char chainmode[CRYPTO_MAX_ALG_NAME];
-	struct crypto_ablkcipher *tfm;
 	unsigned long flags;
 	unsigned int key_size;
 	u8 key[0];
 };
 
+/* RED-PEN scale with number of cpus? */
 #define MIN_IOS        16
 #define MIN_POOL_PAGES 32
 #define MIN_BIO_PAGES  8
 
+/* Protect creation of a new crypt queue */
+static DEFINE_MUTEX(queue_setup_lock);
+static struct workqueue_struct *crypt_workqueue;
+static struct workqueue_struct *io_workqueue;
+static DEFINE_PER_CPU(struct task_struct *, io_wq_cpu);
+
 static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 
+static struct crypt_cpu *crypt_me(struct crypt_config *cc)
+{
+	return per_cpu_ptr(cc->cpu, smp_processor_id());
+}
+
+/* Use this for cipher attributes that are the same for all cpus */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+	struct crypto_ablkcipher *tfm;
+	/* cpu doesn't matter, output is always the same */
+	tfm = per_cpu_ptr(cc->cpu, raw_smp_processor_id())->tfm;
+	return tfm;
+}
+
 /*
  * Different IV generation algorithms:
  *
@@ -198,7 +236,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	struct hash_desc desc;
 	struct scatterlist sg;
-	int err;
+	int err, n, cpu;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
 	desc.tfm = essiv->hash_tfm;
@@ -208,8 +246,18 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	if (err)
 		return err;
 
-	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+
+		n = crypto_cipher_setkey(cs->ie.tfm, essiv->salt,
 				    crypto_hash_digestsize(essiv->hash_tfm));
+		if (n) {
+			err = n;
+			break;
+		}
+	}
+
+	return err;
 }
 
 /* Wipe salt and reset key derived from volume key */
@@ -217,24 +265,70 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+	int cpu, err, n;
 
 	memset(essiv->salt, 0, salt_size);
 
-	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+	err = 0;
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		n = crypto_cipher_setkey(cs->ie.tfm, essiv->salt, salt_size);
+		if (n)
+			err = n;
+	}
+	return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+					     struct dm_target *ti,
+					     u8 *salt, unsigned saltsize)
+{
+	struct crypto_cipher *essiv_tfm;
+	int err;
+
+	/* Setup the essiv_tfm with the given salt */
+	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(essiv_tfm)) {
+		ti->error = "Error allocating crypto tfm for ESSIV";
+		return essiv_tfm;
+	}
+
+	if (crypto_cipher_blocksize(essiv_tfm) !=
+	    crypto_ablkcipher_ivsize(any_tfm(cc))) {
+		ti->error = "Block size of ESSIV cipher does "
+			    "not match IV size of block cipher";
+		crypto_free_cipher(essiv_tfm);
+		return ERR_PTR(-EINVAL);
+	}
+	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+	if (err) {
+		ti->error = "Failed to set key for ESSIV cipher";
+		crypto_free_cipher(essiv_tfm);
+		return ERR_PTR(err);
+	}
+
+	return essiv_tfm;
 }
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
+	int cpu;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
-	crypto_free_cipher(essiv->tfm);
-	essiv->tfm = NULL;
-
 	crypto_free_hash(essiv->hash_tfm);
 	essiv->hash_tfm = NULL;
 
 	kzfree(essiv->salt);
 	essiv->salt = NULL;
+
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		if (cs->ie.tfm) {
+			crypto_free_cipher(cs->ie.tfm);
+			cs->ie.tfm = NULL;
+		}
+	}
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -244,6 +338,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	struct crypto_hash *hash_tfm = NULL;
 	u8 *salt = NULL;
 	int err;
+	int cpu;
 
 	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
@@ -265,30 +360,22 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		goto bad;
 	}
 
-	/* Allocate essiv_tfm */
-	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(essiv_tfm)) {
-		ti->error = "Error allocating crypto tfm for ESSIV";
-		err = PTR_ERR(essiv_tfm);
-		goto bad;
-	}
-	if (crypto_cipher_blocksize(essiv_tfm) !=
-	    crypto_ablkcipher_ivsize(cc->tfm)) {
-		ti->error = "Block size of ESSIV cipher does "
-			    "not match IV size of block cipher";
-		err = -EINVAL;
-		goto bad;
-	}
-
 	cc->iv_gen_private.essiv.salt = salt;
-	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
+	for_each_possible_cpu (cpu) {
+		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+					crypto_hash_digestsize(hash_tfm));
+		if (IS_ERR(essiv_tfm)) {
+			kfree(salt);
+			crypt_iv_essiv_dtr(cc);
+			return PTR_ERR(essiv_tfm);
+		}
+		per_cpu_ptr(cc->cpu, cpu)->ie.tfm = essiv_tfm;
+	}
 	return 0;
 
 bad:
-	if (essiv_tfm && !IS_ERR(essiv_tfm))
-		crypto_free_cipher(essiv_tfm);
 	if (hash_tfm && !IS_ERR(hash_tfm))
 		crypto_free_hash(hash_tfm);
 	kfree(salt);
@@ -299,14 +386,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
-	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
+	crypto_cipher_encrypt_one(crypt_me(cc)->ie.tfm, iv, iv);
 	return 0;
 }
 
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
+	unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
 	int log = ilog2(bs);
 
 	/* we need to calculate how far we must shift the sector count
@@ -415,7 +502,7 @@ static int crypt_convert_block(struct crypt_config *cc,
 
 	dmreq = dmreq_of_req(cc, req);
 	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
-			 crypto_ablkcipher_alignmask(cc->tfm) + 1);
+			 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
 
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
@@ -460,13 +547,14 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	if (!cc->req)
-		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
-	ablkcipher_request_set_tfm(cc->req, cc->tfm);
-	ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+	struct crypt_cpu *cs = crypt_me(cc);
+	if (!cs->req)
+		cs->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	ablkcipher_request_set_tfm(cs->req, cs->tfm);
+	ablkcipher_request_set_callback(cs->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
 					CRYPTO_TFM_REQ_MAY_SLEEP,
 					kcryptd_async_done,
-					dmreq_of_req(cc, cc->req));
+					dmreq_of_req(cc, cs->req));
 }
 
 /*
@@ -475,6 +563,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
+	struct crypt_cpu *cs = crypt_me(cc);
 	int r;
 
 	atomic_set(&ctx->pending, 1);
@@ -486,7 +575,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->pending);
 
-		r = crypt_convert_block(cc, ctx, cc->req);
+		r = crypt_convert_block(cc, ctx, cs->req);
 
 		switch (r) {
 		/* async */
@@ -495,7 +584,7 @@ static int crypt_convert(struct crypt_config *cc,
 			INIT_COMPLETION(ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			cc->req = NULL;
+			cs->req = NULL;
 			ctx->sector++;
 			continue;
 
@@ -654,6 +743,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * They must be separated as otherwise the final stages could be
  * starved by new requests which can block in the first stages due
  * to memory allocation.
+ *
+ * The work is done per CPU global for all dmcrypt instances.
+ * They should not depend on each other and do not block.
  */
 static void crypt_endio(struct bio *clone, int error)
 {
@@ -735,6 +827,7 @@ static void kcryptd_io(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
+	__get_cpu_var(io_wq_cpu) = current;
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_io_read(io);
 	else
@@ -743,10 +836,23 @@ static void kcryptd_io(struct work_struct *work)
 
 static void kcryptd_queue_io(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
+	int cpu;
+
+	/*
+	 * Since we only have a single worker per CPU in extreme
+	 * cases there might be nesting (dm-crypt on another dm-crypt)
+	 * To avoid deadlock run the work directly then.
+	 */
+	cpu = get_cpu();
+	if (per_cpu(io_wq_cpu, cpu) == current && !in_interrupt()) {
+		put_cpu();
+		kcryptd_io(&io->work);
+		return;
+	}
 
 	INIT_WORK(&io->work, kcryptd_io);
-	queue_work(cc->io_queue, &io->work);
+	queue_work(io_workqueue, &io->work);
+	put_cpu();
 }
 
 static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
@@ -927,10 +1033,8 @@ static void kcryptd_crypt(struct work_struct *work)
 
 static void kcryptd_queue_crypt(struct dm_crypt_io *io)
 {
-	struct crypt_config *cc = io->target->private;
-
 	INIT_WORK(&io->work, kcryptd_crypt);
-	queue_work(cc->crypt_queue, &io->work);
+	queue_work(crypt_workqueue, &io->work);
 }
 
 /*
@@ -974,6 +1078,21 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+	int cpu, n, err;
+
+	err = 0;
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		n = crypto_ablkcipher_setkey(cs->tfm, cc->key, cc->key_size);
+		if (n)
+			err = n;
+	}
+	return err;
+}
+
+
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
 	unsigned key_size = strlen(key) >> 1;
@@ -989,14 +1108,48 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+	return crypt_setkey_allcpus(cc);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 	memset(&cc->key, 0, cc->key_size * sizeof(u8));
-	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+	return crypt_setkey_allcpus(cc);
+}
+
+/* Use a global per-CPU encryption workqueue for all mounts */
+static int crypt_create_workqueues(void)
+{
+	int ret = 0;
+
+	/* Module unload cleans up on error */
+	mutex_lock(&queue_setup_lock);
+	if (!crypt_workqueue) {
+		crypt_workqueue = create_workqueue("dmcrypt");
+		if (!crypt_workqueue)
+			ret = -ENOMEM;
+	}
+	if (!io_workqueue) {
+		io_workqueue = create_workqueue("dmcrypt-io");
+		if (!io_workqueue)
+			ret = -ENOMEM;
+	}
+	mutex_unlock(&queue_setup_lock);
+	return ret;
+}
+
+static void crypt_dtr_cpus(struct crypt_config *cc)
+{
+	int cpu;
+
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		if (cs->tfm) {
+			crypto_free_ablkcipher(cs->tfm);
+			cs->tfm = NULL;
+		}
+	}
 }
 
 /*
@@ -1014,6 +1167,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	char *ivopts;
 	unsigned int key_size;
 	unsigned long long tmpll;
+	int cpu;
 
 	if (argc != 5) {
 		ti->error = "Not enough arguments";
@@ -1038,7 +1192,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -ENOMEM;
 	}
 
-	/* Compatibility mode for old dm-crypt cipher strings */
+	cc->cpu = alloc_percpu(struct crypt_cpu);
+	if (!cc->cpu) {
+		ti->error = "Cannot allocate per cpu state";
+		goto bad_percpu;
+	}
+
+	/* Compatibility mode for old dm-crypt cipher strings */
 	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
 		chainmode = "cbc";
 		ivmode = "plain";
@@ -1055,15 +1215,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_cipher;
 	}
 
-	tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
-	if (IS_ERR(tfm)) {
-		ti->error = "Error allocating crypto tfm";
-		goto bad_cipher;
+	for_each_possible_cpu (cpu) {
+		tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
+		if (IS_ERR(tfm)) {
+			ti->error = "Error allocating crypto tfm";
+			goto bad_ivmode;
+		}
+		per_cpu_ptr(cc->cpu, cpu)->tfm = tfm;
 	}
+	tfm = any_tfm(cc);
 
 	strcpy(cc->cipher, cipher);
 	strcpy(cc->chainmode, chainmode);
-	cc->tfm = tfm;
 
 	if (crypt_set_key(cc, argv[1]) < 0) {
 		ti->error = "Error decoding and setting key";
@@ -1134,7 +1297,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad_req_pool;
 	}
-	cc->req = NULL;
 
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
@@ -1148,6 +1310,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_bs;
 	}
 
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		if (crypto_ablkcipher_setkey(cs->tfm, cc->key, key_size) < 0) {
+			ti->error = "Error setting key";
+			goto bad_device;
+		}
+	}
+
 	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
 		ti->error = "Invalid iv_offset sector";
 		goto bad_device;
@@ -1177,25 +1347,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	} else
 		cc->iv_mode = NULL;
 
-	cc->io_queue = create_singlethread_workqueue("kcryptd_io");
-	if (!cc->io_queue) {
-		ti->error = "Couldn't create kcryptd io queue";
-		goto bad_io_queue;
-	}
-
-	cc->crypt_queue = create_singlethread_workqueue("kcryptd");
-	if (!cc->crypt_queue) {
-		ti->error = "Couldn't create kcryptd queue";
-		goto bad_crypt_queue;
+	if (crypt_create_workqueues() < 0) {
+		ti->error = "Couldn't create kcrypt work queues";
+		goto bad_queues;
 	}
 
 	ti->num_flush_requests = 1;
 	ti->private = cc;
 	return 0;
 
-bad_crypt_queue:
-	destroy_workqueue(cc->io_queue);
-bad_io_queue:
+bad_queues:
 	kfree(cc->iv_mode);
 bad_ivmode_string:
 	dm_put_device(ti, cc->dev);
@@ -1211,8 +1372,10 @@ bad_slab_pool:
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
 bad_ivmode:
-	crypto_free_ablkcipher(tfm);
+	crypt_dtr_cpus(cc);
 bad_cipher:
+	free_percpu(cc->cpu);
+bad_percpu:
 	/* Must zero key material before freeing */
 	kzfree(cc);
 	return -EINVAL;
@@ -1220,13 +1383,14 @@ bad_cipher:
 
 static void crypt_dtr(struct dm_target *ti)
 {
+	int cpu;
 	struct crypt_config *cc = (struct crypt_config *) ti->private;
 
-	destroy_workqueue(cc->io_queue);
-	destroy_workqueue(cc->crypt_queue);
-
-	if (cc->req)
-		mempool_free(cc->req, cc->req_pool);
+	for_each_possible_cpu (cpu) {
+		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
+		if (cs->req)
+			mempool_free(cs->req, cc->req_pool);
+	}
 
 	bioset_free(cc->bs);
 	mempool_destroy(cc->page_pool);
@@ -1236,9 +1400,11 @@ static void crypt_dtr(struct dm_target *ti)
 	kfree(cc->iv_mode);
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
-	crypto_free_ablkcipher(cc->tfm);
+	crypt_dtr_cpus(cc);
 	dm_put_device(ti, cc->dev);
 
+	free_percpu(cc->cpu);
+
 	/* Must zero key material before freeing */
 	kzfree(cc);
 }
@@ -1428,6 +1594,10 @@ static void __exit dm_crypt_exit(void)
 {
 	dm_unregister_target(&crypt_target);
 	kmem_cache_destroy(_crypt_io_pool);
+	if (crypt_workqueue)
+		destroy_workqueue(crypt_workqueue);
+	if (io_workqueue)
+		destroy_workqueue(io_workqueue);
 }
 
 module_init(dm_crypt_init);

* Re: [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Eric Dumazet @ 2010-05-31 16:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: herbert, dm-devel, linux-kernel, agk, ak

On Monday 31 May 2010 at 18:04 +0200, Andi Kleen wrote:
> DM-CRYPT: Scale to multiple CPUs
> 
> Currently dm-crypt does all encryption work per dmcrypt mapping in a
> single workqueue. This does not scale well when multiple CPUs
> are submitting IO at a high rate. The single CPU running the single
> thread cannot keep up with the encryption and encrypted IO performance
> tanks.
> 
> This patch changes the crypto workqueue to be per CPU. This means
> that as long as the IO submitters (or the interrupt target CPUs
> for reads) run on different CPUs, the encryption work will also
> run in parallel.
> 
> To avoid a bottleneck on the IO worker I also changed those to be
> per CPU threads.
> 
> There is still some shared data, so I suspect some bouncing
> cache lines. But I haven't done a detailed study on that yet.
> 
> All the threads are global, shared by all dm-crypt mappings, rather
> than per mapping. That avoids a thread explosion on systems with a
> large number of CPUs and a larger number of dm-crypt mappings. The
> code takes care to avoid problems with nested mappings.
>     
> Signed-off-by: Andi Kleen <ak@linux.intel.com>


>  	/*
> +	 * Duplicated per cpu state. Access through
> +	 * per_cpu_ptr() only.
> +	 */
> +	struct crypt_cpu *cpu;
> +
> +	/*
>  	 * Layout of each crypto request:
>  	 *

Since commit e0fdb0e050eae331, we have a __percpu annotation so that
sparse can be augmented. This would also make the comment unnecessary...


We also have this_cpu_ accessors

> +       return per_cpu_ptr(cc->cpu, smp_processor_id());

	this_cpu_ptr(cc->cpu)

and __this_cpu_ptr() for the 'raw' version


> +       __get_cpu_var(io_wq_cpu) = current;

	this_cpu_write(io_wq_cpu, current)
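
For illustration, both suggestions applied to the patch would look
roughly like this (an untested sketch against the 2.6.34-era percpu
API):

	struct crypt_config {
		...
		struct crypt_cpu __percpu *cpu;	/* now sparse-checked */
		...
	};

	static struct crypt_cpu *crypt_me(struct crypt_config *cc)
	{
		return this_cpu_ptr(cc->cpu);
	}

	static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
	{
		/* cpu doesn't matter, output is always the same */
		return __this_cpu_ptr(cc->cpu)->tfm;
	}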

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Milan Broz @ 2010-05-31 17:22 UTC (permalink / raw)
  To: device-mapper development; +Cc: Andi Kleen, herbert, linux-kernel, agk, ak

On 05/31/2010 06:04 PM, Andi Kleen wrote:
> DM-CRYPT: Scale to multiple CPUs
> 
> Currently dm-crypt does all encryption work per dmcrypt mapping in a
> single workqueue. This does not scale well when multiple CPUs
> are submitting IO at a high rate. The single CPU running the single
> thread cannot keep up with the encryption and encrypted IO performance
> tanks.

This is true only if encryption runs on the CPU synchronously.

(Usually this means a high-speed SSD, or dm-crypt above striped RAID,
where the underlying device throughput is higher than the CPU
encryption speed.)

I did a lot of experiments with a similar design and abandoned it.
(If we go this way, there should be some parameter limiting the
number of CPU threads used for encryption; I had this configurable
through dm messages online + an initial kernel module parameter.)

But I see two main problems:

1) How does this scale together with asynchronous
crypto, which runs in parallel in the crypto API layer (and has limited
resources)? (AES-NI for example)

2) Per-volume threads and mempools were added to solve low-memory
problems (exhausted mempools); isn't a deadlock possible here again?

(Like one CPU, many dm-crypt volumes - a thread waiting to allocate a
page from an exhausted mempool, blocking another request (another volume)
queued later which would free some pages after crypt processing.
This cannot happen with per-volume threads. Or am I missing something here?)


Anyway, I still think that the proper solution to this problem is to run
parallel requests in the crypto API using the async crypt interface,
IOW parallelize this in the crypto API layer, which knows best which resources
it can use for crypto work.

(Herbert - is something like per-CPU crypto threads planned
for use in this case?)

Milan


* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Andi Kleen @ 2010-05-31 17:42 UTC (permalink / raw)
  To: Milan Broz
  Cc: device-mapper development, Andi Kleen, herbert, linux-kernel, agk, ak

On Mon, May 31, 2010 at 07:22:21PM +0200, Milan Broz wrote:
> On 05/31/2010 06:04 PM, Andi Kleen wrote:
> > DM-CRYPT: Scale to multiple CPUs
> > 
> > Currently dm-crypt does all encryption work per dmcrypt mapping in a
> > single workqueue. This does not scale well when multiple CPUs
> > are submitting IO at a high rate. The single CPU running the single
> > thread cannot keep up with the encryption and encrypted IO performance
> > tanks.
> 
> This is true only if encryption runs on the CPU synchronously.

That's the common case isn't it?

On asynchronous crypto it won't change anything compared
to the current state.

> I did a lot of experiments with a similar design and abandoned it.
> (If we go this way, there should be some parameter limiting the
> number of CPU threads used for encryption; I had this configurable
> through dm messages online + an initial kernel module parameter.)

One thread per CPU is exactly the right number.

If you want fewer threads used, submit IO from fewer CPUs (e.g.
with a cpuset). More never makes sense.

The only alternative possibility might be one per core
on a SMT system, but if that's done it should be implemented
in the workqueue interface. Right now one per SMT thread
is fine though.

> 1) How does this scale together with asynchronous
> crypto, which runs in parallel in the crypto API layer (and has limited
> resources)? (AES-NI for example)

AES-NI is not asynchronous and doesn't have limited resources.

> 
> 2) Per-volume threads and mempools were added to solve low-memory
> problems (exhausted mempools); isn't a deadlock possible here again?

Increasing the number of parallel submitters does not increase deadlocks
with mempool as long as they don't nest.  They would just
block each other, but eventually make progress as one finishes. 

This only matters when you're low on memory anyways, 
in the common case with enough memory there is full parallelism.

Nesting is possible, but the same as before.

> 
> (Like one CPU, many dm-crypt volumes - a thread waiting to allocate a
> page from an exhausted mempool, blocking another request (another volume)

As long as they are not dependent that is not a deadlock
(and they are not) 

> Anyway, I still think that the proper solution to this problem is to run
> parallel requests in the crypto API using the async crypt interface,
> IOW parallelize this in the crypto API layer, which knows best which resources
> it can use for crypto work.

I discussed this with Herbert before and he suggested that it's better
done in the submitter for the common case. There is a parallel crypt
manager now (pcrypt), but it has more overhead than simply doing it directly.
It can be still used when it is instantiated, but it's likely
only a win with very slow CPUs. For the case of reasonably fast
CPUs all that matters is that it scales.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Andi Kleen @ 2010-05-31 17:46 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Andi Kleen, herbert, dm-devel, linux-kernel, agk, ak

On Mon, May 31, 2010 at 06:34:55PM +0200, Eric Dumazet wrote:

Yes, seems like I'm behind on per-CPU innovations.

> Since commit e0fdb0e050eae331, we have a __percpu annotation so that
> sparse can be augmented. This would also make the comment unnecessary...

I think the comment is still necessary.

> 
> > +       __get_cpu_var(io_wq_cpu) = current;
> 
> 	this_cpu_write(io_wq_cpu, current)

And this one looks more ugly than before and does not seem
to generate better code, so I decided to skip it.

Thanks.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Eric Dumazet @ 2010-05-31 17:52 UTC (permalink / raw)
  To: Andi Kleen; +Cc: herbert, dm-devel, linux-kernel, agk, ak

On Monday 31 May 2010 at 19:46 +0200, Andi Kleen wrote:
> On Mon, May 31, 2010 at 06:34:55PM +0200, Eric Dumazet wrote:
> 
> Yes, seems like I'm behind on per-CPU innovations.
> 
> > Since commit e0fdb0e050eae331, we have a __percpu annotation so that
> > sparse can be augmented. This would also make the comment unnecessary...
> 
> I think the comment is still necessary.
> 
> > 
> > > +       __get_cpu_var(io_wq_cpu) = current;
> > 
> > 	this_cpu_write(io_wq_cpu, current)
> 
> And this one looks more ugly than before and does not seem
> to generate better code, so I decided to skip it.

This is because you should use __this_cpu_write() in this context of
course ;)

generated code: one instruction
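
For reference, the assignment in kcryptd_io() would then read
(a sketch):

	__this_cpu_write(io_wq_cpu, current);

which on x86 should compile down to a single %gs-relative mov.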

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Milan Broz @ 2010-05-31 18:10 UTC (permalink / raw)
  To: Andi Kleen; +Cc: device-mapper development, herbert, linux-kernel, agk, ak

On 05/31/2010 07:42 PM, Andi Kleen wrote:
> On Mon, May 31, 2010 at 07:22:21PM +0200, Milan Broz wrote:
>> On 05/31/2010 06:04 PM, Andi Kleen wrote:
>>> DM-CRYPT: Scale to multiple CPUs
>>>
>>> Currently dm-crypt does all encryption work per dmcrypt mapping in a
>>> single workqueue. This does not scale well when multiple CPUs
>>> are submitting IO at a high rate. The single CPU running the single
>>> thread cannot keep up with the encryption and encrypted IO performance
>>> tanks.
>>
>> This is true only if encryption runs on the CPU synchronously.
> 
> That's the common case isn't it?

Maybe now, but I think not in the near future.
 
> On asynchronous crypto it won't change anything compared
> to the current state.
> 
>> I did a lot of experiments with a similar design and abandoned it.
>> (If we go this way, there should be some parameter limiting the
>> number of CPU threads used for encryption; I had this configurable
>> through dm messages online + an initial kernel module parameter.)
> 
> One thread per CPU is exactly the right number.

Sure, I mean being able to set it from 1 to the number of CPUs, so as
not to use all CPUs for crypto; more threads make no sense.
But this is not the real problem here.

>> 1) How does this scale together with asynchronous
>> crypto, which runs in parallel in the crypto API layer (and has limited
>> resources)? (AES-NI for example)
> 
> AES-NI is not asynchronous and doesn't have limited resources.

AES-NI uses the asynchronous crypto interface, and was using the
asynchronous crypto API cryptd daemon IIRC. So has this changed?

>> 2) Per-volume threads and mempools were added to solve low-memory
>> problems (exhausted mempools); isn't a deadlock possible here again?
> 
> Increasing the number of parallel submitters does not increase deadlocks
> with mempool as long as they don't nest.  They would just
> block each other, but eventually make progress as one finishes. 

I meant if they nest, of course - sorry for the confusion.

> This only matters when you're low on memory anyways, 
> in the common case with enough memory there is full parallelism.

If it breaks a not-so-common case, it is the wrong approach :-)
It must not deadlock. We already had a similar design before and it
was broken; that's why I am asking.

>> (Like one CPU, many dm-crypt volumes - a thread waiting to allocate a
>> page from an exhausted mempool, blocking another request (another volume)
> 
> As long as they are not dependent that is not a deadlock
> (and they are not) 

Please try truecrypt, select AES-Twofish-Serpent mode and see how it is
nested... This is a common case. (not that I like it :-)

>> Anyway, I still think that the proper solution to this problem is to run
>> parallel requests in the crypto API using the async crypt interface,
>> IOW parallelize this in the crypto API layer, which knows best which resources
>> it can use for crypto work.
> 
> I discussed this with Herbert before and he suggested that it's better
> done in the submitter for the common case. There is a parallel crypt
> manager now (pcrypt), but it has more overhead than simply doing it directly.

Yes, I meant pcrypt extension.
Asynchronous crypto has big overhead, but that can be optimized eventually, no?
Could you please cc me or dm-devel on these discussions?

So we now run parallel crypt over parallel crypt in some cases...
This is not good. And dm-crypt has no way to check whether a crypto
request will be processed synchronously or asynchronously.
(If it were possible, we could run a parallel thread using your patch
for synchronous requests, and in the other case just submit the request
to the crypto API and let it process asynchronously.)

> It can be still used when it is instantiated, but it's likely
> only a win with very slow CPUs. For the case of reasonably fast
> CPUs all that matters is that it scales.

I know this is a problem and parallel processing is needed here,
but it must not cause problems or slow down the AES-NI case;
most new CPUs will support these extensions...

Milan


* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
From: Andi Kleen @ 2010-05-31 18:27 UTC (permalink / raw)
  To: Milan Broz
  Cc: Andi Kleen, device-mapper development, herbert, linux-kernel, agk, ak

On Mon, May 31, 2010 at 08:10:22PM +0200, Milan Broz wrote:
> On 05/31/2010 07:42 PM, Andi Kleen wrote:
> > On Mon, May 31, 2010 at 07:22:21PM +0200, Milan Broz wrote:
> >> On 05/31/2010 06:04 PM, Andi Kleen wrote:
> >>> DM-CRYPT: Scale to multiple CPUs
> >>>
> >>> Currently dm-crypt does all encryption work per dmcrypt mapping in a
> >>> single workqueue. This does not scale well when multiple CPUs
> >>> are submitting IO at a high rate. The single CPU running the single
> >>> thread cannot keep up with the encryption and encrypted IO performance
> >>> tanks.
> >>
> >> This is true only if encryption runs on the CPU synchronously.
> > 
> > That's the common case isn't it?
> 
> Maybe now, but I think not in the near future.

Why? 

> >> 1) How does this scale together with asynchronous
> >> crypto, which runs in parallel in the crypto API layer (and has limited
> >> resources)? (AES-NI for example)
> > 
> > AES-NI is not asynchronous and doesn't have limited resources.
> 
> AES-NI uses the asynchronous crypto interface, and was using the
> asynchronous crypto API cryptd daemon IIRC. So has this changed?

AFAIK all ciphers use the asynchronous interface, but that
doesn't mean they are actually asynchronous. AES-NI certainly
does not require running in a special thread. The only
thing it doesn't support is running from interrupt context.
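
For reference, the AES-NI glue code handles that case with a fallback
along these lines (a paraphrased sketch of arch/x86/crypto/aesni-intel_glue.c
from that era, details elided):

	if (irq_fpu_usable()) {
		kernel_fpu_begin();
		aesni_enc(ctx, dst, src);	/* direct, synchronous */
		kernel_fpu_end();
	} else {
		/* defer to the cryptd thread; completes asynchronously */
		...
	}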

> 
> >> 2) Per-volume threads and mempools were added to solve low-memory
> >> problems (exhausted mempools); isn't a deadlock possible here again?
> > 
> > Increasing the number of parallel submitters does not increase deadlocks
> > with mempool as long as they don't nest.  They would just
> > block each other, but eventually make progress as one finishes. 
> 
> I mean if they nest of course, sorry for confusion.

No change to that, it's the same as it always worked.

Anyway, right now you would need to nest more than 16 times I think
to exhaust the mempool (or 8, but the pages are less critical I think).

For me that seems like a "don't do that if it hurts" situation.
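
(The 16 and 8 match the MIN_IOS and MIN_BIO_PAGES #defines in the patch
quoted later in this thread.)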

> 
> > This only matters when you're low on memory anyways, 
> > in the common case with enough memory there is full parallelism.
> 
> If it breaks a not-so-common case, it is the wrong approach :-)
> It must not deadlock. We already had a similar design before and it
> was broken, that's why I am asking.

There's no new deadlock possibility to my knowledge.


> 
> >> (Like one CPU, many dm-crypt volumes - a thread waiting to allocate a
> >> page from an exhausted mempool, blocking another request (another volume)
> > 
> > As long as they are not dependent that is not a deadlock
> > (and they are not) 
> 
> Please try truecrypt, select AES-Twofish-Serpent mode and see how it is
> nested... This is a common case. (not that I like it :-)

Nesting a few times is fine; you should just not nest
deeper than your mempool size.

Again also this is a red herring here because my patch
does not change the situation.

I guess if there was a way to determine the global nesting
one could also increase the mempool allocation to match it.
I don't think it has anything to do with my patch though.

Whether you will ever have a user who wants to nest more than 8/16
times is also quite doubtful.

> 
> >> Anyway, I still think that the proper solution to this problem is to run
> >> parallel requests in cryptoAPI using the async crypt interface,
> >> IOW parallelize this on the cryptoAPI layer, which knows best which
> >> resources it can use for crypto work.
> > 
> > I discussed this with Herbert before and he suggested that it's better
> > done in the submitter for the common case. There is a parallel crypt
> > manager now (pcrypt), but it has more overhead than simply doing it directly.
> 
> Yes, I meant the pcrypt extension.
> Asynchronous crypto has a big overhead, but that can be optimized eventually, no?

Maybe it can be optimized, but for a fast CPU it will never
be as fast as just doing it directly.

Communication between CPUs is slow.

> 
> So we now run parallel crypt over parallel crypt in some cases...
> This is not good. And dm-crypt has no way to check whether a crypto
> request will be processed synchronously or asynchronously.

It doesn't need to know.

Also even if there's asynchronous crypto it's more efficient
to submit to it from multiple CPUs than to funnel everything
through a single CPU bottleneck.


> > It can be still used when it is instantiated, but it's likely
> > only a win with very slow CPUs. For the case of reasonably fast
> > CPUs all that matters is that it scales.
> 
> I know this is a problem and parallel processing is needed here,
> but it must not cause problems or slow down the AES-NI case;
> most new CPUs will support these extensions...

My patch is the best way to make use of AES-NI.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 18:27       ` Andi Kleen
@ 2010-05-31 18:55         ` Milan Broz
  2010-05-31 19:04           ` Andi Kleen
  2010-06-01  2:39         ` Mikulas Patocka
  1 sibling, 1 reply; 40+ messages in thread
From: Milan Broz @ 2010-05-31 18:55 UTC (permalink / raw)
  To: Andi Kleen; +Cc: device-mapper development, herbert, linux-kernel, agk, ak

On 05/31/2010 08:27 PM, Andi Kleen wrote:

>>> AES-NI is not asynchronous and doesn't have limited resources.
>>
>> AES-NI used the asynchronous crypto interface; it was using the
>> asynchronous crypto API cryptd daemon IIRC. So this changed?
> 
> AFAIK all ciphers use the asynchronous interface,  but that 
> doesn't mean they are actually asynchronous. AES-NI certainly
> does not require running in a special thread. The only
> thing it doesn't support is running from interrupt context.

I mean how it is implemented now in the crypto API, and I was almost
sure that the aes-ni acceleration code uses cryptd (iow real asynchronous processing)
and also that not all CPU cores can run these instructions in parallel.

So I am mistaken here?

Milan

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 18:55         ` Milan Broz
@ 2010-05-31 19:04           ` Andi Kleen
  2010-05-31 22:07               ` Herbert Xu
  0 siblings, 1 reply; 40+ messages in thread
From: Andi Kleen @ 2010-05-31 19:04 UTC (permalink / raw)
  To: Milan Broz
  Cc: Andi Kleen, device-mapper development, herbert, linux-kernel, agk, ak

> I mean how it is implemented now in the crypto API, and I was almost
> sure that the aes-ni acceleration code uses cryptd (iow real asynchronous processing)
> and also that not all CPU cores can run these instructions in parallel.

I think you can configure it to use cryptd (or pcrypt), but it's not the default
and usually has higher overhead.

Each CPU core has its own support for AES-NI (it's part
of the SSE units), there's no shared crypto hardware between cores.

There are other systems with separate AES units, but they are
considerably more obscure.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 19:04           ` Andi Kleen
@ 2010-05-31 22:07               ` Herbert Xu
  0 siblings, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-05-31 22:07 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Milan Broz, device-mapper development, linux-kernel, agk, ak

On Mon, May 31, 2010 at 09:04:00PM +0200, Andi Kleen wrote:
> > I mean how it is implemented now in the crypto API, and I was almost
> > sure that the aes-ni acceleration code uses cryptd (iow real asynchronous processing)
> > and also that not all CPU cores can run these instructions in parallel.
> 
> I think you can configure it to use cryptd (or pcrypt), but it's not the default
> and usually has higher overhead.

Right.  The only reason aes-ni uses the async interface is to
work around the fact that you have to save FPU state when using
it.  For dm-crypt you can consider it to be synchronous.

Note that cryptd is nothing like pcrypt.  cryptd always keeps
the work on the local CPU, even if it goes into another thread.

OTOH pcrypt was designed to parallelise the work across CPUs.
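
A glance at cryptd_enqueue_request() in crypto/cryptd.c (roughly as of
2.6.34, trimmed here) shows the difference:

static int cryptd_enqueue_request(struct cryptd_queue *queue,
				  struct crypto_async_request *request)
{
	int cpu, err;
	struct cryptd_cpu_queue *cpu_queue;

	cpu = get_cpu();
	cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu);
	err = crypto_enqueue_request(&cpu_queue->queue, request);
	/* the request stays on the submitting CPU's queue and is
	 * serviced there; it is never migrated to another CPU */
	queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
	put_cpu();

	return err;
}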

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 22:07               ` Herbert Xu
@ 2010-06-01  1:46                 ` huang ying
  -1 siblings, 0 replies; 40+ messages in thread
From: huang ying @ 2010-06-01  1:46 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andi Kleen, Milan Broz, device-mapper development, linux-kernel, agk, ak

On Tue, Jun 1, 2010 at 6:07 AM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Mon, May 31, 2010 at 09:04:00PM +0200, Andi Kleen wrote:
>> > I mean how it is implemented now in the crypto API, and I was almost
>> > sure that the aes-ni acceleration code uses cryptd (iow real asynchronous processing)
>> > and also that not all CPU cores can run these instructions in parallel.
>>
>> I think you can configure it to use cryptd (or pcrypt), but it's not the default
>> and usually has higher overhead.
>
> Right.  The only reason aes-ni uses the async interface is to
> work around the fact that you have to save FPU state when using
> it.  For dm-crypt you can consider it to be synchronous.

The reason for aes-ni to use the async interface is that the FPU/SSE
instructions enclosed by kernel_fpu_begin and kernel_fpu_end are not
reentrant. That is, it is not safe to use FPU/SSE instructions in
interrupt context if the interrupted context was already using them.
So irq_fpu_usable is used in the async interface of aes-ni, and the
asynchronous implementation is taken only if !irq_fpu_usable. In most
instances, the async interface of aes-ni will run in synchronous mode.
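
Roughly, the dispatch in arch/x86/crypto/aesni-intel_glue.c looks like
this (trimmed and lightly simplified, so treat the details as
approximate):

static int ablk_encrypt(struct ablkcipher_request *req)
{
	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
	struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm);

	if (!irq_fpu_usable()) {
		/* FPU/SSE state may not be touched here: bounce the
		 * request to cryptd, which redoes it in process context */
		struct ablkcipher_request *cryptd_req =
			ablkcipher_request_ctx(req);

		memcpy(cryptd_req, req, sizeof(*req));
		ablkcipher_request_set_tfm(cryptd_req,
					   &ctx->cryptd_tfm->base);
		return crypto_ablkcipher_encrypt(cryptd_req);
	} else {
		/* the common case: encrypt synchronously, inline, on
		 * the submitting CPU */
		struct blkcipher_desc desc;

		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
		desc.info = req->info;
		desc.flags = 0;
		return crypto_blkcipher_crt(desc.tfm)->encrypt(
			&desc, req->dst, req->src, req->nbytes);
	}
}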

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 18:27       ` Andi Kleen
  2010-05-31 18:55         ` Milan Broz
@ 2010-06-01  2:39         ` Mikulas Patocka
  1 sibling, 0 replies; 40+ messages in thread
From: Mikulas Patocka @ 2010-06-01  2:39 UTC (permalink / raw)
  To: device-mapper development
  Cc: Milan Broz, ak, linux-kernel, herbert, Andi Kleen, agk

> > >> 2) Per volume threads and mempools were added to solve low memory
> > >> problems (exhausted mempools), isn't now possible deadlock here again?
> > > 
> > > Increasing the number of parallel submitters does not increase deadlocks
> > > with mempool as long as they don't nest.  They would just
> > > block each other, but eventually make progress as one finishes. 
> > 
> > I mean if they nest of course, sorry for confusion.
> 
> No change to that, it's the same as it always worked.
> 
> Anyways, Right now you would need to nest more than 16 times I think
> to exhaust the mempool (or 8, but the pages are less critical I think)
> 
> For me that seems like a "don't do that if hurts" situation.

If the mempools are per-instance (they are), mappings can nest any number
of times, because each nested instance gets its own mempools. It is not a
problem.
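
For reference, each instance allocates its own pools in crypt_ctr()
(roughly, from drivers/md/dm-crypt.c):

	/* per-instance pools: a nested mapping never waits on the
	 * pools of the mapping stacked above it */
	cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
	...
	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);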

Mikulas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-05-31 16:04 ` Andi Kleen
                   ` (2 preceding siblings ...)
  (?)
@ 2010-06-01  2:44 ` Mikulas Patocka
  2010-06-01  4:39     ` Herbert Xu
  2010-06-01  6:40   ` Andi Kleen
  -1 siblings, 2 replies; 40+ messages in thread
From: Mikulas Patocka @ 2010-06-01  2:44 UTC (permalink / raw)
  To: device-mapper development; +Cc: herbert, linux-kernel, agk, ak

Questions:

If you are optimizing it,

1) why don't you optimize it in such a way that if one CPU submits 
requests, the crypto work is spread among all the CPUs? Currently it 
spreads the work only if different CPUs submit it.

2) why not optimize the software async crypto daemon (crypto/cryptd.c) instead
of dm-crypt, so that all kernel subsystems can actually take advantage of
those multi-CPU optimizations, not just dm-crypt?

Mikulas


On Mon, 31 May 2010, Andi Kleen wrote:

> DM-CRYPT: Scale to multiple CPUs
> 
> Currently dm-crypt does all encryption work per dmcrypt mapping in a
> single workqueue. This does not scale well when multiple CPUs
> are submitting IO at a high rate. The single CPU running the single
> thread cannot keep up with the encryption and encrypted IO performance
> tanks.
> 
> This patch changes the crypto workqueue to be per CPU. This means
> that as long as the IO submitter (or the interrupt target CPUs
> for reads) runs on different CPUs the encryption work will be also
> parallel.
> 
> To avoid a bottleneck on the IO worker I also changed those to be
> per CPU threads.
> 
> There is still some shared data, so I suspect some bouncing
> cache lines. But I haven't done a detailed study on that yet.
> 
> All the threads are global, not per CPU. That is to avoid a thread
> explosion on systems with a large number of CPUs and a larger
> number of dm-crypt mappings. The code takes care to avoid problems
> with nested mappings.
>     
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> 
> diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
> index 3bdbb61..fb95896 100644
> --- a/drivers/md/dm-crypt.c
> +++ b/drivers/md/dm-crypt.c
> @@ -18,6 +18,7 @@
>  #include <linux/crypto.h>
>  #include <linux/workqueue.h>
>  #include <linux/backing-dev.h>
> +#include <linux/percpu.h>
>  #include <asm/atomic.h>
>  #include <linux/scatterlist.h>
>  #include <asm/page.h>
> @@ -77,11 +78,15 @@ struct crypt_iv_operations {
>  };
>  
>  struct iv_essiv_private {
> -	struct crypto_cipher *tfm;
>  	struct crypto_hash *hash_tfm;
>  	u8 *salt;
>  };
>  
> +/* Duplicated per CPU state for cipher */
> +struct iv_essiv_private_cpu {
> +	struct crypto_cipher *tfm;
> +};
> +
>  struct iv_benbi_private {
>  	int shift;
>  };
> @@ -91,6 +96,18 @@ struct iv_benbi_private {
>   * and encrypts / decrypts at the same time.
>   */
>  enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
> +
> +/* Duplicated per CPU state for cipher */
> +struct crypt_cpu {
> +	struct ablkcipher_request *req;
> +	struct crypto_ablkcipher *tfm;
> +	struct iv_essiv_private_cpu ie;
> +};
> +
> +/*
> + * The fields in here must be read only after initialization,
> + * changing state should be in crypt_cpu.
> + */
>  struct crypt_config {
>  	struct dm_dev *dev;
>  	sector_t start;
> @@ -103,10 +120,6 @@ struct crypt_config {
>  	mempool_t *req_pool;
>  	mempool_t *page_pool;
>  	struct bio_set *bs;
> -
> -	struct workqueue_struct *io_queue;
> -	struct workqueue_struct *crypt_queue;
> -
>  	/*
>  	 * crypto related data
>  	 */
> @@ -120,6 +133,12 @@ struct crypt_config {
>  	unsigned int iv_size;
>  
>  	/*
> +	 * Duplicated per cpu state. Access through
> +	 * per_cpu_ptr() only.
> +	 */
> +	struct crypt_cpu *cpu;
> +
> +	/*
>  	 * Layout of each crypto request:
>  	 *
>  	 *   struct ablkcipher_request
> @@ -133,25 +152,44 @@ struct crypt_config {
>  	 * correctly aligned.
>  	 */
>  	unsigned int dmreq_start;
> -	struct ablkcipher_request *req;
>  
>  	char cipher[CRYPTO_MAX_ALG_NAME];
>  	char chainmode[CRYPTO_MAX_ALG_NAME];
> -	struct crypto_ablkcipher *tfm;
>  	unsigned long flags;
>  	unsigned int key_size;
>  	u8 key[0];
>  };
>  
> +/* RED-PEN scale with number of cpus? */
>  #define MIN_IOS        16
>  #define MIN_POOL_PAGES 32
>  #define MIN_BIO_PAGES  8
>  
> +/* Protect creation of a new crypt queue */
> +static DEFINE_MUTEX(queue_setup_lock);
> +static struct workqueue_struct *crypt_workqueue;
> +static struct workqueue_struct *io_workqueue;
> +static DEFINE_PER_CPU(struct task_struct *, io_wq_cpu);
> +
>  static struct kmem_cache *_crypt_io_pool;
>  
>  static void clone_init(struct dm_crypt_io *, struct bio *);
>  static void kcryptd_queue_crypt(struct dm_crypt_io *io);
>  
> +static struct crypt_cpu *crypt_me(struct crypt_config *cc)
> +{
> +	return per_cpu_ptr(cc->cpu, smp_processor_id());
> +}
> +
> +/* Use this for cipher attributes that are the same for all cpus */
> +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
> +{
> +	struct crypto_ablkcipher *tfm;
> +	/* cpu doesn't matter, output is always the same */
> +	tfm = per_cpu_ptr(cc->cpu, raw_smp_processor_id())->tfm;
> +	return tfm;
> +}
> +
>  /*
>   * Different IV generation algorithms:
>   *
> @@ -198,7 +236,7 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
>  	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
>  	struct hash_desc desc;
>  	struct scatterlist sg;
> -	int err;
> +	int err, n, cpu;
>  
>  	sg_init_one(&sg, cc->key, cc->key_size);
>  	desc.tfm = essiv->hash_tfm;
> @@ -208,8 +246,18 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
>  	if (err)
>  		return err;
>  
> -	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +
> +		n = crypto_cipher_setkey(cs->ie.tfm, essiv->salt,
>  				    crypto_hash_digestsize(essiv->hash_tfm));
> +		if (n) {
> +			err = n;
> +			break;
> +		}
> +	}
> +
> +	return err;
>  }
>  
>  /* Wipe salt and reset key derived from volume key */
> @@ -217,24 +265,70 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
>  {
>  	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
>  	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
> +	int cpu, err, n;
>  
>  	memset(essiv->salt, 0, salt_size);
>  
> -	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
> +	err = 0;
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		n = crypto_cipher_setkey(cs->ie.tfm, essiv->salt, salt_size);
> +		if (n)
> +			err = n;
> +	}
> +	return err;
> +}
> +
> +/* Set up per cpu cipher state */
> +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
> +					     struct dm_target *ti,
> +					     u8 *salt, unsigned saltsize)
> +{
> +	struct crypto_cipher *essiv_tfm;
> +	int err;
> +
> +	/* Setup the essiv_tfm with the given salt */
> +	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
> +	if (IS_ERR(essiv_tfm)) {
> +		ti->error = "Error allocating crypto tfm for ESSIV";
> +		return essiv_tfm;
> +	}
> +
> +	if (crypto_cipher_blocksize(essiv_tfm) !=
> +	    crypto_ablkcipher_ivsize(any_tfm(cc))) {
> +		ti->error = "Block size of ESSIV cipher does "
> +			    "not match IV size of block cipher";
> +		crypto_free_cipher(essiv_tfm);
> +		return ERR_PTR(-EINVAL);
> +	}
> +	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
> +	if (err) {
> +		ti->error = "Failed to set key for ESSIV cipher";
> +		crypto_free_cipher(essiv_tfm);
> +		return ERR_PTR(err);
> +	}
> +
> +	return essiv_tfm;
>  }
>  
>  static void crypt_iv_essiv_dtr(struct crypt_config *cc)
>  {
> +	int cpu;
>  	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
>  
> -	crypto_free_cipher(essiv->tfm);
> -	essiv->tfm = NULL;
> -
>  	crypto_free_hash(essiv->hash_tfm);
>  	essiv->hash_tfm = NULL;
>  
>  	kzfree(essiv->salt);
>  	essiv->salt = NULL;
> +
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		if (cs->ie.tfm) {
> +			crypto_free_cipher(cs->ie.tfm);
> +			cs->ie.tfm = NULL;
> +		}
> +	}
>  }
>  
>  static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
> @@ -244,6 +338,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
>  	struct crypto_hash *hash_tfm = NULL;
>  	u8 *salt = NULL;
>  	int err;
> +	int cpu;
>  
>  	if (!opts) {
>  		ti->error = "Digest algorithm missing for ESSIV mode";
> @@ -265,30 +360,22 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
>  		goto bad;
>  	}
>  
> -	/* Allocate essiv_tfm */
> -	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
> -	if (IS_ERR(essiv_tfm)) {
> -		ti->error = "Error allocating crypto tfm for ESSIV";
> -		err = PTR_ERR(essiv_tfm);
> -		goto bad;
> -	}
> -	if (crypto_cipher_blocksize(essiv_tfm) !=
> -	    crypto_ablkcipher_ivsize(cc->tfm)) {
> -		ti->error = "Block size of ESSIV cipher does "
> -			    "not match IV size of block cipher";
> -		err = -EINVAL;
> -		goto bad;
> -	}
> -
>  	cc->iv_gen_private.essiv.salt = salt;
> -	cc->iv_gen_private.essiv.tfm = essiv_tfm;
>  	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
>  
> +	for_each_possible_cpu (cpu) {
> +		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
> +					crypto_hash_digestsize(hash_tfm));
> +		if (IS_ERR(essiv_tfm)) {
> +			kfree(salt);
> +			crypt_iv_essiv_dtr(cc);
> +			return PTR_ERR(essiv_tfm);
> +		}
> +		per_cpu_ptr(cc->cpu, cpu)->ie.tfm = essiv_tfm;
> +	}
>  	return 0;
>  
>  bad:
> -	if (essiv_tfm && !IS_ERR(essiv_tfm))
> -		crypto_free_cipher(essiv_tfm);
>  	if (hash_tfm && !IS_ERR(hash_tfm))
>  		crypto_free_hash(hash_tfm);
>  	kfree(salt);
> @@ -299,14 +386,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
>  {
>  	memset(iv, 0, cc->iv_size);
>  	*(u64 *)iv = cpu_to_le64(sector);
> -	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
> +	crypto_cipher_encrypt_one(crypt_me(cc)->ie.tfm, iv, iv);
>  	return 0;
>  }
>  
>  static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
>  			      const char *opts)
>  {
> -	unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
> +	unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
>  	int log = ilog2(bs);
>  
>  	/* we need to calculate how far we must shift the sector count
> @@ -415,7 +502,7 @@ static int crypt_convert_block(struct crypt_config *cc,
>  
>  	dmreq = dmreq_of_req(cc, req);
>  	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
> -			 crypto_ablkcipher_alignmask(cc->tfm) + 1);
> +			 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
>  
>  	dmreq->ctx = ctx;
>  	sg_init_table(&dmreq->sg_in, 1);
> @@ -460,13 +547,14 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
>  static void crypt_alloc_req(struct crypt_config *cc,
>  			    struct convert_context *ctx)
>  {
> -	if (!cc->req)
> -		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
> -	ablkcipher_request_set_tfm(cc->req, cc->tfm);
> -	ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
> +	struct crypt_cpu *cs = crypt_me(cc);
> +	if (!cs->req)
> +		cs->req = mempool_alloc(cc->req_pool, GFP_NOIO);
> +	ablkcipher_request_set_tfm(cs->req, cs->tfm);
> +	ablkcipher_request_set_callback(cs->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
>  					CRYPTO_TFM_REQ_MAY_SLEEP,
>  					kcryptd_async_done,
> -					dmreq_of_req(cc, cc->req));
> +					dmreq_of_req(cc, cs->req));
>  }
>  
>  /*
> @@ -475,6 +563,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
>  static int crypt_convert(struct crypt_config *cc,
>  			 struct convert_context *ctx)
>  {
> +	struct crypt_cpu *cs = crypt_me(cc);
>  	int r;
>  
>  	atomic_set(&ctx->pending, 1);
> @@ -486,7 +575,7 @@ static int crypt_convert(struct crypt_config *cc,
>  
>  		atomic_inc(&ctx->pending);
>  
> -		r = crypt_convert_block(cc, ctx, cc->req);
> +		r = crypt_convert_block(cc, ctx, cs->req);
>  
>  		switch (r) {
>  		/* async */
> @@ -495,7 +584,7 @@ static int crypt_convert(struct crypt_config *cc,
>  			INIT_COMPLETION(ctx->restart);
>  			/* fall through*/
>  		case -EINPROGRESS:
> -			cc->req = NULL;
> +			cs->req = NULL;
>  			ctx->sector++;
>  			continue;
>  
> @@ -654,6 +743,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
>   * They must be separated as otherwise the final stages could be
>   * starved by new requests which can block in the first stages due
>   * to memory allocation.
> + *
> + * The work is done per CPU, globally for all dmcrypt instances.
> + * They should not depend on each other and do not block.
>   */
>  static void crypt_endio(struct bio *clone, int error)
>  {
> @@ -735,6 +827,7 @@ static void kcryptd_io(struct work_struct *work)
>  {
>  	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
>  
> +	__get_cpu_var(io_wq_cpu) = current;
>  	if (bio_data_dir(io->base_bio) == READ)
>  		kcryptd_io_read(io);
>  	else
> @@ -743,10 +836,23 @@ static void kcryptd_io(struct work_struct *work)
>  
>  static void kcryptd_queue_io(struct dm_crypt_io *io)
>  {
> -	struct crypt_config *cc = io->target->private;
> +	int cpu;
> +
> +	/*
> +	 * Since we only have a single worker per CPU in extreme
> +	 * cases there might be nesting (dm-crypt on another dm-crypt)
> +	 * To avoid deadlock run the work directly then.
> +	 */
> +	cpu = get_cpu();
> +	if (per_cpu(io_wq_cpu, cpu) == current && !in_interrupt()) {
> +		put_cpu();
> +		kcryptd_io(&io->work);
> +		return;
> +	}
>  
>  	INIT_WORK(&io->work, kcryptd_io);
> -	queue_work(cc->io_queue, &io->work);
> +	queue_work(io_workqueue, &io->work);
> +	put_cpu();
>  }
>  
>  static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
> @@ -927,10 +1033,8 @@ static void kcryptd_crypt(struct work_struct *work)
>  
>  static void kcryptd_queue_crypt(struct dm_crypt_io *io)
>  {
> -	struct crypt_config *cc = io->target->private;
> -
>  	INIT_WORK(&io->work, kcryptd_crypt);
> -	queue_work(cc->crypt_queue, &io->work);
> +	queue_work(crypt_workqueue, &io->work);
>  }
>  
>  /*
> @@ -974,6 +1078,21 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
>  	}
>  }
>  
> +static int crypt_setkey_allcpus(struct crypt_config *cc)
> +{
> +	int cpu, n, err;
> +
> +	err = 0;
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		n = crypto_ablkcipher_setkey(cs->tfm, cc->key, cc->key_size);
> +		if (n)
> +			err = n;
> +	}
> +	return err;
> +}
> +
> +
>  static int crypt_set_key(struct crypt_config *cc, char *key)
>  {
>  	unsigned key_size = strlen(key) >> 1;
> @@ -989,14 +1108,48 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
>  
>  	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
>  
> -	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
> +	return crypt_setkey_allcpus(cc);
>  }
>  
>  static int crypt_wipe_key(struct crypt_config *cc)
>  {
>  	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
>  	memset(&cc->key, 0, cc->key_size * sizeof(u8));
> -	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
> +	return crypt_setkey_allcpus(cc);
> +}
> +
> +/* Use a global per-CPU encryption workqueue for all mounts */
> +static int crypt_create_workqueues(void)
> +{
> +	int ret = 0;
> +
> +	/* Module unload cleans up on error */
> +	mutex_lock(&queue_setup_lock);
> +	if (!crypt_workqueue) {
> +		crypt_workqueue = create_workqueue("dmcrypt");
> +		if (!crypt_workqueue)
> +			ret = -ENOMEM;
> +	}
> +	if (!io_workqueue) {
> +		io_workqueue = create_workqueue("dmcrypt-io");
> +		if (!io_workqueue)
> +			ret = -ENOMEM;
> +	}
> +	mutex_unlock(&queue_setup_lock);
> +	return ret;
> +}
> +
> +static void crypt_dtr_cpus(struct crypt_config *cc)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		if (cs->tfm) {
> +			crypto_free_ablkcipher(cs->tfm);
> +			cs->tfm = NULL;
> +		}
> +	}
>  }
>  
>  /*
> @@ -1014,6 +1167,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	char *ivopts;
>  	unsigned int key_size;
>  	unsigned long long tmpll;
> +	int cpu;
>  
>  	if (argc != 5) {
>  		ti->error = "Not enough arguments";
> @@ -1038,7 +1192,13 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		return -ENOMEM;
>  	}
>  
> -	/* Compatibility mode for old dm-crypt cipher strings */
> +	cc->cpu = alloc_percpu(struct crypt_cpu);
> +	if (!cc->cpu) {
> +		ti->error = "Cannot allocate per cpu state";
> +		goto bad_percpu;
> +	}
> +
> +	/* Compatibility mode for old dm-crypt cipher strings */
>  	if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
>  		chainmode = "cbc";
>  		ivmode = "plain";
> @@ -1055,15 +1215,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto bad_cipher;
>  	}
>  
> -	tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
> -	if (IS_ERR(tfm)) {
> -		ti->error = "Error allocating crypto tfm";
> -		goto bad_cipher;
> +	for_each_possible_cpu (cpu) {
> +		tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
> +		if (IS_ERR(tfm)) {
> +			ti->error = "Error allocating crypto tfm";
> +			goto bad_ivmode;
> +		}
> +		per_cpu_ptr(cc->cpu, cpu)->tfm = tfm;
>  	}
> +	tfm = any_tfm(cc);
>  
>  	strcpy(cc->cipher, cipher);
>  	strcpy(cc->chainmode, chainmode);
> -	cc->tfm = tfm;
>  
>  	if (crypt_set_key(cc, argv[1]) < 0) {
>  		ti->error = "Error decoding and setting key";
> @@ -1134,7 +1297,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		ti->error = "Cannot allocate crypt request mempool";
>  		goto bad_req_pool;
>  	}
> -	cc->req = NULL;
>  
>  	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
>  	if (!cc->page_pool) {
> @@ -1148,6 +1310,14 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  		goto bad_bs;
>  	}
>  
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		if (crypto_ablkcipher_setkey(cs->tfm, cc->key, key_size) < 0) {
> +			ti->error = "Error setting key";
> +			goto bad_device;
> +		}
> +	}
> +
>  	if (sscanf(argv[2], "%llu", &tmpll) != 1) {
>  		ti->error = "Invalid iv_offset sector";
>  		goto bad_device;
> @@ -1177,25 +1347,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>  	} else
>  		cc->iv_mode = NULL;
>  
> -	cc->io_queue = create_singlethread_workqueue("kcryptd_io");
> -	if (!cc->io_queue) {
> -		ti->error = "Couldn't create kcryptd io queue";
> -		goto bad_io_queue;
> -	}
> -
> -	cc->crypt_queue = create_singlethread_workqueue("kcryptd");
> -	if (!cc->crypt_queue) {
> -		ti->error = "Couldn't create kcryptd queue";
> -		goto bad_crypt_queue;
> +	if (crypt_create_workqueues() < 0) {
> +		ti->error = "Couldn't create kcrypt work queues";
> +		goto bad_queues;
>  	}
>  
>  	ti->num_flush_requests = 1;
>  	ti->private = cc;
>  	return 0;
>  
> -bad_crypt_queue:
> -	destroy_workqueue(cc->io_queue);
> -bad_io_queue:
> +bad_queues:
>  	kfree(cc->iv_mode);
>  bad_ivmode_string:
>  	dm_put_device(ti, cc->dev);
> @@ -1211,8 +1372,10 @@ bad_slab_pool:
>  	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
>  		cc->iv_gen_ops->dtr(cc);
>  bad_ivmode:
> -	crypto_free_ablkcipher(tfm);
> +	crypt_dtr_cpus(cc);
>  bad_cipher:
> +	free_percpu(cc->cpu);
> +bad_percpu:
>  	/* Must zero key material before freeing */
>  	kzfree(cc);
>  	return -EINVAL;
> @@ -1220,13 +1383,14 @@ bad_cipher:
>  
>  static void crypt_dtr(struct dm_target *ti)
>  {
> +	int cpu;
>  	struct crypt_config *cc = (struct crypt_config *) ti->private;
>  
> -	destroy_workqueue(cc->io_queue);
> -	destroy_workqueue(cc->crypt_queue);
> -
> -	if (cc->req)
> -		mempool_free(cc->req, cc->req_pool);
> +	for_each_possible_cpu (cpu) {
> +		struct crypt_cpu *cs = per_cpu_ptr(cc->cpu, cpu);
> +		if (cs->req)
> +			mempool_free(cs->req, cc->req_pool);
> +	}
>  
>  	bioset_free(cc->bs);
>  	mempool_destroy(cc->page_pool);
> @@ -1236,9 +1400,11 @@ static void crypt_dtr(struct dm_target *ti)
>  	kfree(cc->iv_mode);
>  	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
>  		cc->iv_gen_ops->dtr(cc);
> -	crypto_free_ablkcipher(cc->tfm);
> +	crypt_dtr_cpus(cc);
>  	dm_put_device(ti, cc->dev);
>  
> +	free_percpu(cc->cpu);
> +
>  	/* Must zero key material before freeing */
>  	kzfree(cc);
>  }
> @@ -1428,6 +1594,10 @@ static void __exit dm_crypt_exit(void)
>  {
>  	dm_unregister_target(&crypt_target);
>  	kmem_cache_destroy(_crypt_io_pool);
> +	if (crypt_workqueue)
> +		destroy_workqueue(crypt_workqueue);
> +	if (io_workqueue)
> +		destroy_workqueue(io_workqueue);
>  }
>  
>  module_init(dm_crypt_init);
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-01  2:44 ` Mikulas Patocka
@ 2010-06-01  4:39     ` Herbert Xu
  2010-06-01  6:40   ` Andi Kleen
  1 sibling, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-01  4:39 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, linux-kernel, agk, ak

On Mon, May 31, 2010 at 10:44:30PM -0400, Mikulas Patocka wrote:
> Questions:
> 
> If you are optimizing it,
> 
> 1) why don't you optimize it in such a way that if one CPU submits 
> requests, the crypto work is spread among all the CPUs? Currently it 
> spreads the work only if different CPUs submit it.

Because the crypto layer already provides that functionality,
through pcrypt.  By instantiating pcrypt for a given algorithm,
you can parallelise that algorithm across CPUs.

This would be inappropriate for upper-layer code, as it does not
know whether the underlying algorithm should be parallelised,
e.g., a PCI offload board certainly should not be parallelised.

> 2) why not optimize the software async crypto daemon (crypto/cryptd.c) instead
> of dm-crypt, so that all kernel subsystems can actually take advantage of
> those multi-CPU optimizations, not just dm-crypt?

Because you cannot do what Andi is doing here in the crypto layer.
What dm-crypt does today (which hasn't always been the case BTW)
hides information away (the original submitting CPU) that we cannot
recreate.
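
(Concretely: the single thread that the current driver creates with
create_singlethread_workqueue("kcryptd") is where the submitting-CPU
information gets thrown away.)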

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-01  2:44 ` Mikulas Patocka
  2010-06-01  4:39     ` Herbert Xu
@ 2010-06-01  6:40   ` Andi Kleen
  2010-06-02  5:15     ` Mikulas Patocka
  1 sibling, 1 reply; 40+ messages in thread
From: Andi Kleen @ 2010-06-01  6:40 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, herbert, linux-kernel, agk

On Tue, 1 Jun 2010, Mikulas Patocka wrote:
> Questions:
>
> If you are optimizing it,
>
> 1) why don't you optimize it in such a way that if one CPU submits
> requests, the crypto work is spread among all the CPUs? Currently it
> spreads the work only if different CPUs submit it.

This case is only useful with very slow CPUs and is handled by pcrypt
in theory (but I haven't tested it).

>
> 2) why not optimize the software async crypto daemon (crypto/cryptd.c) instead
> of dm-crypt, so that all kernel subsystems can actually take advantage of
> those multi-CPU optimizations, not just dm-crypt?

Normally most subsystems are multi-CPU already, unless they limit
themselves artificially like dm-crypt.

For dm-crypt it would be wasteful to funnel everything through two
single-CPU threads just to spread it out again. That is why I also used
per-CPU IO threads.

-Andi



^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-01  4:39     ` Herbert Xu
  (?)
@ 2010-06-02  5:10     ` Mikulas Patocka
  2010-06-02  5:14         ` Herbert Xu
  -1 siblings, 1 reply; 40+ messages in thread
From: Mikulas Patocka @ 2010-06-02  5:10 UTC (permalink / raw)
  To: Herbert Xu; +Cc: device-mapper development, linux-kernel, agk, ak



On Tue, 1 Jun 2010, Herbert Xu wrote:

> On Mon, May 31, 2010 at 10:44:30PM -0400, Mikulas Patocka wrote:
> > Questions:
> > 
> > If you are optimizing it,
> > 
> > 1) why don't you optimize it in such a way that if one CPU submits 
> > requests, the crypto work is spread among all the CPUs? Currently it 
> > spreads the work only if different CPUs submit it.
> 
> Because the crypto layer already provides that functionality,
> through pcrypt.  By instantiating pcrypt for a given algorithm,
> you can parallelise that algorithm across CPUs.

And how can I use pcrypt for dm-crypt? After a quick look at the pcrypt
sources, it seems to depend on aead and not to be usable for general
encryption algorithms at all.

I tried cryptd --- in theory it should work by requesting the algorithm 
like cryptd(cbc(aes)) --- but if I replace "%s(%s)" with "cryptd(%s(%s))" 
in dm-crypt sources it locks up and doesn't work.

> This would be inappropriate for upper-layer code, as it does not
> know whether the underlying algorithm should be parallelised,
> e.g., a PCI offload board certainly should not be parallelised.

The upper layer should ideally request "cbc(aes)" and the crypto routine
should select the most efficient implementation --- sync on a single-core
system, async with cryptd on a multi-core system, and async with a hardware
implementation if you have a HIFN crypto card.

> > 2) why not optimize the software async crypto daemon (crypto/cryptd.c) instead
> > of dm-crypt, so that all kernel subsystems can actually take advantage of
> > those multi-CPU optimizations, not just dm-crypt?
> 
> Because you cannot do what Andi is doing here in the crypto layer.
> What dm-crypt does today (which hasn't always been the case BTW)
> hides information away (the original submitting CPU) that we cannot
> recreate.

It is pointless to track the submitting CPU.

The majority of the time is consumed by raw encryption/decryption, and you
must optimize that --- i.e. on an SMP system make sure that cryptd
distributes the work across all available cores.

When you get this right --- i.e. when reading an encrypted disk you get
either read speed equivalent to a non-encrypted disk or all the cores
saturated --- then you can start thinking about other optimizations.

Mikulas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:10     ` Mikulas Patocka
@ 2010-06-02  5:14         ` Herbert Xu
  0 siblings, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-02  5:14 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, linux-kernel, agk, ak

On Wed, Jun 02, 2010 at 01:10:00AM -0400, Mikulas Patocka wrote:
>
> And how can I use pcrypt for dm-crypt? After a quick look at the pcrypt
> sources, it seems to depend on aead and not to be usable for general
> encryption algorithms at all.

You instantiate a pcrypt variant of whatever algorithm that you're
using.  For example, if you're using XTS then you should instantiate
pcrypt(xts(aes)).  Currently you must use tcrypt to instantiate.
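
(For an AEAD that is, following the padata documentation of the time,
something like

	modprobe tcrypt alg="pcrypt(rfc4106(gcm(aes)))" type=3

--- the equivalent for a plain blkcipher is hypothetical until the
ablkcipher support discussed below exists.)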

> I tried cryptd --- in theory it should work by requesting the algorithm 
> like cryptd(cbc(aes)) --- but if I replace "%s(%s)" with "cryptd(%s(%s))" 
> in dm-crypt sources it locks up and doesn't work.

cryptd is something else altogether.  However, it certainly should
not lock up.  What kernel version is this?

> > This would be inappropriate for upper-layer code, as it does not
> > know whether the underlying algorithm should be parallelised,
> > e.g., a PCI offload board certainly should not be parallelised.
> 
> The upper layer should ideally request "cbc(aes)" and the crypto routine
> should select the most efficient implementation --- sync on a single-core
> system, async with cryptd on a multi-core system, and async with a hardware
> implementation if you have a HIFN crypto card.

That's exactly what will happen when the admin instantiates pcrypt.
dm-crypt simply needs to specify cbc(aes) and it will get pcrypt
automatically.

The point is that on a modern processor like Nehalem you don't need
pcrypt.
 
> It is pointless to track the submitting CPU.

No you are wrong.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-01  6:40   ` Andi Kleen
@ 2010-06-02  5:15     ` Mikulas Patocka
  2010-06-02  5:20         ` Herbert Xu
  2010-06-02  7:53       ` Andi Kleen
  0 siblings, 2 replies; 40+ messages in thread
From: Mikulas Patocka @ 2010-06-02  5:15 UTC (permalink / raw)
  To: Andi Kleen; +Cc: device-mapper development, herbert, linux-kernel, agk



On Tue, 1 Jun 2010, Andi Kleen wrote:

> On Tue, 1 Jun 2010, Mikulas Patocka wrote:
> > Questions:
> > 
> > If you are optimizing it,
> > 
> > 1) why don't you optimize it in such a way that if one CPU submits
> > requests, the crypto work is spread among all the CPUs? Currently it
> > spreads the work only if different CPUs submit it.
> 
> This case is only useful with very slow CPUs and is handled by pcrypt
> in theory (but I haven't tested it).

Almost every CPU is "very slow" in the sense that it lags behind the disk
when encrypting. CPUs with hardware AES may be the exception.

> > 2) why not optimize the software async crypto daemon (crypto/cryptd.c) instead
> > of dm-crypt, so that all kernel subsystems can actually take advantage of
> > those multi-CPU optimizations, not just dm-crypt?
> 
> Normally most subsystems are multi-CPU already, unless they limit
> themselves artificially like dm-crypt.
> 
> For dm-crypt it would be wasteful to funnel everything through two
> single-CPU threads just to spread it out again. That is why I also used
> per-CPU IO threads.

If one CPU submits I/O for 10MB of data, your patch provides no
parallelization at all, because all those 10MB will be encrypted by the
same CPU that submitted them.

> -Andi

Mikulas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:15     ` Mikulas Patocka
@ 2010-06-02  5:20         ` Herbert Xu
  2010-06-02  7:53       ` Andi Kleen
  1 sibling, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-02  5:20 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Andi Kleen, device-mapper development, linux-kernel, agk

On Wed, Jun 02, 2010 at 01:15:43AM -0400, Mikulas Patocka wrote:
>
> Almost every CPU is "very slow" in the sense that it lags behind the disk
> when encrypting. CPUs with hardware AES may be the exception.

I would not call a platform like Nehalem the exception.

> If one CPU submits I/O for 10MB of data, your patch provides no
> parallelization at all, because all those 10MB will be encrypted by the
> same CPU that submitted them.

He doesn't need to.  This is already solved by pcrypt.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:14         ` Herbert Xu
  (?)
@ 2010-06-02  5:25         ` Mikulas Patocka
  2010-06-02  6:07             ` Herbert Xu
  2010-06-02  7:51           ` Andi Kleen
  -1 siblings, 2 replies; 40+ messages in thread
From: Mikulas Patocka @ 2010-06-02  5:25 UTC (permalink / raw)
  To: Herbert Xu; +Cc: device-mapper development, linux-kernel, agk, ak



On Wed, 2 Jun 2010, Herbert Xu wrote:

> On Wed, Jun 02, 2010 at 01:10:00AM -0400, Mikulas Patocka wrote:
> >
> > And how can I use pcrypt for dm-crypt? After a quick look at pcrypt 
> > sources, it seems to be dependent on aead and not useable for general 
> > encryption algorithms at all.
> 
> You instantiate a pcrypt variant of whatever algorithm that you're
> using.  For example, if you're using XTS then you should instantiate
> pcrypt(xts(aes)).  Currently you must use tcrypt to instantiate.

I tried "pcrypt(%s(%s))" in dm-crypt and I get "table: 253:0: crypt: Error 
allocating crypto tfm"

Are you sure that you know what you're talking about? pcrypt_alloc 
contains this:
switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
case CRYPTO_ALG_TYPE_AEAD:
        return pcrypt_alloc_aead(tb);
}

return ERR_PTR(-EINVAL);
--- so for anything other than AEAD it returns -EINVAL.

> > I tried cryptd --- in theory it should work by requesting the algorithm 
> > like cryptd(cbc(aes)) --- but if I replace "%s(%s)" with "cryptd(%s(%s))" 
> > in dm-crypt sources it locks up and doesn't work.
> 
> cryptd is something else altogether.  However, it certainly should
> not lock up.  What kernel version is this?

2.6.34 and cryptsetup 1.1.2. It is a soft lockup interruptible with 
Ctrl-C.

> > > This would be inappropriate for upper layer code as they do not
> > > know whether the underlying algorithm should be parallelised,
> > > e.g., a PCI offload board certainly should not be parallelised.
> > 
> > The upper layer should ideally request "cbc(aes)" and the crypto routine 
> > should select the most efficient implementation --- sync on single-core 
> > system, async with cryptd on multi-core system and async with hardware 
> > implementation if you have HIFN crypto card.
> 
> That's exactly what will happen when the admin instantiates pcrypt.
> dm-crypt simply needs to specify cbc(aes) and it will get pcrypt
> automatically.
> 
> The point is that on a modern processor like Nehalem you don't need
> pcrypt.
>  
> > It is pointless to track the submitting CPU.
> 
> No you are wrong.

For what? For avoiding cache bounces? But the encryption is an order of
magnitude slower than memory speed.

Mikulas

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:25         ` Mikulas Patocka
@ 2010-06-02  6:07             ` Herbert Xu
  2010-06-02  7:51           ` Andi Kleen
  1 sibling, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-02  6:07 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, linux-kernel, agk, ak

On Wed, Jun 02, 2010 at 01:25:11AM -0400, Mikulas Patocka wrote:
>
> Are you sure that you know what you're talking about? pcrypt_alloc 
> contains this:
> switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
> case CRYPTO_ALG_TYPE_AEAD:
>         return pcrypt_alloc_aead(tb);
> }
> 
> return ERR_PTR(-EINVAL);
> --- so for anything other than AEAD it returns -EINVAL.

So someone needs to write the ablkcipher plugins for pcrypt.

This is the whole point of pcrypt, to implement parallelisation
exactly once, in the crypto layer.

> For what? For avoiding cache bounces? But the encryption is an order of
> magnitude slower than memory speed.

Have you benchmarked the Nehalem AES performance lately?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  6:07             ` Herbert Xu
@ 2010-06-02  6:30               ` Steffen Klassert
  -1 siblings, 0 replies; 40+ messages in thread
From: Steffen Klassert @ 2010-06-02  6:30 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Mikulas Patocka, device-mapper development, linux-kernel, agk, ak

On Wed, Jun 02, 2010 at 04:07:02PM +1000, Herbert Xu wrote:
> On Wed, Jun 02, 2010 at 01:25:11AM -0400, Mikulas Patocka wrote:
> >
> > Are you sure that you know what you're talking about? pcrypt_alloc 
> > contains this:
> > switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) {
> > case CRYPTO_ALG_TYPE_AEAD:
> >         return pcrypt_alloc_aead(tb);
> > }
> > 
> > return ERR_PTR(-EINVAL);
> > --- so for anything other than AEAD it returns -EINVAL.
> 
> So someone needs to write the ablkcipher plugins for pcrypt.
> 

I did that already for one of the older (remote softirq based)
versions of pcrypt. I just never pushed it out because I was focused
on IPsec, and the softirq-based version was not that useful for
dm-crypt. Now that we use workqueues internally, dm-crypt could
benefit from pcrypt too. Let me see whether I can respin the old
ablkcipher plugin.

Steffen

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  6:30               ` Steffen Klassert
@ 2010-06-02  7:07                 ` Herbert Xu
  0 siblings, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-02  7:07 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Mikulas Patocka, device-mapper development, linux-kernel, agk, ak

On Wed, Jun 02, 2010 at 08:30:33AM +0200, Steffen Klassert wrote:
> 
> I did that already for one of the older (remote softirq based)
> version of pcrypt. I just never pushed it out because I was focused
> on IPsec and the softirq based version I was not that usefull for
> dm-crypt. Now, that we use workqueues internally, dm-crypt could
> benefit from pcrypt too. Let me see whether I can respin the old
> ablkcipher plugin.

Thanks Steffen!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:25         ` Mikulas Patocka
  2010-06-02  6:07             ` Herbert Xu
@ 2010-06-02  7:51           ` Andi Kleen
  1 sibling, 0 replies; 40+ messages in thread
From: Andi Kleen @ 2010-06-02  7:51 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Herbert Xu, device-mapper development, linux-kernel, agk


>>
>>> It is pointless to track the submitting CPU.
>>
>> No you are wrong.
>
> For what? For avoiding cache bounces? But the encrypting is
> order-of-magnitude slower than memory speed.

On a system with reasonably fast CPUs it's fastest to just stay
on your current CPU: don't talk to other CPUs, avoid
communication, and get the work done as soon as possible. But make
sure you can do this on multiple CPUs at the same time.

With AES-NI this becomes even more pronounced because it effectively
makes the CPU faster for encryption.
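
As a sketch of that idea in code (not the patch verbatim, but using its
global crypt_workqueue; queue_work_on() pins the work item to the chosen
CPU):

	/* queue the crypto work on the submitting CPU, where the bio's
	 * data is most likely still cache-hot */
	static void kcryptd_queue_crypt(struct dm_crypt_io *io)
	{
		int cpu;

		INIT_WORK(&io->work, kcryptd_crypt);
		cpu = get_cpu();
		queue_work_on(cpu, crypt_workqueue, &io->work);
		put_cpu();
	}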

-andi

* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  5:15     ` Mikulas Patocka
  2010-06-02  5:20         ` Herbert Xu
@ 2010-06-02  7:53       ` Andi Kleen
  2010-06-02 10:20           ` Herbert Xu
  1 sibling, 1 reply; 40+ messages in thread
From: Andi Kleen @ 2010-06-02  7:53 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, herbert, linux-kernel, agk


> Almost every CPU is "very slow" in the sense that it lags behind the
> disk when encrypting. CPUs with hardware AES may be the exception.

Behind disk??

You're not serious?

-Andi


* Re: [dm-devel] [PATCH] DM-CRYPT: Scale to multiple CPUs
  2010-06-02  7:53       ` Andi Kleen
@ 2010-06-02 10:20           ` Herbert Xu
  0 siblings, 0 replies; 40+ messages in thread
From: Herbert Xu @ 2010-06-02 10:20 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Mikulas Patocka, device-mapper development, linux-kernel, agk

On Wed, Jun 02, 2010 at 09:53:15AM +0200, Andi Kleen wrote:
>
> Behind disk??
>
> You're not serious?

Well my $20 wireless router can't perform AES and keep up with
the disk, but then it doesn't have two CPUs either :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

Thread overview: 40+ messages
2010-05-31 16:04 [PATCH] DM-CRYPT: Scale to multiple CPUs Andi Kleen
2010-05-31 16:34 ` Eric Dumazet
2010-05-31 17:46   ` Andi Kleen
2010-05-31 17:52     ` Eric Dumazet
2010-05-31 17:22 ` [dm-devel] " Milan Broz
2010-05-31 17:42   ` Andi Kleen
2010-05-31 18:10     ` Milan Broz
2010-05-31 18:27       ` Andi Kleen
2010-05-31 18:55         ` Milan Broz
2010-05-31 19:04           ` Andi Kleen
2010-05-31 22:07             ` Herbert Xu
2010-06-01  1:46               ` huang ying
2010-06-01  2:39         ` Mikulas Patocka
2010-06-01  2:44 ` Mikulas Patocka
2010-06-01  4:39   ` Herbert Xu
2010-06-02  5:10     ` Mikulas Patocka
2010-06-02  5:14       ` Herbert Xu
2010-06-02  5:25         ` Mikulas Patocka
2010-06-02  6:07           ` Herbert Xu
2010-06-02  6:30             ` Steffen Klassert
2010-06-02  7:07               ` Herbert Xu
2010-06-02  7:51           ` Andi Kleen
2010-06-01  6:40   ` Andi Kleen
2010-06-02  5:15     ` Mikulas Patocka
2010-06-02  5:20       ` Herbert Xu
2010-06-02  7:53       ` Andi Kleen
2010-06-02 10:20         ` Herbert Xu
