All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrey Smirnov <andrew.smirnov@gmail.com>
To: linux-crypto@vger.kernel.org
Cc: "Andrey Smirnov" <andrew.smirnov@gmail.com>,
	"Chris Healy" <cphealy@gmail.com>,
	"Lucas Stach" <l.stach@pengutronix.de>,
	"Horia Geantă" <horia.geanta@nxp.com>,
	"Herbert Xu" <herbert@gondor.apana.org.au>,
	"Iuliana Prodan" <iuliana.prodan@nxp.com>,
	linux-kernel@vger.kernel.org, linux-imx@nxp.com
Subject: [PATCH v7 5/9] crypto: caam - simplify RNG implementation
Date: Mon, 27 Jan 2020 08:56:42 -0800	[thread overview]
Message-ID: <20200127165646.19806-6-andrew.smirnov@gmail.com> (raw)
In-Reply-To: <20200127165646.19806-1-andrew.smirnov@gmail.com>

Rework CAAM RNG implementation as follows:

- Make use of the fact that HWRNG supports partial reads and will
handle such cases gracefully by removing recursion in caam_read()

- Convert blocking caam_read() codepath to do a single blocking job
read directly into requested buffer, bypassing any intermediary
buffers

- Convert async caam_read() codepath into a simple single
reader/single writer FIFO use-case, thus simplifying concurrency
handling and delegating buffer read/write position management to KFIFO
subsystem.

- Leverage the same low level RNG data extraction code for both async
and blocking caam_read() scenarios, get rid of the shared job
descriptor and make non-shared one as a simple as possible (just
HEADER + ALGORITHM OPERATION + FIFO STORE)

- Split private context from DMA related memory, so that the former
could be allocated without GFP_DMA.

NOTE: On its face value this commit decreased throughput numbers
reported by

  dd if=/dev/hwrng of=/dev/null bs=1 count=100K [iflag=nonblock]

by about 15%, however commits that enable prediction resistance and
limit JR total size impact the performance so much and move the
bottleneck such as to make this regression irrelevant.

NOTE: On the bright side, this commit reduces RNG in kernel DMA buffer
memory usage from 2 x RN_BUF_SIZE (~256K) to 32K.

Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Cc: Chris Healy <cphealy@gmail.com>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Horia Geantă <horia.geanta@nxp.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Iuliana Prodan <iuliana.prodan@nxp.com>
Cc: linux-crypto@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-imx@nxp.com
---
 drivers/crypto/caam/caamrng.c | 321 ++++++++++++----------------------
 1 file changed, 107 insertions(+), 214 deletions(-)

diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 47b15c25b66f..cb498186b9b9 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -7,35 +7,12 @@
  *
  * Based on caamalg.c crypto API driver.
  *
- * relationship between job descriptors to shared descriptors:
- *
- * ---------------                     --------------
- * | JobDesc #0  |-------------------->| ShareDesc  |
- * | *(buffer 0) |      |------------->| (generate) |
- * ---------------      |              | (move)     |
- *                      |              | (store)    |
- * ---------------      |              --------------
- * | JobDesc #1  |------|
- * | *(buffer 1) |
- * ---------------
- *
- * A job desc looks like this:
- *
- * ---------------------
- * | Header            |
- * | ShareDesc Pointer |
- * | SEQ_OUT_PTR       |
- * | (output buffer)   |
- * ---------------------
- *
- * The SharedDesc never changes, and each job descriptor points to one of two
- * buffers for each device, from which the data will be copied into the
- * requested destination
  */
 
 #include <linux/hw_random.h>
 #include <linux/completion.h>
 #include <linux/atomic.h>
+#include <linux/kfifo.h>
 
 #include "compat.h"
 
@@ -45,38 +22,34 @@
 #include "jr.h"
 #include "error.h"
 
+/* length of descriptors */
+#define CAAM_RNG_MAX_FIFO_STORE_SIZE	U16_MAX
+
+#define CAAM_RNG_FIFO_LEN		SZ_32K /* Must be a multiple of 2 */
+
 /*
- * Maximum buffer size: maximum number of random, cache-aligned bytes that
- * will be generated and moved to seq out ptr (extlen not allowed)
+ * See caam_init_desc()
  */
-#define RN_BUF_SIZE			(0xffff / L1_CACHE_BYTES * \
-					 L1_CACHE_BYTES)
+#define CAAM_RNG_DESC_LEN (CAAM_CMD_SZ +				\
+			   CAAM_CMD_SZ +				\
+			   CAAM_CMD_SZ + CAAM_PTR_SZ_MAX)
 
-/* length of descriptors */
-#define DESC_JOB_O_LEN			(CAAM_CMD_SZ * 2 + CAAM_PTR_SZ_MAX * 2)
-#define DESC_RNG_LEN			(3 * CAAM_CMD_SZ)
-
-/* Buffer, its dma address and lock */
-struct buf_data {
-	u8 buf[RN_BUF_SIZE] ____cacheline_aligned;
-	dma_addr_t addr;
-	struct completion filled;
-	u32 hw_desc[DESC_JOB_O_LEN];
-#define BUF_NOT_EMPTY 0
-#define BUF_EMPTY 1
-#define BUF_PENDING 2  /* Empty, but with job pending --don't submit another */
-	atomic_t empty;
+typedef u8 caam_rng_desc[CAAM_RNG_DESC_LEN];
+
+enum {
+	DESC_ASYNC,
+	DESC_SYNC,
+	DESC_NUM,
 };
 
 /* rng per-device context */
 struct caam_rng_ctx {
 	struct hwrng rng;
 	struct device *jrdev;
-	dma_addr_t sh_desc_dma;
-	u32 sh_desc[DESC_RNG_LEN];
-	unsigned int cur_buf_idx;
-	int current_buf;
-	struct buf_data bufs[2];
+	struct device *ctrldev;
+	caam_rng_desc *desc;
+	struct work_struct worker;
+	struct kfifo fifo;
 };
 
 static struct caam_rng_ctx *to_caam_rng_ctx(struct hwrng *r)
@@ -84,228 +57,146 @@ static struct caam_rng_ctx *to_caam_rng_ctx(struct hwrng *r)
 	return container_of(r, struct caam_rng_ctx, rng);
 }
 
-static inline void rng_unmap_buf(struct device *jrdev, struct buf_data *bd)
+static void caam_rng_done(struct device *jrdev, u32 *desc, u32 err,
+			  void *context)
 {
-	if (bd->addr)
-		dma_unmap_single(jrdev, bd->addr, RN_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-}
-
-static inline void rng_unmap_ctx(struct caam_rng_ctx *ctx)
-{
-	struct device *jrdev = ctx->jrdev;
-
-	if (ctx->sh_desc_dma)
-		dma_unmap_single(jrdev, ctx->sh_desc_dma,
-				 desc_bytes(ctx->sh_desc), DMA_TO_DEVICE);
-	rng_unmap_buf(jrdev, &ctx->bufs[0]);
-	rng_unmap_buf(jrdev, &ctx->bufs[1]);
-}
-
-static void rng_done(struct device *jrdev, u32 *desc, u32 err, void *context)
-{
-	struct buf_data *bd;
-
-	bd = container_of(desc, struct buf_data, hw_desc[0]);
+	struct completion *done = context;
 
 	if (err)
 		caam_jr_strstatus(jrdev, err);
 
-	atomic_set(&bd->empty, BUF_NOT_EMPTY);
-	complete(&bd->filled);
-
-	/* Buffer refilled, invalidate cache */
-	dma_sync_single_for_cpu(jrdev, bd->addr, RN_BUF_SIZE, DMA_FROM_DEVICE);
-
-	print_hex_dump_debug("rng refreshed buf@: ", DUMP_PREFIX_ADDRESS, 16, 4,
-			     bd->buf, RN_BUF_SIZE, 1);
+	complete(done);
 }
 
-static inline int submit_job(struct caam_rng_ctx *ctx, int to_current)
+static u32 *caam_init_desc(u32 *desc, dma_addr_t dst_dma, int len)
 {
-	struct buf_data *bd = &ctx->bufs[!(to_current ^ ctx->current_buf)];
-	struct device *jrdev = ctx->jrdev;
-	u32 *desc = bd->hw_desc;
-	int err;
+	init_job_desc(desc, 0);	/* + 1 cmd_sz */
+	/* Generate random bytes: + 1 cmd_sz */
+	append_operation(desc, OP_ALG_ALGSEL_RNG | OP_TYPE_CLASS1_ALG);
+	/* Store bytes */
+	append_fifo_store(desc, dst_dma, len, FIFOST_TYPE_RNGSTORE);
 
-	dev_dbg(jrdev, "submitting job %d\n", !(to_current ^ ctx->current_buf));
-	init_completion(&bd->filled);
-	err = caam_jr_enqueue(jrdev, desc, rng_done, ctx);
-	if (err)
-		complete(&bd->filled); /* don't wait on failed job*/
-	else
-		atomic_inc(&bd->empty); /* note if pending */
+	print_hex_dump_debug("rng job desc@: ", DUMP_PREFIX_ADDRESS,
+			     16, 4, desc, desc_bytes(desc), 1);
 
-	return err;
+	return desc;
 }
 
-static int caam_read(struct hwrng *rng, void *data, size_t max, bool wait)
+static int caam_rng_read_one(struct device *jrdev,
+			     void *dst, int len,
+			     void *desc,
+			     struct completion *done)
 {
-	struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
-	struct buf_data *bd = &ctx->bufs[ctx->current_buf];
-	int next_buf_idx, copied_idx;
+	dma_addr_t dst_dma;
 	int err;
 
-	if (atomic_read(&bd->empty)) {
-		/* try to submit job if there wasn't one */
-		if (atomic_read(&bd->empty) == BUF_EMPTY) {
-			err = submit_job(ctx, 1);
-			/* if can't submit job, can't even wait */
-			if (err)
-				return 0;
-		}
-		/* no immediate data, so exit if not waiting */
-		if (!wait)
-			return 0;
-
-		/* waiting for pending job */
-		if (atomic_read(&bd->empty))
-			wait_for_completion(&bd->filled);
-	}
-
-	next_buf_idx = ctx->cur_buf_idx + max;
-	dev_dbg(ctx->jrdev, "%s: start reading at buffer %d, idx %d\n",
-		 __func__, ctx->current_buf, ctx->cur_buf_idx);
+	len = min_t(int, len, CAAM_RNG_MAX_FIFO_STORE_SIZE);
 
-	/* if enough data in current buffer */
-	if (next_buf_idx < RN_BUF_SIZE) {
-		memcpy(data, bd->buf + ctx->cur_buf_idx, max);
-		ctx->cur_buf_idx = next_buf_idx;
-		return max;
+	dst_dma = dma_map_single(jrdev, dst, len, DMA_FROM_DEVICE);
+	if (dma_mapping_error(jrdev, dst_dma)) {
+		dev_err(jrdev, "unable to map destination memory\n");
+		return -ENOMEM;
 	}
 
-	/* else, copy what's left... */
-	copied_idx = RN_BUF_SIZE - ctx->cur_buf_idx;
-	memcpy(data, bd->buf + ctx->cur_buf_idx, copied_idx);
-	ctx->cur_buf_idx = 0;
-	atomic_set(&bd->empty, BUF_EMPTY);
-
-	/* ...refill... */
-	submit_job(ctx, 1);
+	init_completion(done);
+	err = caam_jr_enqueue(jrdev,
+			      caam_init_desc(desc, dst_dma, len),
+			      caam_rng_done, done);
+	if (!err)
+		wait_for_completion(done);
 
-	/* and use next buffer */
-	ctx->current_buf = !ctx->current_buf;
-	dev_dbg(ctx->jrdev, "switched to buffer %d\n", ctx->current_buf);
+	dma_unmap_single(jrdev, dst_dma, len, DMA_FROM_DEVICE);
 
-	/* since there already is some data read, don't wait */
-	return copied_idx + caam_read(rng, data + copied_idx,
-				      max - copied_idx, false);
+	return err ?: len;
 }
 
-static inline int rng_create_sh_desc(struct caam_rng_ctx *ctx)
+static void caam_rng_fill_async(struct caam_rng_ctx *ctx)
 {
-	struct device *jrdev = ctx->jrdev;
-	u32 *desc = ctx->sh_desc;
-
-	init_sh_desc(desc, HDR_SHARE_SERIAL);
-
-	/* Generate random bytes */
-	append_operation(desc, OP_ALG_ALGSEL_RNG | OP_TYPE_CLASS1_ALG);
-
-	/* Store bytes */
-	append_seq_fifo_store(desc, RN_BUF_SIZE, FIFOST_TYPE_RNGSTORE);
-
-	ctx->sh_desc_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
-					  DMA_TO_DEVICE);
-	if (dma_mapping_error(jrdev, ctx->sh_desc_dma)) {
-		dev_err(jrdev, "unable to map shared descriptor\n");
-		return -ENOMEM;
-	}
-
-	print_hex_dump_debug("rng shdesc@: ", DUMP_PREFIX_ADDRESS, 16, 4,
-			     desc, desc_bytes(desc), 1);
+	struct scatterlist sg[1];
+	struct completion done;
+	int len, nents;
+
+	sg_init_table(sg, ARRAY_SIZE(sg));
+	nents = kfifo_dma_in_prepare(&ctx->fifo, sg, ARRAY_SIZE(sg),
+				     CAAM_RNG_FIFO_LEN);
+	if (!nents)
+		return;
+
+	len = caam_rng_read_one(ctx->jrdev, sg_virt(&sg[0]),
+				sg[0].length,
+				&ctx->desc[DESC_ASYNC],
+				&done);
+	if (len < 0)
+		return;
+
+	kfifo_dma_in_finish(&ctx->fifo, len);
+}
 
-	return 0;
+static void caam_rng_worker(struct work_struct *work)
+{
+	struct caam_rng_ctx *ctx = container_of(work, struct caam_rng_ctx,
+						worker);
+	caam_rng_fill_async(ctx);
 }
 
-static inline int rng_create_job_desc(struct caam_rng_ctx *ctx, int buf_id)
+static int caam_read(struct hwrng *rng, void *dst, size_t max, bool wait)
 {
-	struct device *jrdev = ctx->jrdev;
-	struct buf_data *bd = &ctx->bufs[buf_id];
-	u32 *desc = bd->hw_desc;
-	int sh_len = desc_len(ctx->sh_desc);
+	struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
+	int out;
 
-	init_job_desc_shared(desc, ctx->sh_desc_dma, sh_len, HDR_SHARE_DEFER |
-			     HDR_REVERSE);
+	if (wait) {
+		struct completion done;
 
-	bd->addr = dma_map_single(jrdev, bd->buf, RN_BUF_SIZE, DMA_FROM_DEVICE);
-	if (dma_mapping_error(jrdev, bd->addr)) {
-		dev_err(jrdev, "unable to map dst\n");
-		return -ENOMEM;
+		return caam_rng_read_one(ctx->jrdev, dst, max,
+					 ctx->desc[DESC_SYNC], &done);
 	}
 
-	append_seq_out_ptr_intlen(desc, bd->addr, RN_BUF_SIZE, 0);
-
-	print_hex_dump_debug("rng job desc@: ", DUMP_PREFIX_ADDRESS, 16, 4,
-			     desc, desc_bytes(desc), 1);
+	out = kfifo_out(&ctx->fifo, dst, max);
+	if (kfifo_len(&ctx->fifo) <= CAAM_RNG_FIFO_LEN / 2)
+		schedule_work(&ctx->worker);
 
-	return 0;
+	return out;
 }
 
 static void caam_cleanup(struct hwrng *rng)
 {
 	struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
-	int i;
-	struct buf_data *bd;
-
-	for (i = 0; i < 2; i++) {
-		bd = &ctx->bufs[i];
-		if (atomic_read(&bd->empty) == BUF_PENDING)
-			wait_for_completion(&bd->filled);
-	}
 
-	rng_unmap_ctx(ctx);
+	flush_work(&ctx->worker);
 	caam_jr_free(ctx->jrdev);
+	kfifo_free(&ctx->fifo);
 }
 
-static int caam_init_buf(struct caam_rng_ctx *ctx, int buf_id)
+static int caam_init(struct hwrng *rng)
 {
-	struct buf_data *bd = &ctx->bufs[buf_id];
+	struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
 	int err;
 
-	err = rng_create_job_desc(ctx, buf_id);
-	if (err)
-		return err;
-
-	atomic_set(&bd->empty, BUF_EMPTY);
-	submit_job(ctx, buf_id == ctx->current_buf);
-	wait_for_completion(&bd->filled);
+	ctx->desc = devm_kcalloc(ctx->ctrldev, DESC_NUM, sizeof(*ctx->desc),
+				 GFP_DMA | GFP_KERNEL);
+	if (!ctx->desc)
+		return -ENOMEM;
 
-	return 0;
-}
+	if (kfifo_alloc(&ctx->fifo, CAAM_RNG_FIFO_LEN, GFP_DMA | GFP_KERNEL))
+		return -ENOMEM;
 
-static int caam_init(struct hwrng *rng)
-{
-	struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
-	int err;
+	INIT_WORK(&ctx->worker, caam_rng_worker);
 
 	ctx->jrdev = caam_jr_alloc();
 	err = PTR_ERR_OR_ZERO(ctx->jrdev);
 	if (err) {
+		kfifo_free(&ctx->fifo);
 		pr_err("Job Ring Device allocation for transform failed\n");
 		return err;
 	}
 
-	err = rng_create_sh_desc(ctx);
-	if (err)
-		goto free_jrdev;
-
-	ctx->current_buf = 0;
-	ctx->cur_buf_idx = 0;
-
-	err = caam_init_buf(ctx, 0);
-	if (err)
-		goto free_jrdev;
-
-	err = caam_init_buf(ctx, 1);
-	if (err)
-		goto free_jrdev;
+	/*
+	 * Fill async buffer to have early randomness data for
+	 * hw_random
+	 */
+	caam_rng_fill_async(ctx);
 
 	return 0;
-
-free_jrdev:
-	caam_jr_free(ctx->jrdev);
-	return err;
 }
 
 int caam_rng_init(struct device *ctrldev);
@@ -335,10 +226,12 @@ int caam_rng_init(struct device *ctrldev)
 	if (!devres_open_group(ctrldev, caam_rng_init, GFP_KERNEL))
 		return -ENOMEM;
 
-	ctx = devm_kzalloc(ctrldev, sizeof(*ctx), GFP_DMA | GFP_KERNEL);
+	ctx = devm_kzalloc(ctrldev, sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
 
+	ctx->ctrldev = ctrldev;
+
 	ctx->rng.name    = "rng-caam";
 	ctx->rng.init    = caam_init;
 	ctx->rng.cleanup = caam_cleanup;
-- 
2.21.0


  parent reply	other threads:[~2020-01-27 16:57 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-01-27 16:56 [PATCH v7 0/9] enable CAAM's HWRNG as default Andrey Smirnov
2020-01-27 16:56 ` [PATCH v7 1/9] crypto: caam - allocate RNG instantiation descriptor with GFP_DMA Andrey Smirnov
2020-02-04 14:08   ` Horia Geanta
2020-02-24 16:40     ` Andrey Smirnov
2020-03-16  4:14     ` Andrey Smirnov
2020-03-17 15:20       ` Horia Geantă
2020-01-27 16:56 ` [PATCH v7 2/9] crypto: caam - use struct hwrng's .init for initialization Andrey Smirnov
2020-02-11 14:39   ` Horia Geanta
2020-01-27 16:56 ` [PATCH v7 3/9] crypto: caam - use devm_kzalloc to allocate JR data Andrey Smirnov
2020-02-11 18:23   ` Horia Geanta
2020-02-24 16:39     ` Andrey Smirnov
2020-01-27 16:56 ` [PATCH v7 4/9] crypto: caam - drop global context pointer and init_done Andrey Smirnov
2020-02-11 18:53   ` Horia Geanta
2020-02-11 20:57   ` Horia Geanta
2020-02-24 16:40     ` Andrey Smirnov
2020-01-27 16:56 ` Andrey Smirnov [this message]
2020-02-12 13:20   ` [PATCH v7 5/9] crypto: caam - simplify RNG implementation Horia Geanta
2020-02-24 17:16     ` Andrey Smirnov
2020-01-27 16:56 ` [PATCH v7 6/9] crypto: caam - check if RNG job failed Andrey Smirnov
2020-02-12 10:41   ` Horia Geanta
2020-02-24 16:37     ` Andrey Smirnov
2020-01-27 16:56 ` [PATCH v7 7/9] crypto: caam - invalidate entropy register during RNG initialization Andrey Smirnov
2020-02-25 20:26   ` Horia Geanta
2020-01-27 16:56 ` [PATCH v7 8/9] crypto: caam - enable prediction resistance in HRWNG Andrey Smirnov
2020-02-04 13:09   ` [EXT] " Andrei Botila (OSS)
2020-02-04 14:19     ` Horia Geanta
2020-01-27 16:56 ` [PATCH v7 9/9] crypto: caam - limit single JD RNG output to maximum of 16 bytes Andrey Smirnov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200127165646.19806-6-andrew.smirnov@gmail.com \
    --to=andrew.smirnov@gmail.com \
    --cc=cphealy@gmail.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=horia.geanta@nxp.com \
    --cc=iuliana.prodan@nxp.com \
    --cc=l.stach@pengutronix.de \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-imx@nxp.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.