* [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
@ 2021-05-02 19:29 Christophe JAILLET
  2021-05-04 16:57 ` Eric Biggers
  0 siblings, 1 reply; 5+ messages in thread
From: Christophe JAILLET @ 2021-05-02 19:29 UTC (permalink / raw)
  To: herbert, davem
  Cc: linux-crypto, linux-kernel, kernel-janitors, Christophe JAILLET

The S array does not need to be u32; u8 is enough.
On machines which have efficient unaligned access, use u8 to save some
memory.

So, provide a version optimized for memory usage in such a case.

Based on my testing, the size of arc4_ctx is:
	u8  version:  264
	u32 version: 1032

On my machine, the u8 version is about 5% faster.
It saves ~800 bytes of memory or stack, depending on how arc4_ctx is stored.
It is likely to be more cache friendly.

It has been tested on a Core i7-3770 with the following test program:

 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>

 #define u8      unsigned char
 #define u32     unsigned int
 #define true    1

struct arc4_ctx_8 {
	u8 S[256];
	u32 x, y;
};
struct arc4_ctx_32 {
	u32 S[256];
	u32 x, y;
};

 #define S_type	u8
int arc4_setkey_8(struct arc4_ctx_8 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_8(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

 #undef S_type
 #define S_type	u32
int arc4_setkey_32(struct arc4_ctx_32 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_32(struct arc4_ctx_32 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

 #define KEY     "AZERTY"
 #define in      "AZERTYUIOP_QSDFGHJKLM_WXCVBN"

int main() {
        long i;
        struct arc4_ctx_8 ctx_8;
        u8 out8[1024] = { };
        struct arc4_ctx_32 ctx_32;
        u8 out32[1024] = { };

        arc4_setkey_8(&ctx_8, (const u8 *)KEY, strlen(KEY));
        arc4_crypt_8(&ctx_8, out8, (const u8 *)in, strlen(in));

        arc4_setkey_32(&ctx_32, (const u8 *)KEY, strlen(KEY));
        arc4_crypt_32(&ctx_32, out32, (const u8 *)in, strlen(in));

        printf("%zu vs %zu\n", sizeof(ctx_8), sizeof(ctx_32));
        if (memcmp(out8, out32, 1024) == 0)
                printf("Ok\n");
        else
                printf("Broken\n");

        return 0;
}

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
The idea comes from code found in staging/rtl8712/.
See the top of:
   https://elixir.bootlin.com/linux/v5.12/source/drivers/staging/rtl8712/rtl871x_security.c

More precisely, while trying to clean up staging/rtl8712/, I triggered a
kernel test robot warning about increased stack usage:
   https://lore.kernel.org/kernel-janitors/YHQUH+Nqc%2FzS14Tb@kroah.com/T/#m832a01a9d1517e7efc4f671ed46deae9993d6ae9

The above patch works for me, but should be taken as an RFC.
---
 include/crypto/arc4.h | 8 +++++++-
 lib/crypto/arc4.c     | 8 ++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/crypto/arc4.h b/include/crypto/arc4.h
index f3c22fe01704..39545ed486e2 100644
--- a/include/crypto/arc4.h
+++ b/include/crypto/arc4.h
@@ -12,8 +12,14 @@
 #define ARC4_MAX_KEY_SIZE	256
 #define ARC4_BLOCK_SIZE		1
 
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define S_type	u8
+#else
+#define S_type	u32
+#endif
+
 struct arc4_ctx {
-	u32 S[256];
+	S_type S[256];
 	u32 x, y;
 };
 
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..e0be0c2a08d9 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -21,7 +21,7 @@ int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
 		ctx->S[i] = i;
 
 	for (i = 0; i < 256; i++) {
-		u32 a = ctx->S[i];
+		S_type a = ctx->S[i];
 
 		j = (j + in_key[k] + a) & 0xff;
 		ctx->S[i] = ctx->S[j];
@@ -36,9 +36,9 @@ EXPORT_SYMBOL(arc4_setkey);
 
 void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
 {
-	u32 *const S = ctx->S;
-	u32 x, y, a, b;
-	u32 ty, ta, tb;
+	S_type *const S = ctx->S;
+	S_type a, b, ta, tb;
+	u32 x, y, ty;
 
 	if (len == 0)
 		return;
-- 
2.30.2



* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-02 19:29 [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage Christophe JAILLET
@ 2021-05-04 16:57 ` Eric Biggers
  2021-05-04 17:59   ` Christophe JAILLET
  0 siblings, 1 reply; 5+ messages in thread
From: Eric Biggers @ 2021-05-04 16:57 UTC (permalink / raw)
  To: Christophe JAILLET
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> +#define S_type	u8
> +#else
> +#define S_type	u32
> +#endif
> +
>  struct arc4_ctx {
> -	u32 S[256];
> +	S_type S[256];
>  	u32 x, y;
>  };

Is it actually useful to keep both versions?  It seems we could just use the u8
version everywhere.  Note that there aren't actually any unaligned memory
accesses, so choosing the version conditionally on
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
determine by checking that?

- Eric


* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 16:57 ` Eric Biggers
@ 2021-05-04 17:59   ` Christophe JAILLET
  2021-05-04 19:36     ` Eric Biggers
  2021-05-05 10:20     ` David Laight
  0 siblings, 2 replies; 5+ messages in thread
From: Christophe JAILLET @ 2021-05-04 17:59 UTC (permalink / raw)
  To: Eric Biggers; +Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On 04/05/2021 18:57, Eric Biggers wrote:
> On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
>> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
>> +#define S_type	u8
>> +#else
>> +#define S_type	u32
>> +#endif
>> +
>>   struct arc4_ctx {
>> -	u32 S[256];
>> +	S_type S[256];
>>   	u32 x, y;
>>   };
> 
> Is it actually useful to keep both versions?  It seems we could just use the u8
> version everywhere.  Note that there aren't actually any unaligned memory
> accesses, so choosing the version conditionally on
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> determine by checking that?

Hi, this was a misinterpretation on my part.

I thought that S[1] would likely use an odd address and would trigger an 
unaligned access. But as we would read only 1 byte, this is not the case.

Looking at [1], we have : "At this point, it should be clear that 
accessing a single byte (u8 or char) will never cause an unaligned 
access, because all memory addresses are evenly divisible by one."


I wanted to avoid the potential performance cost of using char (i.e.
u8) instead of int (i.e. u32).
On some architectures this could require extra shifts or masking to
"unpack" the values of S.


[1]: 
https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html

CJ

> 
> - Eric
> 



* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 17:59   ` Christophe JAILLET
@ 2021-05-04 19:36     ` Eric Biggers
  2021-05-05 10:20     ` David Laight
  1 sibling, 0 replies; 5+ messages in thread
From: Eric Biggers @ 2021-05-04 19:36 UTC (permalink / raw)
  To: Christophe JAILLET
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On Tue, May 04, 2021 at 07:59:38PM +0200, Christophe JAILLET wrote:
> On 04/05/2021 18:57, Eric Biggers wrote:
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> > > +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> > > +#define S_type	u8
> > > +#else
> > > +#define S_type	u32
> > > +#endif
> > > +
> > >   struct arc4_ctx {
> > > -	u32 S[256];
> > > +	S_type S[256];
> > >   	u32 x, y;
> > >   };
> > 
> > Is it actually useful to keep both versions?  It seems we could just use the u8
> > version everywhere.  Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> > determine by checking that?
> 
> Hi, this was a misinterpretation on my part.
> 
> I thought that S[1] would likely use an odd address and would trigger an
> unaligned access. But as we would read only 1 byte, this is not the case.
> 
> Looking at [1], we have : "At this point, it should be clear that accessing
> a single byte (u8 or char) will never cause an unaligned access, because all
> memory addresses are evenly divisible by one."
> 
> 
> I wanted to avoid the potential performance cost of using char (i.e. u8)
> instead of int (i.e. u32).
> On some architectures this could require extra shifts or masking to
> "unpack" the values of S.
> 
> 
> [1]:
> https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html
> 
> CJ
> 

arc4 is an insecure cipher which is only supported for use in legacy protocols.
So we don't really need to worry about optimizing performance on every
architecture.  If the byte-based version is *usually* faster and also uses
less memory, we probably should just use it everywhere.

- Eric


* RE: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 17:59   ` Christophe JAILLET
  2021-05-04 19:36     ` Eric Biggers
@ 2021-05-05 10:20     ` David Laight
  1 sibling, 0 replies; 5+ messages in thread
From: David Laight @ 2021-05-05 10:20 UTC (permalink / raw)
  To: 'Christophe JAILLET', Eric Biggers
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

From: Christophe JAILLET
> Sent: 04 May 2021 19:00
> 
> On 04/05/2021 18:57, Eric Biggers wrote:
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> >> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> >> +#define S_type	u8
> >> +#else
> >> +#define S_type	u32
> >> +#endif
> >> +
> >>   struct arc4_ctx {
> >> -	u32 S[256];
> >> +	S_type S[256];
> >>   	u32 x, y;
> >>   };
> >
> > Is it actually useful to keep both versions?  It seems we could just use the u8
> > version everywhere.  Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> > determine by checking that?
> 
> Hi, this was a misinterpretation on my part.
...
> 
> I wanted to avoid the potential performance cost of using char (i.e.
> u8) instead of int (i.e. u32).
> On some architectures this could require extra shifts or masking to
> "unpack" the values of S.

The only architecture Linux ran on where the hardware did RMW accesses
for byte writes was a very old Alpha CPU.
Even more recent Alphas supported byte writes to memory.

On many architectures (not x86 or arm) indexing a byte array
is better because it saves the instruction to multiply the index by 4.
On x86-64 you want to be using 'unsigned int' for array indexes
so the compiler doesn't have to emit an instruction to sign-extend
a 32-bit int to 64 bits (sometimes it knows it isn't needed).
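
A small hypothetical example of that indexing point (these helpers are
not from the patch, just an illustration):

static unsigned char pick_int(const unsigned char *S, int i)
{
	/* the 32-bit index may need a sign-extension before the
	 * 64-bit address computation, unless the compiler can prove
	 * i is non-negative */
	return S[i];
}

static unsigned char pick_uint(const unsigned char *S, unsigned int i)
{
	/* zero-extension usually falls out of ordinary 32-bit
	 * operations, so no extra instruction is typically needed */
	return S[i];
}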

FWIW with a modern compiler all those temporaries are pointless.
The number of lines of code can be halved.
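
A rough, untested sketch of that shorter loop (written against the
arc4_ctx_8 layout from the test program earlier in the thread, not
against lib/crypto/arc4.c itself) could look like this:

void arc4_crypt_8_short(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in,
			unsigned int len)
{
	u8 *const S = ctx->S;
	u32 x = ctx->x, y = ctx->y;

	while (len--) {
		u8 a = S[x];

		y = (y + a) & 0xff;
		S[x] = S[y];		/* swap S[x] and S[y] */
		S[y] = a;
		*out++ = *in++ ^ S[(a + S[x]) & 0xff];
		x = (x + 1) & 0xff;
	}

	ctx->x = x;
	ctx->y = y;
}

Whether the compiler then schedules the loads as well as the
hand-written look-ahead version does would of course need measuring.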

	David


