* [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
@ 2021-05-02 19:29 Christophe JAILLET
  2021-05-04 16:57 ` Eric Biggers
  0 siblings, 1 reply; 5+ messages in thread
From: Christophe JAILLET @ 2021-05-02 19:29 UTC (permalink / raw)
  To: herbert, davem
  Cc: linux-crypto, linux-kernel, kernel-janitors, Christophe JAILLET

The S array does not need to be u32; u8 is enough.
On machines which have efficient unaligned access, use u8 to save some
memory.

So, provide a version optimized for memory usage in such a case.

Based on my testing, the size of arc4_ctx is:
	u8  version:  264
	u32 version: 1032

On my machine, the u8 version is about 5% faster.
It saves ~800 bytes of memory or stack, depending on how arc4_ctx is stored.
It is likely to be more cache friendly.

It has been tested on a Core i7-3770 with the following test program:

 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>

 #define u8      unsigned char
 #define u32     unsigned int
 #define true    1

struct arc4_ctx_8 {
	u8 S[256];
	u32 x, y;
};
struct arc4_ctx_32 {
	u32 S[256];
	u32 x, y;
};

 #define S_type	u8
int arc4_setkey_8(struct arc4_ctx_8 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_8(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

 #undef S_type
 #define S_type	u32
int arc4_setkey_32(struct arc4_ctx_32 *ctx, const u8 *in_key, unsigned int key_len)
{
	int i, j = 0, k = 0;

	ctx->x = 1;
	ctx->y = 0;

	for (i = 0; i < 256; i++)
		ctx->S[i] = i;

	for (i = 0; i < 256; i++) {
		S_type a = ctx->S[i];

		j = (j + in_key[k] + a) & 0xff;
		ctx->S[i] = ctx->S[j];
		ctx->S[j] = a;
		if (++k >= key_len)
			k = 0;
	}

	return 0;
}

void arc4_crypt_32(struct arc4_ctx_32 *ctx, u8 *out, const u8 *in, unsigned int len)
{
	S_type *const S = ctx->S;
	S_type a, b, ta, tb;
	u32 x, y, ty;

	if (len == 0)
		return;

	x = ctx->x;
	y = ctx->y;

	a = S[x];
	y = (y + a) & 0xff;
	b = S[y];

	do {
		S[y] = a;
		a = (a + b) & 0xff;
		S[x] = b;
		x = (x + 1) & 0xff;
		ta = S[x];
		ty = (y + ta) & 0xff;
		tb = S[ty];
		*out++ = *in++ ^ S[a];
		if (--len == 0)
			break;
		y = ty;
		a = ta;
		b = tb;
	} while (true);

	ctx->x = x;
	ctx->y = y;
}

 #define KEY     "AZERTY"
 #define in      "AZERTYUIOP_QSDFGHJKLM_WXCVBN"

int main() {
        long i;
        struct arc4_ctx_8 ctx_8;
        u8 out8[1024] = { };
        struct arc4_ctx_32 ctx_32;
        u8 out32[1024] = { };

        arc4_setkey_8(&ctx_8, (const u8 *)KEY, strlen(KEY));
        arc4_crypt_8(&ctx_8, out8, (const u8 *)in, strlen(in));

        arc4_setkey_32(&ctx_32, (const u8 *)KEY, strlen(KEY));
        arc4_crypt_32(&ctx_32, out32, (const u8 *)in, strlen(in));

        printf("%zu vs %zu\n", sizeof(ctx_8), sizeof(ctx_32));
        if (memcmp(out8, out32, 1024) == 0)
                printf("Ok\n");
        else
                printf("Broken\n");

        return 0;
}

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
The idea comes from code found in staging/rtl8712/.
See the top of:
   https://elixir.bootlin.com/linux/v5.12/source/drivers/staging/rtl8712/rtl871x_security.c

More precisely, while trying to clean up staging/rtl8712/, I triggered a
kernel test robot warning about increased stack usage:
   https://lore.kernel.org/kernel-janitors/YHQUH+Nqc%2FzS14Tb@kroah.com/T/#m832a01a9d1517e7efc4f671ed46deae9993d6ae9

The above patch works for me, but should be taken as an RFC.
---
 include/crypto/arc4.h | 8 +++++++-
 lib/crypto/arc4.c     | 8 ++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/crypto/arc4.h b/include/crypto/arc4.h
index f3c22fe01704..39545ed486e2 100644
--- a/include/crypto/arc4.h
+++ b/include/crypto/arc4.h
@@ -12,8 +12,14 @@
 #define ARC4_MAX_KEY_SIZE	256
 #define ARC4_BLOCK_SIZE		1
 
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define S_type	u8
+#else
+#define S_type	u32
+#endif
+
 struct arc4_ctx {
-	u32 S[256];
+	S_type S[256];
 	u32 x, y;
 };
 
diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..e0be0c2a08d9 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -21,7 +21,7 @@ int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
 		ctx->S[i] = i;
 
 	for (i = 0; i < 256; i++) {
-		u32 a = ctx->S[i];
+		S_type a = ctx->S[i];
 
 		j = (j + in_key[k] + a) & 0xff;
 		ctx->S[i] = ctx->S[j];
@@ -36,9 +36,9 @@ EXPORT_SYMBOL(arc4_setkey);
 
 void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
 {
-	u32 *const S = ctx->S;
-	u32 x, y, a, b;
-	u32 ty, ta, tb;
+	S_type *const S = ctx->S;
+	S_type a, b, ta, tb;
+	u32 x, y, ty;
 
 	if (len == 0)
 		return;
-- 
2.30.2



* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-02 19:29 [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage Christophe JAILLET
@ 2021-05-04 16:57 ` Eric Biggers
  2021-05-04 17:59   ` Christophe JAILLET
  0 siblings, 1 reply; 5+ messages in thread
From: Eric Biggers @ 2021-05-04 16:57 UTC (permalink / raw)
  To: Christophe JAILLET
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> +#define S_type	u8
> +#else
> +#define S_type	u32
> +#endif
> +
>  struct arc4_ctx {
> -	u32 S[256];
> +	S_type S[256];
>  	u32 x, y;
>  };

Is it actually useful to keep both versions?  It seems we could just use the u8
version everywhere.  Note that there aren't actually any unaligned memory
accesses, so choosing the version conditionally on
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
determine by checking that?

- Eric


* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 16:57 ` Eric Biggers
@ 2021-05-04 17:59   ` Christophe JAILLET
  2021-05-04 19:36     ` Eric Biggers
  2021-05-05 10:20     ` David Laight
  0 siblings, 2 replies; 5+ messages in thread
From: Christophe JAILLET @ 2021-05-04 17:59 UTC (permalink / raw)
  To: Eric Biggers; +Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On 04/05/2021 18:57, Eric Biggers wrote:
> On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
>> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
>> +#define S_type	u8
>> +#else
>> +#define S_type	u32
>> +#endif
>> +
>>   struct arc4_ctx {
>> -	u32 S[256];
>> +	S_type S[256];
>>   	u32 x, y;
>>   };
> 
> Is it actually useful to keep both versions?  It seems we could just use the u8
> version everywhere.  Note that there aren't actually any unaligned memory
> accesses, so choosing the version conditionally on
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> determine by checking that?

Hi, this was a misinterpretation on my part.

I thought that S[1] would likely use an odd address and would trigger an 
unaligned access. But as we would read only 1 byte, this is not the case.

Looking at [1], we have : "At this point, it should be clear that 
accessing a single byte (u8 or char) will never cause an unaligned 
access, because all memory addresses are evenly divisible by one."


I wanted to avoid the potential performance cost of using char (i.e.
u8) instead of int (i.e. u32).
On some architectures this could require extra shifts or masking to
"unpack" the values of S.


[1]: 
https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html

CJ

> 
> - Eric
> 



* Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 17:59   ` Christophe JAILLET
@ 2021-05-04 19:36     ` Eric Biggers
  2021-05-05 10:20     ` David Laight
  1 sibling, 0 replies; 5+ messages in thread
From: Eric Biggers @ 2021-05-04 19:36 UTC (permalink / raw)
  To: Christophe JAILLET
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

On Tue, May 04, 2021 at 07:59:38PM +0200, Christophe JAILLET wrote:
> On 04/05/2021 18:57, Eric Biggers wrote:
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> > > +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> > > +#define S_type	u8
> > > +#else
> > > +#define S_type	u32
> > > +#endif
> > > +
> > >   struct arc4_ctx {
> > > -	u32 S[256];
> > > +	S_type S[256];
> > >   	u32 x, y;
> > >   };
> > 
> > Is it actually useful to keep both versions?  It seems we could just use the u8
> > version everywhere.  Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> > determine by checking that?
> 
> Hi, this was a misinterpretation on my part.
> 
> I thought that S[1] would likely use an odd address and would trigger an
> unaligned access. But as we would read only 1 byte, this is not the case.
> 
> Looking at [1], we have : "At this point, it should be clear that accessing
> a single byte (u8 or char) will never cause an unaligned access, because all
> memory addresses are evenly divisible by one."
> 
> 
> I wanted to avoid the potential performance cost of using char (i.e. u8)
> instead of int (i.e. u32).
> On some architectures this could require extra shifts or masking to
> "unpack" the values of S.
> 
> 
> [1]:
> https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html
> 
> CJ
> 

arc4 is an insecure cipher which is only supported for use in legacy protocols.
So we don't really need to worry about optimizing performance on every
architecture.  If the byte-based version is *usually* faster and also uses
less memory, we probably should just use it everywhere.

- Eric


* RE: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage
  2021-05-04 17:59   ` Christophe JAILLET
  2021-05-04 19:36     ` Eric Biggers
@ 2021-05-05 10:20     ` David Laight
  1 sibling, 0 replies; 5+ messages in thread
From: David Laight @ 2021-05-05 10:20 UTC (permalink / raw)
  To: 'Christophe JAILLET', Eric Biggers
  Cc: herbert, davem, linux-crypto, linux-kernel, kernel-janitors

From: Christophe JAILLET
> Sent: 04 May 2021 19:00
> 
> On 04/05/2021 18:57, Eric Biggers wrote:
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> >> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> >> +#define S_type	u8
> >> +#else
> >> +#define S_type	u32
> >> +#endif
> >> +
> >>   struct arc4_ctx {
> >> -	u32 S[256];
> >> +	S_type S[256];
> >>   	u32 x, y;
> >>   };
> >
> > Is it actually useful to keep both versions?  It seems we could just use the u8
> > version everywhere.  Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd.  What are you trying to
> > determine by checking that?
> 
> Hi, this was a misinterpretation on my part.
...
> 
> I wanted to avoid the potential performance cost of using char (i.e.
> u8) instead of int (i.e. u32).
> On some architectures this could require extra shifts or masking to
> "unpack" the values of S.

The only architecture Linux ran on where the hardware did RMW accesses
for byte writes was a very old Alpha CPU.
Even more recent Alphas supported byte writes to memory.

On many architectures (not x86 or arm) indexing a byte array
is better because it saves the instruction to multiply the index by 4.
On x86-64 you want to be using 'unsigned int' for array indexes
so the compiler doesn't have to emit an instruction to sign-extend
a 32-bit int to 64 bits (sometimes it knows it isn't needed).
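
A small hypothetical example of that indexing point (these helpers are
not from the patch, just an illustration):

static unsigned char pick_int(const unsigned char *S, int i)
{
	/* the 32-bit index may need a sign-extension before the
	 * 64-bit address computation, unless the compiler can prove
	 * i is non-negative */
	return S[i];
}

static unsigned char pick_uint(const unsigned char *S, unsigned int i)
{
	/* zero-extension usually falls out of ordinary 32-bit
	 * operations, so no extra instruction is typically needed */
	return S[i];
}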

FWIW with a modern compiler all those temporaries are pointless.
The number of lines of code can be halved.
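
A rough, untested sketch of that shorter loop (written against the
arc4_ctx_8 layout from the test program earlier in the thread, not
against lib/crypto/arc4.c itself) could look like this:

void arc4_crypt_8_short(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in,
			unsigned int len)
{
	u8 *const S = ctx->S;
	u32 x = ctx->x, y = ctx->y;

	while (len--) {
		u8 a = S[x];

		y = (y + a) & 0xff;
		S[x] = S[y];		/* swap S[x] and S[y] */
		S[y] = a;
		*out++ = *in++ ^ S[(a + S[x]) & 0xff];
		x = (x + 1) & 0xff;
	}

	ctx->x = x;
	ctx->y = y;
}

Whether the compiler then schedules the loads as well as the
hand-written look-ahead version does would of course need measuring.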

	David


