* [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
@ 2023-05-30 13:52 Ard Biesheuvel
  2023-05-30 14:08 ` Philippe Mathieu-Daudé
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2023-05-30 13:52 UTC (permalink / raw)
  To: qemu-arm
  Cc: qemu-devel, Ard Biesheuvel, Peter Maydell, Alex Bennée,
	Richard Henderson, Philippe Mathieu-Daudé

ARM intrinsics for AES deviate from the x86 ones in the way they cover
the different stages of each round, and so mapping one to the other is
not entirely straight-forward. However, with a bit of care, we can still
use the x86 ones to emulate the ARM ones, which makes them constant time
(which is an important property in crypto) and substantially more
efficient.

Cc: Peter Maydell <peter.maydell@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
Suggestions welcome on how to make this more generic across targets and
compilers etc.

 target/arm/tcg/crypto_helper.c | 43 ++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c
index d28690321f..961112b6bd 100644
--- a/target/arm/tcg/crypto_helper.c
+++ b/target/arm/tcg/crypto_helper.c
@@ -18,10 +18,32 @@
 #include "crypto/sm4.h"
 #include "vec_internal.h"
 
+#ifdef __x86_64
+#pragma GCC target ("aes")
+#include <cpuid.h>
+#include <wmmintrin.h>
+
+static bool have_aes(void)
+{
+    static int cpuid_have_aes = -1;
+
+    if (cpuid_have_aes == -1) {
+        unsigned int eax, ebx, ecx, edx;
+        int ret = __get_cpuid(0x1, &eax, &ebx, &ecx, &edx);
+
+        cpuid_have_aes = ret && (ecx & bit_AES);
+    }
+    return cpuid_have_aes > 0;
+}
+#endif
+
 union CRYPTO_STATE {
     uint8_t    bytes[16];
     uint32_t   words[4];
     uint64_t   l[2];
+#ifdef __x86_64
+    __m128i    vec;
+#endif
 };
 
 #if HOST_BIG_ENDIAN
@@ -54,6 +76,16 @@ static void do_crypto_aese(uint64_t *rd, uint64_t *rn,
     union CRYPTO_STATE st = { .l = { rn[0], rn[1] } };
     int i;
 
+#ifdef __x86_64__
+    if (have_aes()) {
+        __m128i *d = (__m128i *)rd;
+
+        *d = decrypt ? _mm_aesdeclast_si128(rk.vec ^ st.vec, (__m128i){})
+                     : _mm_aesenclast_si128(rk.vec ^ st.vec, (__m128i){});
+        return;
+    }
+#endif
+
     /* xor state vector with round key */
     rk.l[0] ^= st.l[0];
     rk.l[1] ^= st.l[1];
@@ -217,6 +249,17 @@ static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt)
     union CRYPTO_STATE st = { .l = { rm[0], rm[1] } };
     int i;
 
+#ifdef __x86_64__
+    if (have_aes()) {
+        __m128i *d = (__m128i *)rd;
+
+        *d = decrypt ? _mm_aesdec_si128(_mm_aesenclast_si128(st.vec, (__m128i){}),
+                                        (__m128i){})
+                     : _mm_aesenc_si128(_mm_aesdeclast_si128(st.vec, (__m128i){}),
+                                        (__m128i){});
+        return;
+    }
+#endif
     for (i = 0; i < 16; i += 4) {
         CR_ST_WORD(st, i >> 2) =
             mc[decrypt][CR_ST_BYTE(st, i)] ^
-- 
2.39.2




* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 13:52 [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions Ard Biesheuvel
@ 2023-05-30 14:08 ` Philippe Mathieu-Daudé
  2023-05-30 14:48 ` Richard Henderson
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: Philippe Mathieu-Daudé @ 2023-05-30 14:08 UTC (permalink / raw)
  To: Ard Biesheuvel, qemu-arm
  Cc: qemu-devel, Peter Maydell, Alex Bennée, Richard Henderson,
	Philippe Mathieu-Daudé

Hi Ard,

On 30/5/23 15:52, Ard Biesheuvel wrote:
> ARM intrinsics for AES deviate from the x86 ones in the way they cover
> the different stages of each round, and so mapping one to the other is
> not entirely straight-forward. However, with a bit of care, we can still
> use the x86 ones to emulate the ARM ones, which makes them constant time
> (which is an important property in crypto) and substantially more
> efficient.
> 
> Cc: Peter Maydell <peter.maydell@linaro.org>
> Cc: Alex Bennée <alex.bennee@linaro.org>
> Cc: Richard Henderson <richard.henderson@linaro.org>
> Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> Suggestions welcome on how to make this more generic across targets and
> compilers etc.
> 
>   target/arm/tcg/crypto_helper.c | 43 ++++++++++++++++++++
>   1 file changed, 43 insertions(+)
> 
> diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c
> index d28690321f..961112b6bd 100644
> --- a/target/arm/tcg/crypto_helper.c
> +++ b/target/arm/tcg/crypto_helper.c
> @@ -18,10 +18,32 @@
>   #include "crypto/sm4.h"
>   #include "vec_internal.h"
>   
> +#ifdef __x86_64
> +#pragma GCC target ("aes")
> +#include <cpuid.h>
> +#include <wmmintrin.h>
> +
> +static bool have_aes(void)
> +{
> +    static int cpuid_have_aes = -1;
> +
> +    if (cpuid_have_aes == -1) {
> +        unsigned int eax, ebx, ecx, edx;
> +        int ret = __get_cpuid(0x1, &eax, &ebx, &ecx, &edx);
> +
> +        cpuid_have_aes = ret && (ecx & bit_AES);
> +    }
> +    return cpuid_have_aes > 0;
> +}
> +#endif

Per the recent cpuinfo API added in commit 6bc12fd042, I suppose
we should add CPUINFO_AES to host/include/i386/host/cpuinfo.h.
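
For illustration, a rough sketch of what that could look like -- the bit
value, the probe shape and the have_aes() wrapper below are assumptions
for the sake of the example, not the actual QEMU definitions:

#include <cpuid.h>
#include <stdbool.h>

/* host/include/i386/host/cpuinfo.h would grow something like: */
#define CPUINFO_ALWAYS  (1u << 0)   /* nonzero means "probed" */
#define CPUINFO_AES     (1u << 7)   /* bit position picked arbitrarily */

static unsigned cpuinfo;

/* util/cpuinfo-i386.c: probe CPUID once and cache the result */
static unsigned cpuinfo_init(void)
{
    unsigned a, b, c, d, info = CPUINFO_ALWAYS;

    if (__get_cpuid(1, &a, &b, &c, &d)) {
        info |= (c & bit_AES) ? CPUINFO_AES : 0;
    }
    cpuinfo = info;
    return info;
}

/* callers then test the cached flag instead of re-running CPUID */
static bool have_aes(void)
{
    unsigned info = cpuinfo ? cpuinfo : cpuinfo_init();
    return info & CPUINFO_AES;
}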



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 13:52 [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions Ard Biesheuvel
  2023-05-30 14:08 ` Philippe Mathieu-Daudé
@ 2023-05-30 14:48 ` Richard Henderson
  2023-05-30 16:43 ` Richard Henderson
  2023-05-30 16:45 ` Peter Maydell
  3 siblings, 0 replies; 8+ messages in thread
From: Richard Henderson @ 2023-05-30 14:48 UTC (permalink / raw)
  To: Ard Biesheuvel, qemu-arm
  Cc: qemu-devel, Peter Maydell, Alex Bennée, Philippe Mathieu-Daudé

On 5/30/23 06:52, Ard Biesheuvel wrote:
> ARM intrinsics for AES deviate from the x86 ones in the way they cover
> the different stages of each round, and so mapping one to the other is
> not entirely straight-forward. However, with a bit of care, we can still
> use the x86 ones to emulate the ARM ones, which makes them constant time
> (which is an important property in crypto) and substantially more
> efficient.
> 
> Cc: Peter Maydell <peter.maydell@linaro.org>
> Cc: Alex Bennée <alex.bennee@linaro.org>
> Cc: Richard Henderson <richard.henderson@linaro.org>
> Cc: Philippe Mathieu-Daudé <f4bug@amsat.org>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
> Suggestions welcome on how to make this more generic across targets and
> compilers etc.
> 
>   target/arm/tcg/crypto_helper.c | 43 ++++++++++++++++++++
>   1 file changed, 43 insertions(+)
> 
> diff --git a/target/arm/tcg/crypto_helper.c b/target/arm/tcg/crypto_helper.c
> index d28690321f..961112b6bd 100644
> --- a/target/arm/tcg/crypto_helper.c
> +++ b/target/arm/tcg/crypto_helper.c
> @@ -18,10 +18,32 @@
>   #include "crypto/sm4.h"
>   #include "vec_internal.h"
>   
> +#ifdef __x86_64
> +#pragma GCC target ("aes")

This doesn't work with clang.

What does work is __attribute__((__target__("aes"))), which requires that you pull those
little code blocks out into separate functions that carry the annotation.  I believe they'll
be inlined afterwards, but that's not really relevant to your improvement.
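
For illustration only, a minimal sketch of that shape -- the helper name
and signature are made up here, not taken from the patch:

#ifdef __x86_64__
#include <stdbool.h>
#include <wmmintrin.h>

/* annotated helper, so the AES intrinsics build without -maes on both
 * gcc and clang */
static void __attribute__((__target__("aes")))
aese_accel(__m128i *d, __m128i rk, __m128i st, bool decrypt)
{
    /* ARM AESE/AESD xor the round key up front (rk ^ st); the all-zero
     * second operand makes the x86 "last" instruction perform only the
     * (Inv)ShiftRows + (Inv)SubBytes part */
    *d = decrypt ? _mm_aesdeclast_si128(rk ^ st, (__m128i){})
                 : _mm_aesenclast_si128(rk ^ st, (__m128i){});
}
#endif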


> +#include <cpuid.h>
> +#include <wmmintrin.h>
> +
> +static bool have_aes(void)
> +{
> +    static int cpuid_have_aes = -1;
> +
> +    if (cpuid_have_aes == -1) {
> +        unsigned int eax, ebx, ecx, edx;
> +        int ret = __get_cpuid(0x1, &eax, &ebx, &ecx, &edx);
> +
> +        cpuid_have_aes = ret && (ecx & bit_AES);
> +    }
> +    return cpuid_have_aes > 0;
> +}
> +#endif

See "host/include/i386/host/cpuinfo.h" and "util/cpuinfo-i386.c".

I'll have a closer look at the other AES uses to see what might be sharable.


r~



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 13:52 [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions Ard Biesheuvel
  2023-05-30 14:08 ` Philippe Mathieu-Daudé
  2023-05-30 14:48 ` Richard Henderson
@ 2023-05-30 16:43 ` Richard Henderson
  2023-05-30 16:58   ` Ard Biesheuvel
  2023-05-30 16:45 ` Peter Maydell
  3 siblings, 1 reply; 8+ messages in thread
From: Richard Henderson @ 2023-05-30 16:43 UTC (permalink / raw)
  To: Ard Biesheuvel, qemu-arm
  Cc: qemu-devel, Peter Maydell, Alex Bennée, Philippe Mathieu-Daudé

On 5/30/23 06:52, Ard Biesheuvel wrote:
> +#ifdef __x86_64__
> +    if (have_aes()) {
> +        __m128i *d = (__m128i *)rd;
> +
> +        *d = decrypt ? _mm_aesdeclast_si128(rk.vec ^ st.vec, (__m128i){})
> +                     : _mm_aesenclast_si128(rk.vec ^ st.vec, (__m128i){});

Do I correctly understand that the ARM xor is pre-shift

> +        return;
> +    }
> +#endif
> +
>       /* xor state vector with round key */
>       rk.l[0] ^= st.l[0];
>       rk.l[1] ^= st.l[1];

(like so)

whereas the x86 xor is post-shift

> void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
> {
>     int i;
>     Reg st = *v;
>     Reg rk = *s;
> 
>     for (i = 0; i < 8 << SHIFT; i++) {
>         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
>     }

(like so, from target/i386/ops_sse.h)?

What might help: could we do the reverse -- emulate the x86 aesdeclast instruction with 
the aarch64 aesd instruction?


r~



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 13:52 [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions Ard Biesheuvel
                   ` (2 preceding siblings ...)
  2023-05-30 16:43 ` Richard Henderson
@ 2023-05-30 16:45 ` Peter Maydell
  2023-05-30 17:02   ` Ard Biesheuvel
  3 siblings, 1 reply; 8+ messages in thread
From: Peter Maydell @ 2023-05-30 16:45 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: qemu-arm, qemu-devel, Alex Bennée, Richard Henderson,
	Philippe Mathieu-Daudé

On Tue, 30 May 2023 at 14:52, Ard Biesheuvel <ardb@kernel.org> wrote:
>
> ARM intrinsics for AES deviate from the x86 ones in the way they cover
> the different stages of each round, and so mapping one to the other is
> not entirely straight-forward. However, with a bit of care, we can still
> use the x86 ones to emulate the ARM ones, which makes them constant time
> (which is an important property in crypto) and substantially more
> efficient.

Do you have examples of workloads and speedups obtained,
by the way?

thanks
-- PMM



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 16:43 ` Richard Henderson
@ 2023-05-30 16:58   ` Ard Biesheuvel
  2023-05-30 17:21     ` Richard Henderson
  0 siblings, 1 reply; 8+ messages in thread
From: Ard Biesheuvel @ 2023-05-30 16:58 UTC (permalink / raw)
  To: Richard Henderson
  Cc: qemu-arm, qemu-devel, Peter Maydell, Alex Bennée,
	Philippe Mathieu-Daudé

On Tue, 30 May 2023 at 18:43, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 5/30/23 06:52, Ard Biesheuvel wrote:
> > +#ifdef __x86_64__
> > +    if (have_aes()) {
> > +        __m128i *d = (__m128i *)rd;
> > +
> > +        *d = decrypt ? _mm_aesdeclast_si128(rk.vec ^ st.vec, (__m128i){})
> > +                     : _mm_aesenclast_si128(rk.vec ^ st.vec, (__m128i){});
>
> Do I correctly understand that the ARM xor is pre-shift
>
> > +        return;
> > +    }
> > +#endif
> > +
> >       /* xor state vector with round key */
> >       rk.l[0] ^= st.l[0];
> >       rk.l[1] ^= st.l[1];
>
> (like so)
>
> whereas the x86 xor is post-shift
>
> > void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
> > {
> >     int i;
> >     Reg st = *v;
> >     Reg rk = *s;
> >
> >     for (i = 0; i < 8 << SHIFT; i++) {
> >         d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
> >     }
>
> (like so, from target/i386/ops_sse.h)?
>

Indeed. Using the primitive operations defined in the AES paper, we
basically have the following for n rounds of AES (for n in {10, 12,
14})

for (n-1 rounds) {
  AddRoundKey
  ShiftRows
  SubBytes
  MixColumns
}
AddRoundKey
ShiftRows
SubBytes
AddRoundKey

AddRoundKey is just XOR, but it is incorporated into the instructions
that combine a couple of these steps.

So on x86, we have

aesenc:
  ShiftRows
  SubBytes
  MixColumns
  AddRoundKey

aesenclast:
  ShiftRows
  SubBytes
  AddRoundKey

and on ARM we have

aese:
  AddRoundKey
  ShiftRows
  SubBytes

aesmc:
  MixColumns


> What might help: could we do the reverse -- emulate the x86 aesdeclast instruction with
> the aarch64 aesd instruction?
>

Help in what sense? To emulate the x86 instructions on an ARM host?

But yes, aesenclast can be implemented using aese in a similar way,
i.e., by passing a {0} vector as the round key into the instruction,
and performing the XOR explicitly using the real round key afterwards.
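
Roughly, in intrinsics terms (an illustrative sketch only -- the helper
names are made up, and it needs the +crypto target feature):

#include <arm_neon.h>

/* x86 AESENCLAST is ShiftRows + SubBytes + AddRoundKey; AESE with an
 * all-zero key yields the first two steps, and the real round key is
 * xored in afterwards because the x86 AddRoundKey comes post-shift */
static uint8x16_t x86_aesenclast_on_arm(uint8x16_t st, uint8x16_t rk)
{
    return veorq_u8(vaeseq_u8(st, vdupq_n_u8(0)), rk);
}

/* likewise for AESDECLAST, using AESD */
static uint8x16_t x86_aesdeclast_on_arm(uint8x16_t st, uint8x16_t rk)
{
    return veorq_u8(vaesdq_u8(st, vdupq_n_u8(0)), rk);
}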



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 16:45 ` Peter Maydell
@ 2023-05-30 17:02   ` Ard Biesheuvel
  0 siblings, 0 replies; 8+ messages in thread
From: Ard Biesheuvel @ 2023-05-30 17:02 UTC (permalink / raw)
  To: Peter Maydell
  Cc: qemu-arm, qemu-devel, Alex Bennée, Richard Henderson,
	Philippe Mathieu-Daudé

On Tue, 30 May 2023 at 18:45, Peter Maydell <peter.maydell@linaro.org> wrote:
>
> On Tue, 30 May 2023 at 14:52, Ard Biesheuvel <ardb@kernel.org> wrote:
> >
> > ARM intrinsics for AES deviate from the x86 ones in the way they cover
> > the different stages of each round, and so mapping one to the other is
> > not entirely straight-forward. However, with a bit of care, we can still
> > use the x86 ones to emulate the ARM ones, which makes them constant time
> > (which is an important property in crypto) and substantially more
> > efficient.
>
> Do you have examples of workloads and speedups obtained,
> by the way?
>

I don't have any actual numbers to share, unfortunately.

I implemented this when I was experimenting with TPM-based measured
boot and disk encryption in the guest. I'd say that running an OS
under emulation that uses disk encryption would be the most relevant
use case here.

Accelerated AES is typically at least an order of magnitude faster
than a table-based C implementation, and does not stress the D-cache
as much (the tables involved are not tiny).



* Re: [RFC PATCH] target/arm: use x86 intrinsics to implement AES instructions
  2023-05-30 16:58   ` Ard Biesheuvel
@ 2023-05-30 17:21     ` Richard Henderson
  0 siblings, 0 replies; 8+ messages in thread
From: Richard Henderson @ 2023-05-30 17:21 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: qemu-arm, qemu-devel, Peter Maydell, Alex Bennée,
	Philippe Mathieu-Daudé

On 5/30/23 09:58, Ard Biesheuvel wrote:
> On Tue, 30 May 2023 at 18:43, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> On 5/30/23 06:52, Ard Biesheuvel wrote:
>>> +#ifdef __x86_64__
>>> +    if (have_aes()) {
>>> +        __m128i *d = (__m128i *)rd;
>>> +
>>> +        *d = decrypt ? _mm_aesdeclast_si128(rk.vec ^ st.vec, (__m128i){})
>>> +                     : _mm_aesenclast_si128(rk.vec ^ st.vec, (__m128i){});
>>
>> Do I correctly understand that the ARM xor is pre-shift
>>
>>> +        return;
>>> +    }
>>> +#endif
>>> +
>>>        /* xor state vector with round key */
>>>        rk.l[0] ^= st.l[0];
>>>        rk.l[1] ^= st.l[1];
>>
>> (like so)
>>
>> whereas the x86 xor is post-shift
>>
>>> void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
>>> {
>>>      int i;
>>>      Reg st = *v;
>>>      Reg rk = *s;
>>>
>>>      for (i = 0; i < 8 << SHIFT; i++) {
>>>          d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
>>>      }
>>
>> (like so, from target/i386/ops_sse.h)?
>>
> 
> Indeed. Using the primitive operations defined in the AES paper, we
> basically have the following for n rounds of AES (for n in {10, 12,
> 14})
> 
> for (n-1 rounds) {
>    AddRoundKey
>    ShiftRows
>    SubBytes
>    MixColumns
> }
> AddRoundKey
> ShiftRows
> SubBytes
> AddRoundKey
> 
> AddRoundKey is just XOR, but it is incorporated into the instructions
> that combine a couple of these steps.
> 
> So on x86, we have
> 
> aesenc:
>    ShiftRows
>    SubBytes
>    MixColumns
>    AddRoundKey
> 
> aesenclast:
>    ShiftRows
>    SubBytes
>    AddRoundKey
> 
> and on ARM we have
> 
> aese:
>    AddRoundKey
>    ShiftRows
>    SubBytes
> 
> aesmc:
>    MixColumns
> 
> 
>> What might help: could we do the reverse -- emulate the x86 aesdeclast instruction with
>> the aarch64 aesd instruction?
>>
> 
> Help in what sense? To emulate the x86 instructions on an ARM host?

Well that too.  I meant it would help me understand the two primitives.

> But yes, aesenclast can be implemented using aese in a similar way,
> i.e., by passing a {0} vector as the round key into the instruction,
> and performing the XOR explicitly using the real round key afterwards.

Excellent, thanks.


r~


