All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86, crypto: ported aes-ni implementation to x86
@ 2010-10-29 21:10 Mathias Krause
  2010-10-29 22:15 ` Herbert Xu
  0 siblings, 1 reply; 6+ messages in thread
From: Mathias Krause @ 2010-10-29 21:10 UTC (permalink / raw)
  To: linux-crypto; +Cc: Mathias Krause

The AES-NI instructions are also available in legacy mode so the x86
architecture may profit from those, too.

To illustrate the performance gain here's a short summary of the tcrypt
speed test on a Core i5 M 520 running at 2.40GHz comparing both
assembler implementations:

                             aes-i586   aes-ni-i586   delta
256 bit, 8kB blocks, ECB:  46.81 MB/s   164.46 MB/s   +251%
256 bit, 8kB blocks, CBC:  43.89 MB/s    62.18 MB/s    +41%
384 bit, 8kB blocks, LRW:  42.24 MB/s   142.90 MB/s   +238%
512 bit, 8kB blocks, XTS:  43.41 MB/s   148.67 MB/s   +242%

Signed-off-by: Mathias Krause <minipli@googlemail.com>
---
 arch/x86/crypto/Makefile                 |    7 +-
 arch/x86/crypto/aesni-intel_asm-i586.S   |  773 +++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_asm-x86_64.S |  841 ++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_asm.S        |  841 ------------------------------
 arch/x86/crypto/aesni-intel_glue.c       |   18 +
 crypto/Kconfig                           |   32 ++-
 6 files changed, 1667 insertions(+), 845 deletions(-)
 create mode 100644 arch/x86/crypto/aesni-intel_asm-i586.S
 create mode 100644 arch/x86/crypto/aesni-intel_asm-x86_64.S
 delete mode 100644 arch/x86/crypto/aesni-intel_asm.S

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad8..949e7e5 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,25 +5,26 @@
 obj-$(CONFIG_CRYPTO_FPU) += fpu.o
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL_586) += aesni-intel-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL_X86_64) += aesni-intel-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
+aesni-intel-i586-y := aesni-intel_asm-i586.o aesni-intel_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
 salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+aesni-intel-x86_64-y := aesni-intel_asm-x86_64.o aesni-intel_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm-i586.S b/arch/x86/crypto/aesni-intel_asm-i586.S
new file mode 100644
index 0000000..e2bdb5a
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm-i586.S
@@ -0,0 +1,773 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *            Vinodh Gopal <vinodh.gopal@intel.com>
+ *            Kahraman Akdemir
+ * Copyright (C) 2010 secunet Security Networks AG
+ *    Author: Mathias Krause <mathias.krause@secunet.com>
+ *            ported x86_64 version to x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.text
+
+#define STATE1	%xmm0
+#define STATE2	%xmm4
+#define STATE3	%xmm5
+#define STATE4	%xmm6
+#define STATE	STATE1
+#define IN1	%xmm1
+#define IN2	%xmm7
+#define IN	IN1
+#define KEY	%xmm2
+#define IV	%xmm3
+
+#define KEYP	%edi
+#define OUTP	%eax
+#define INP	%edx
+#define LEN	%esi
+#define IVP	%ebp
+#define EKLEN	480(KEYP)
+#define DKLEN	240(KEYP)
+#define T1	%ecx
+#define TKEYP	T1
+
+_key_expansion_128:
+_key_expansion_256a:
+	pshufd $0b11111111, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+	movaps %xmm0, (%ecx)
+	add $0x10, %ecx
+	ret
+
+_key_expansion_192a:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	movaps %xmm2, %xmm6
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, %xmm1
+	shufps $0b01000100, %xmm0, %xmm6
+	movaps %xmm6, (%ecx)
+	shufps $0b01001110, %xmm2, %xmm1
+	movaps %xmm1, 0x10(%ecx)
+	add $0x20, %ecx
+	ret
+
+_key_expansion_192b:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, (%ecx)
+	add $0x10, %ecx
+	ret
+
+_key_expansion_256b:
+	pshufd $0b10101010, %xmm1, %xmm1
+	shufps $0b00010000, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	shufps $0b10001100, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	pxor %xmm1, %xmm2
+	movaps %xmm2, (%ecx)
+	add $0x10, %ecx
+	ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ *                   unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+	pushl %edi
+	movl 8(%esp), %edi		# ctx
+	movl 12(%esp), %edx		# in_key
+	movl 16(%esp), %eax		# key_len
+
+	movups (%edx), %xmm0		# user key (first 16 bytes)
+	movaps %xmm0, (%edi)
+	lea 0x10(%edi), %ecx		# key addr
+	movl %eax, 480(%edi)
+	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
+	cmp $24, %al
+	jb .Lenc_key128
+	je .Lenc_key192
+	movups 0x10(%edx), %xmm2	# other user key
+	movaps %xmm2, (%ecx)
+	add $0x10, %ecx
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
+	call _key_expansion_256a
+	jmp .Ldec_key
+.Lenc_key192:
+	movq 0x10(%edx), %xmm2		# other user key
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
+	call _key_expansion_192b
+	jmp .Ldec_key
+.Lenc_key128:
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
+	call _key_expansion_128
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
+	call _key_expansion_128
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
+	call _key_expansion_128
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
+	call _key_expansion_128
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
+	call _key_expansion_128
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
+	call _key_expansion_128
+	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
+	call _key_expansion_128
+	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
+	call _key_expansion_128
+	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
+	call _key_expansion_128
+	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
+	call _key_expansion_128
+.Ldec_key:
+	sub $0x10, %ecx
+	movaps (%edi), %xmm0
+	movaps (%ecx), %xmm1
+	movaps %xmm0, 240(%ecx)
+	movaps %xmm1, 240(%edi)
+	add $0x10, %edi
+	lea 240-16(%ecx), %edx
+.align 4
+.Ldec_key_loop:
+	movaps (%edi), %xmm0
+	AESIMC %xmm0 %xmm1
+	movaps %xmm1, (%edx)
+	add $0x10, %edi
+	sub $0x10, %edx
+	cmp %ecx, %edi
+	jb .Ldec_key_loop
+	xor %eax, %eax
+	popl %edi
+	ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+	pushl KEYP
+	movl 8(%esp), KEYP
+	movl 12(%esp), OUTP
+	movl 16(%esp), INP
+	movups (INP), STATE		# input
+	call _aesni_enc1
+	movups STATE, (OUTP)		# output
+	popl KEYP
+	ret
+
+/*
+ * _aesni_enc1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	EKLEN:		round count
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, EKLEN
+	jb .Lenc128
+	lea 0x20(TKEYP), TKEYP
+	je .Lenc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x50(TKEYP), KEY
+	AESENC KEY STATE
+.align 4
+.Lenc192:
+	movaps -0x40(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x30(TKEYP), KEY
+	AESENC KEY STATE
+.align 4
+.Lenc128:
+	movaps -0x20(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x10(TKEYP), KEY
+	AESENC KEY STATE
+	movaps (TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x10(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x20(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x30(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x40(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x50(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x60(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x70(TKEYP), KEY
+	AESENCLAST KEY STATE
+	ret
+
+/*
+ * _aesni_enc4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	EKLEN:		round count
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, EKLEN
+	jb .L4enc128
+	lea 0x20(TKEYP), TKEYP
+	je .L4enc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x50(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+.align 4
+.L4enc192:
+	movaps -0x40(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x30(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+.align 4
+.L4enc128:
+	movaps -0x20(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x10(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps (TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x10(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x20(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x30(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x40(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x50(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x60(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x70(TKEYP), KEY
+	AESENCLAST KEY STATE1		# last round
+	AESENCLAST KEY STATE2
+	AESENCLAST KEY STATE3
+	AESENCLAST KEY STATE4
+	ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+	pushl KEYP
+	movl 8(%esp), KEYP
+	movl 12(%esp), OUTP
+	movl 16(%esp), INP
+	add $240, KEYP
+	movups (INP), STATE		# input
+	call _aesni_dec1
+	movups STATE, (OUTP)		# output
+	popl KEYP
+	ret
+
+/*
+ * _aesni_dec1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	DKLEN:		key length
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, DKLEN
+	jb .Ldec128
+	lea 0x20(TKEYP), TKEYP
+	je .Ldec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x50(TKEYP), KEY
+	AESDEC KEY STATE
+.align 4
+.Ldec192:
+	movaps -0x40(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x30(TKEYP), KEY
+	AESDEC KEY STATE
+.align 4
+.Ldec128:
+	movaps -0x20(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x10(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps (TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x10(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x20(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x30(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x40(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x50(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x60(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x70(TKEYP), KEY
+	AESDECLAST KEY STATE
+	ret
+
+/*
+ * _aesni_dec4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	DKLEN:		key length
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, DKLEN
+	jb .L4dec128
+	lea 0x20(TKEYP), TKEYP
+	je .L4dec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x50(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+.align 4
+.L4dec192:
+	movaps -0x40(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x30(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+.align 4
+.L4dec128:
+	movaps -0x20(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x10(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps (TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x10(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x20(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x30(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x40(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x50(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x60(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x70(TKEYP), KEY
+	AESDECLAST KEY STATE1		# last round
+	AESDECLAST KEY STATE2
+	AESDECLAST KEY STATE3
+	AESDECLAST KEY STATE4
+	ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+	pushl LEN
+	pushl KEYP
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+	movl 24(%esp), LEN
+	test LEN, LEN		# check length
+	jz .Lecb_enc_ret
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+	cmp $64, LEN
+	jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_enc4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_enc_loop4
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+	movups (INP), STATE1
+	call _aesni_enc1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+	popl KEYP
+	popl LEN
+	ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+	pushl LEN
+	pushl KEYP
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+	movl 24(%esp), LEN
+	test LEN, LEN
+	jz .Lecb_dec_ret
+	add $240, KEYP
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+	cmp $64, LEN
+	jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_dec4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_dec_loop4
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+	movups (INP), STATE1
+	call _aesni_dec1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+	popl KEYP
+	popl LEN
+	ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+	cmp $16, LEN
+	jb .Lcbc_enc_ret
+	movups (IVP), STATE	# load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+	movups (INP), IN	# load input
+	pxor IN, STATE
+	call _aesni_enc1
+	movups STATE, (OUTP)	# store output
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_enc_loop
+	movups STATE, (IVP)
+.Lcbc_enc_ret:
+	popl KEYP
+	popl LEN
+	popl IVP
+	ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+	cmp $16, LEN
+	jb .Lcbc_dec_just_ret
+	add $240, KEYP
+	movups (IVP), IV
+	cmp $64, LEN
+	jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+	movups (INP), IN1
+	movaps IN1, STATE1
+	movups 0x10(INP), IN2
+	movaps IN2, STATE2
+	movups 0x20(INP), IN1
+	movaps IN1, STATE3
+	movups 0x30(INP), IN2
+	movaps IN2, STATE4
+	call _aesni_dec4
+	pxor IV, STATE1
+	pxor (INP), STATE2
+	pxor 0x10(INP), STATE3
+	pxor IN1, STATE4
+	movaps IN2, IV
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lcbc_dec_loop4
+	cmp $16, LEN
+	jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+	movups (INP), IN
+	movaps IN, STATE
+	call _aesni_dec1
+	pxor IV, STATE
+	movups STATE, (OUTP)
+	movaps IN, IV
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+	movups IV, (IVP)
+.Lcbc_dec_just_ret:
+	popl KEYP
+	popl LEN
+	popl IVP
+	ret
diff --git a/arch/x86/crypto/aesni-intel_asm-x86_64.S b/arch/x86/crypto/aesni-intel_asm-x86_64.S
new file mode 100644
index 0000000..ff16756
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm-x86_64.S
@@ -0,0 +1,841 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *            Vinodh Gopal <vinodh.gopal@intel.com>
+ *            Kahraman Akdemir
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+.text
+
+#define STATE1	%xmm0
+#define STATE2	%xmm4
+#define STATE3	%xmm5
+#define STATE4	%xmm6
+#define STATE	STATE1
+#define IN1	%xmm1
+#define IN2	%xmm7
+#define IN3	%xmm8
+#define IN4	%xmm9
+#define IN	IN1
+#define KEY	%xmm2
+#define IV	%xmm3
+#define BSWAP_MASK %xmm10
+#define CTR	%xmm11
+#define INC	%xmm12
+
+#define KEYP	%rdi
+#define OUTP	%rsi
+#define INP	%rdx
+#define LEN	%rcx
+#define IVP	%r8
+#define KLEN	%r9d
+#define T1	%r10
+#define TKEYP	T1
+#define T2	%r11
+#define TCTR_LOW T2
+
+_key_expansion_128:
+_key_expansion_256a:
+	pshufd $0b11111111, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+	movaps %xmm0, (%rcx)
+	add $0x10, %rcx
+	ret
+
+_key_expansion_192a:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	movaps %xmm2, %xmm6
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, %xmm1
+	shufps $0b01000100, %xmm0, %xmm6
+	movaps %xmm6, (%rcx)
+	shufps $0b01001110, %xmm2, %xmm1
+	movaps %xmm1, 16(%rcx)
+	add $0x20, %rcx
+	ret
+
+_key_expansion_192b:
+	pshufd $0b01010101, %xmm1, %xmm1
+	shufps $0b00010000, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	shufps $0b10001100, %xmm0, %xmm4
+	pxor %xmm4, %xmm0
+	pxor %xmm1, %xmm0
+
+	movaps %xmm2, %xmm5
+	pslldq $4, %xmm5
+	pshufd $0b11111111, %xmm0, %xmm3
+	pxor %xmm3, %xmm2
+	pxor %xmm5, %xmm2
+
+	movaps %xmm0, (%rcx)
+	add $0x10, %rcx
+	ret
+
+_key_expansion_256b:
+	pshufd $0b10101010, %xmm1, %xmm1
+	shufps $0b00010000, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	shufps $0b10001100, %xmm2, %xmm4
+	pxor %xmm4, %xmm2
+	pxor %xmm1, %xmm2
+	movaps %xmm2, (%rcx)
+	add $0x10, %rcx
+	ret
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ *                   unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+	movups (%rsi), %xmm0		# user key (first 16 bytes)
+	movaps %xmm0, (%rdi)
+	lea 0x10(%rdi), %rcx		# key addr
+	movl %edx, 480(%rdi)
+	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
+	cmp $24, %dl
+	jb .Lenc_key128
+	je .Lenc_key192
+	movups 0x10(%rsi), %xmm2	# other user key
+	movaps %xmm2, (%rcx)
+	add $0x10, %rcx
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
+	call _key_expansion_256a
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1
+	call _key_expansion_256b
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
+	call _key_expansion_256a
+	jmp .Ldec_key
+.Lenc_key192:
+	movq 0x10(%rsi), %xmm2		# other user key
+	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
+	call _key_expansion_192b
+	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
+	call _key_expansion_192a
+	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
+	call _key_expansion_192b
+	jmp .Ldec_key
+.Lenc_key128:
+	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
+	call _key_expansion_128
+	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
+	call _key_expansion_128
+	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
+	call _key_expansion_128
+	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
+	call _key_expansion_128
+	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
+	call _key_expansion_128
+	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
+	call _key_expansion_128
+	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
+	call _key_expansion_128
+	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
+	call _key_expansion_128
+	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
+	call _key_expansion_128
+	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
+	call _key_expansion_128
+.Ldec_key:
+	sub $0x10, %rcx
+	movaps (%rdi), %xmm0
+	movaps (%rcx), %xmm1
+	movaps %xmm0, 240(%rcx)
+	movaps %xmm1, 240(%rdi)
+	add $0x10, %rdi
+	lea 240-16(%rcx), %rsi
+.align 4
+.Ldec_key_loop:
+	movaps (%rdi), %xmm0
+	AESIMC %xmm0 %xmm1
+	movaps %xmm1, (%rsi)
+	add $0x10, %rdi
+	sub $0x10, %rsi
+	cmp %rcx, %rdi
+	jb .Ldec_key_loop
+	xor %rax, %rax
+	ret
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+	movl 480(KEYP), KLEN		# key length
+	movups (INP), STATE		# input
+	call _aesni_enc1
+	movups STATE, (OUTP)		# output
+	ret
+
+/*
+ * _aesni_enc1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		round count
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .Lenc128
+	lea 0x20(TKEYP), TKEYP
+	je .Lenc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x50(TKEYP), KEY
+	AESENC KEY STATE
+.align 4
+.Lenc192:
+	movaps -0x40(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x30(TKEYP), KEY
+	AESENC KEY STATE
+.align 4
+.Lenc128:
+	movaps -0x20(TKEYP), KEY
+	AESENC KEY STATE
+	movaps -0x10(TKEYP), KEY
+	AESENC KEY STATE
+	movaps (TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x10(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x20(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x30(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x40(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x50(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x60(TKEYP), KEY
+	AESENC KEY STATE
+	movaps 0x70(TKEYP), KEY
+	AESENCLAST KEY STATE
+	ret
+
+/*
+ * _aesni_enc4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		round count
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_enc4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .L4enc128
+	lea 0x20(TKEYP), TKEYP
+	je .L4enc192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x50(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+.align 4
+.L4enc192:
+	movaps -0x40(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x30(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+.align 4
+.L4enc128:
+	movaps -0x20(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps -0x10(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps (TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x10(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x20(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x30(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x40(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x50(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x60(TKEYP), KEY
+	AESENC KEY STATE1
+	AESENC KEY STATE2
+	AESENC KEY STATE3
+	AESENC KEY STATE4
+	movaps 0x70(TKEYP), KEY
+	AESENCLAST KEY STATE1		# last round
+	AESENCLAST KEY STATE2
+	AESENCLAST KEY STATE3
+	AESENCLAST KEY STATE4
+	ret
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+	mov 480(KEYP), KLEN		# key length
+	add $240, KEYP
+	movups (INP), STATE		# input
+	call _aesni_dec1
+	movups STATE, (OUTP)		# output
+	ret
+
+/*
+ * _aesni_dec1:		internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE:		initial state (input)
+ * output:
+ *	STATE:		final state (output)
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec1:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE		# round 0
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .Ldec128
+	lea 0x20(TKEYP), TKEYP
+	je .Ldec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x50(TKEYP), KEY
+	AESDEC KEY STATE
+.align 4
+.Ldec192:
+	movaps -0x40(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x30(TKEYP), KEY
+	AESDEC KEY STATE
+.align 4
+.Ldec128:
+	movaps -0x20(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps -0x10(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps (TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x10(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x20(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x30(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x40(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x50(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x60(TKEYP), KEY
+	AESDEC KEY STATE
+	movaps 0x70(TKEYP), KEY
+	AESDECLAST KEY STATE
+	ret
+
+/*
+ * _aesni_dec4:	internal ABI
+ * input:
+ *	KEYP:		key struct pointer
+ *	KLEN:		key length
+ *	STATE1:		initial state (input)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * output:
+ *	STATE1:		final state (output)
+ *	STATE2
+ *	STATE3
+ *	STATE4
+ * changed:
+ *	KEY
+ *	TKEYP (T1)
+ */
+_aesni_dec4:
+	movaps (KEYP), KEY		# key
+	mov KEYP, TKEYP
+	pxor KEY, STATE1		# round 0
+	pxor KEY, STATE2
+	pxor KEY, STATE3
+	pxor KEY, STATE4
+	add $0x30, TKEYP
+	cmp $24, KLEN
+	jb .L4dec128
+	lea 0x20(TKEYP), TKEYP
+	je .L4dec192
+	add $0x20, TKEYP
+	movaps -0x60(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x50(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+.align 4
+.L4dec192:
+	movaps -0x40(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x30(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+.align 4
+.L4dec128:
+	movaps -0x20(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps -0x10(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps (TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x10(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x20(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x30(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x40(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x50(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x60(TKEYP), KEY
+	AESDEC KEY STATE1
+	AESDEC KEY STATE2
+	AESDEC KEY STATE3
+	AESDEC KEY STATE4
+	movaps 0x70(TKEYP), KEY
+	AESDECLAST KEY STATE1		# last round
+	AESDECLAST KEY STATE2
+	AESDECLAST KEY STATE3
+	AESDECLAST KEY STATE4
+	ret
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+	test LEN, LEN		# check length
+	jz .Lecb_enc_ret
+	mov 480(KEYP), KLEN
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+	cmp $64, LEN
+	jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_enc4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_enc_loop4
+	cmp $16, LEN
+	jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+	movups (INP), STATE1
+	call _aesni_enc1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+	ret
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+	test LEN, LEN
+	jz .Lecb_dec_ret
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+	cmp $64, LEN
+	jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+	movups (INP), STATE1
+	movups 0x10(INP), STATE2
+	movups 0x20(INP), STATE3
+	movups 0x30(INP), STATE4
+	call _aesni_dec4
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lecb_dec_loop4
+	cmp $16, LEN
+	jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+	movups (INP), STATE1
+	call _aesni_dec1
+	movups STATE1, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+	ret
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+	cmp $16, LEN
+	jb .Lcbc_enc_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), STATE	# load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+	movups (INP), IN	# load input
+	pxor IN, STATE
+	call _aesni_enc1
+	movups STATE, (OUTP)	# store output
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_enc_loop
+	movups STATE, (IVP)
+.Lcbc_enc_ret:
+	ret
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+	cmp $16, LEN
+	jb .Lcbc_dec_just_ret
+	mov 480(KEYP), KLEN
+	add $240, KEYP
+	movups (IVP), IV
+	cmp $64, LEN
+	jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+	movups (INP), IN1
+	movaps IN1, STATE1
+	movups 0x10(INP), IN2
+	movaps IN2, STATE2
+	movups 0x20(INP), IN3
+	movaps IN3, STATE3
+	movups 0x30(INP), IN4
+	movaps IN4, STATE4
+	call _aesni_dec4
+	pxor IV, STATE1
+	pxor IN1, STATE2
+	pxor IN2, STATE3
+	pxor IN3, STATE4
+	movaps IN4, IV
+	movups STATE1, (OUTP)
+	movups STATE2, 0x10(OUTP)
+	movups STATE3, 0x20(OUTP)
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lcbc_dec_loop4
+	cmp $16, LEN
+	jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+	movups (INP), IN
+	movaps IN, STATE
+	call _aesni_dec1
+	pxor IV, STATE
+	movups STATE, (OUTP)
+	movaps IN, IV
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+	movups IV, (IVP)
+.Lcbc_dec_just_ret:
+	ret
+
+.align 16
+.Lbswap_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * _aesni_inc_init:	internal ABI
+ *	setup registers used by _aesni_inc
+ * input:
+ *	IV
+ * output:
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ */
+_aesni_inc_init:
+	movaps .Lbswap_mask, BSWAP_MASK
+	movaps IV, CTR
+	PSHUFB_XMM BSWAP_MASK CTR
+	mov $1, TCTR_LOW
+	MOVQ_R64_XMM TCTR_LOW INC
+	MOVQ_R64_XMM CTR TCTR_LOW
+	ret
+
+/*
+ * _aesni_inc:		internal ABI
+ *	Increase IV by 1, IV is in big endian
+ * input:
+ *	IV
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ * output:
+ *	IV:	Increase by 1
+ * changed:
+ *	CTR:	== output IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ */
+_aesni_inc:
+	paddq INC, CTR
+	add $1, TCTR_LOW
+	jnc .Linc_low
+	pslldq $8, INC
+	paddq INC, CTR
+	psrldq $8, INC
+.Linc_low:
+	movaps CTR, IV
+	PSHUFB_XMM BSWAP_MASK IV
+	ret
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+	cmp $16, LEN
+	jb .Lctr_enc_just_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), IV
+	call _aesni_inc_init
+	cmp $64, LEN
+	jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+	movaps IV, STATE1
+	call _aesni_inc
+	movups (INP), IN1
+	movaps IV, STATE2
+	call _aesni_inc
+	movups 0x10(INP), IN2
+	movaps IV, STATE3
+	call _aesni_inc
+	movups 0x20(INP), IN3
+	movaps IV, STATE4
+	call _aesni_inc
+	movups 0x30(INP), IN4
+	call _aesni_enc4
+	pxor IN1, STATE1
+	movups STATE1, (OUTP)
+	pxor IN2, STATE2
+	movups STATE2, 0x10(OUTP)
+	pxor IN3, STATE3
+	movups STATE3, 0x20(OUTP)
+	pxor IN4, STATE4
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lctr_enc_loop4
+	cmp $16, LEN
+	jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+	movaps IV, STATE
+	call _aesni_inc
+	movups (INP), IN
+	call _aesni_enc1
+	pxor IN, STATE
+	movups STATE, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+	movups IV, (IVP)
+.Lctr_enc_just_ret:
+	ret
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
deleted file mode 100644
index ff16756..0000000
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Implement AES algorithm in Intel AES-NI instructions.
- *
- * The white paper of AES-NI instructions can be downloaded from:
- *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
- *
- * Copyright (C) 2008, Intel Corp.
- *    Author: Huang Ying <ying.huang@intel.com>
- *            Vinodh Gopal <vinodh.gopal@intel.com>
- *            Kahraman Akdemir
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/inst.h>
-
-.text
-
-#define STATE1	%xmm0
-#define STATE2	%xmm4
-#define STATE3	%xmm5
-#define STATE4	%xmm6
-#define STATE	STATE1
-#define IN1	%xmm1
-#define IN2	%xmm7
-#define IN3	%xmm8
-#define IN4	%xmm9
-#define IN	IN1
-#define KEY	%xmm2
-#define IV	%xmm3
-#define BSWAP_MASK %xmm10
-#define CTR	%xmm11
-#define INC	%xmm12
-
-#define KEYP	%rdi
-#define OUTP	%rsi
-#define INP	%rdx
-#define LEN	%rcx
-#define IVP	%r8
-#define KLEN	%r9d
-#define T1	%r10
-#define TKEYP	T1
-#define T2	%r11
-#define TCTR_LOW T2
-
-_key_expansion_128:
-_key_expansion_256a:
-	pshufd $0b11111111, %xmm1, %xmm1
-	shufps $0b00010000, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	shufps $0b10001100, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	pxor %xmm1, %xmm0
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
-	ret
-
-_key_expansion_192a:
-	pshufd $0b01010101, %xmm1, %xmm1
-	shufps $0b00010000, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	shufps $0b10001100, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	pxor %xmm1, %xmm0
-
-	movaps %xmm2, %xmm5
-	movaps %xmm2, %xmm6
-	pslldq $4, %xmm5
-	pshufd $0b11111111, %xmm0, %xmm3
-	pxor %xmm3, %xmm2
-	pxor %xmm5, %xmm2
-
-	movaps %xmm0, %xmm1
-	shufps $0b01000100, %xmm0, %xmm6
-	movaps %xmm6, (%rcx)
-	shufps $0b01001110, %xmm2, %xmm1
-	movaps %xmm1, 16(%rcx)
-	add $0x20, %rcx
-	ret
-
-_key_expansion_192b:
-	pshufd $0b01010101, %xmm1, %xmm1
-	shufps $0b00010000, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	shufps $0b10001100, %xmm0, %xmm4
-	pxor %xmm4, %xmm0
-	pxor %xmm1, %xmm0
-
-	movaps %xmm2, %xmm5
-	pslldq $4, %xmm5
-	pshufd $0b11111111, %xmm0, %xmm3
-	pxor %xmm3, %xmm2
-	pxor %xmm5, %xmm2
-
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
-	ret
-
-_key_expansion_256b:
-	pshufd $0b10101010, %xmm1, %xmm1
-	shufps $0b00010000, %xmm2, %xmm4
-	pxor %xmm4, %xmm2
-	shufps $0b10001100, %xmm2, %xmm4
-	pxor %xmm4, %xmm2
-	pxor %xmm1, %xmm2
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
-	ret
-
-/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- *                   unsigned int key_len)
- */
-ENTRY(aesni_set_key)
-	movups (%rsi), %xmm0		# user key (first 16 bytes)
-	movaps %xmm0, (%rdi)
-	lea 0x10(%rdi), %rcx		# key addr
-	movl %edx, 480(%rdi)
-	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
-	cmp $24, %dl
-	jb .Lenc_key128
-	je .Lenc_key192
-	movups 0x10(%rsi), %xmm2	# other user key
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
-	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x1 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x2 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x4 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x8 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x10 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
-	call _key_expansion_256a
-	AESKEYGENASSIST 0x20 %xmm0 %xmm1
-	call _key_expansion_256b
-	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
-	call _key_expansion_256a
-	jmp .Ldec_key
-.Lenc_key192:
-	movq 0x10(%rsi), %xmm2		# other user key
-	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
-	call _key_expansion_192a
-	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
-	call _key_expansion_192b
-	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
-	call _key_expansion_192a
-	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
-	call _key_expansion_192b
-	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
-	call _key_expansion_192a
-	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
-	call _key_expansion_192b
-	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
-	call _key_expansion_192a
-	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
-	call _key_expansion_192b
-	jmp .Ldec_key
-.Lenc_key128:
-	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
-	call _key_expansion_128
-	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
-	call _key_expansion_128
-	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
-	call _key_expansion_128
-	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
-	call _key_expansion_128
-	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
-	call _key_expansion_128
-	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
-	call _key_expansion_128
-	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
-	call _key_expansion_128
-	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
-	call _key_expansion_128
-	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
-	call _key_expansion_128
-	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
-	call _key_expansion_128
-.Ldec_key:
-	sub $0x10, %rcx
-	movaps (%rdi), %xmm0
-	movaps (%rcx), %xmm1
-	movaps %xmm0, 240(%rcx)
-	movaps %xmm1, 240(%rdi)
-	add $0x10, %rdi
-	lea 240-16(%rcx), %rsi
-.align 4
-.Ldec_key_loop:
-	movaps (%rdi), %xmm0
-	AESIMC %xmm0 %xmm1
-	movaps %xmm1, (%rsi)
-	add $0x10, %rdi
-	sub $0x10, %rsi
-	cmp %rcx, %rdi
-	jb .Ldec_key_loop
-	xor %rax, %rax
-	ret
-
-/*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
-ENTRY(aesni_enc)
-	movl 480(KEYP), KLEN		# key length
-	movups (INP), STATE		# input
-	call _aesni_enc1
-	movups STATE, (OUTP)		# output
-	ret
-
-/*
- * _aesni_enc1:		internal ABI
- * input:
- *	KEYP:		key struct pointer
- *	KLEN:		round count
- *	STATE:		initial state (input)
- * output:
- *	STATE:		finial state (output)
- * changed:
- *	KEY
- *	TKEYP (T1)
- */
-_aesni_enc1:
-	movaps (KEYP), KEY		# key
-	mov KEYP, TKEYP
-	pxor KEY, STATE		# round 0
-	add $0x30, TKEYP
-	cmp $24, KLEN
-	jb .Lenc128
-	lea 0x20(TKEYP), TKEYP
-	je .Lenc192
-	add $0x20, TKEYP
-	movaps -0x60(TKEYP), KEY
-	AESENC KEY STATE
-	movaps -0x50(TKEYP), KEY
-	AESENC KEY STATE
-.align 4
-.Lenc192:
-	movaps -0x40(TKEYP), KEY
-	AESENC KEY STATE
-	movaps -0x30(TKEYP), KEY
-	AESENC KEY STATE
-.align 4
-.Lenc128:
-	movaps -0x20(TKEYP), KEY
-	AESENC KEY STATE
-	movaps -0x10(TKEYP), KEY
-	AESENC KEY STATE
-	movaps (TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x10(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x20(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x30(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x40(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x50(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x60(TKEYP), KEY
-	AESENC KEY STATE
-	movaps 0x70(TKEYP), KEY
-	AESENCLAST KEY STATE
-	ret
-
-/*
- * _aesni_enc4:	internal ABI
- * input:
- *	KEYP:		key struct pointer
- *	KLEN:		round count
- *	STATE1:		initial state (input)
- *	STATE2
- *	STATE3
- *	STATE4
- * output:
- *	STATE1:		finial state (output)
- *	STATE2
- *	STATE3
- *	STATE4
- * changed:
- *	KEY
- *	TKEYP (T1)
- */
-_aesni_enc4:
-	movaps (KEYP), KEY		# key
-	mov KEYP, TKEYP
-	pxor KEY, STATE1		# round 0
-	pxor KEY, STATE2
-	pxor KEY, STATE3
-	pxor KEY, STATE4
-	add $0x30, TKEYP
-	cmp $24, KLEN
-	jb .L4enc128
-	lea 0x20(TKEYP), TKEYP
-	je .L4enc192
-	add $0x20, TKEYP
-	movaps -0x60(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps -0x50(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-#.align 4
-.L4enc192:
-	movaps -0x40(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps -0x30(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-#.align 4
-.L4enc128:
-	movaps -0x20(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps -0x10(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps (TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x10(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x20(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x30(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x40(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x50(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x60(TKEYP), KEY
-	AESENC KEY STATE1
-	AESENC KEY STATE2
-	AESENC KEY STATE3
-	AESENC KEY STATE4
-	movaps 0x70(TKEYP), KEY
-	AESENCLAST KEY STATE1		# last round
-	AESENCLAST KEY STATE2
-	AESENCLAST KEY STATE3
-	AESENCLAST KEY STATE4
-	ret
-
-/*
- * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
- */
-ENTRY(aesni_dec)
-	mov 480(KEYP), KLEN		# key length
-	add $240, KEYP
-	movups (INP), STATE		# input
-	call _aesni_dec1
-	movups STATE, (OUTP)		#output
-	ret
-
-/*
- * _aesni_dec1:		internal ABI
- * input:
- *	KEYP:		key struct pointer
- *	KLEN:		key length
- *	STATE:		initial state (input)
- * output:
- *	STATE:		finial state (output)
- * changed:
- *	KEY
- *	TKEYP (T1)
- */
-_aesni_dec1:
-	movaps (KEYP), KEY		# key
-	mov KEYP, TKEYP
-	pxor KEY, STATE		# round 0
-	add $0x30, TKEYP
-	cmp $24, KLEN
-	jb .Ldec128
-	lea 0x20(TKEYP), TKEYP
-	je .Ldec192
-	add $0x20, TKEYP
-	movaps -0x60(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps -0x50(TKEYP), KEY
-	AESDEC KEY STATE
-.align 4
-.Ldec192:
-	movaps -0x40(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps -0x30(TKEYP), KEY
-	AESDEC KEY STATE
-.align 4
-.Ldec128:
-	movaps -0x20(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps -0x10(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps (TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x10(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x20(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x30(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x40(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x50(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x60(TKEYP), KEY
-	AESDEC KEY STATE
-	movaps 0x70(TKEYP), KEY
-	AESDECLAST KEY STATE
-	ret
-
-/*
- * _aesni_dec4:	internal ABI
- * input:
- *	KEYP:		key struct pointer
- *	KLEN:		key length
- *	STATE1:		initial state (input)
- *	STATE2
- *	STATE3
- *	STATE4
- * output:
- *	STATE1:		finial state (output)
- *	STATE2
- *	STATE3
- *	STATE4
- * changed:
- *	KEY
- *	TKEYP (T1)
- */
-_aesni_dec4:
-	movaps (KEYP), KEY		# key
-	mov KEYP, TKEYP
-	pxor KEY, STATE1		# round 0
-	pxor KEY, STATE2
-	pxor KEY, STATE3
-	pxor KEY, STATE4
-	add $0x30, TKEYP
-	cmp $24, KLEN
-	jb .L4dec128
-	lea 0x20(TKEYP), TKEYP
-	je .L4dec192
-	add $0x20, TKEYP
-	movaps -0x60(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps -0x50(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-.align 4
-.L4dec192:
-	movaps -0x40(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps -0x30(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-.align 4
-.L4dec128:
-	movaps -0x20(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps -0x10(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps (TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x10(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x20(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x30(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x40(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x50(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x60(TKEYP), KEY
-	AESDEC KEY STATE1
-	AESDEC KEY STATE2
-	AESDEC KEY STATE3
-	AESDEC KEY STATE4
-	movaps 0x70(TKEYP), KEY
-	AESDECLAST KEY STATE1		# last round
-	AESDECLAST KEY STATE2
-	AESDECLAST KEY STATE3
-	AESDECLAST KEY STATE4
-	ret
-
-/*
- * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *		      size_t len)
- */
-ENTRY(aesni_ecb_enc)
-	test LEN, LEN		# check length
-	jz .Lecb_enc_ret
-	mov 480(KEYP), KLEN
-	cmp $16, LEN
-	jb .Lecb_enc_ret
-	cmp $64, LEN
-	jb .Lecb_enc_loop1
-.align 4
-.Lecb_enc_loop4:
-	movups (INP), STATE1
-	movups 0x10(INP), STATE2
-	movups 0x20(INP), STATE3
-	movups 0x30(INP), STATE4
-	call _aesni_enc4
-	movups STATE1, (OUTP)
-	movups STATE2, 0x10(OUTP)
-	movups STATE3, 0x20(OUTP)
-	movups STATE4, 0x30(OUTP)
-	sub $64, LEN
-	add $64, INP
-	add $64, OUTP
-	cmp $64, LEN
-	jge .Lecb_enc_loop4
-	cmp $16, LEN
-	jb .Lecb_enc_ret
-.align 4
-.Lecb_enc_loop1:
-	movups (INP), STATE1
-	call _aesni_enc1
-	movups STATE1, (OUTP)
-	sub $16, LEN
-	add $16, INP
-	add $16, OUTP
-	cmp $16, LEN
-	jge .Lecb_enc_loop1
-.Lecb_enc_ret:
-	ret
-
-/*
- * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *		      size_t len);
- */
-ENTRY(aesni_ecb_dec)
-	test LEN, LEN
-	jz .Lecb_dec_ret
-	mov 480(KEYP), KLEN
-	add $240, KEYP
-	cmp $16, LEN
-	jb .Lecb_dec_ret
-	cmp $64, LEN
-	jb .Lecb_dec_loop1
-.align 4
-.Lecb_dec_loop4:
-	movups (INP), STATE1
-	movups 0x10(INP), STATE2
-	movups 0x20(INP), STATE3
-	movups 0x30(INP), STATE4
-	call _aesni_dec4
-	movups STATE1, (OUTP)
-	movups STATE2, 0x10(OUTP)
-	movups STATE3, 0x20(OUTP)
-	movups STATE4, 0x30(OUTP)
-	sub $64, LEN
-	add $64, INP
-	add $64, OUTP
-	cmp $64, LEN
-	jge .Lecb_dec_loop4
-	cmp $16, LEN
-	jb .Lecb_dec_ret
-.align 4
-.Lecb_dec_loop1:
-	movups (INP), STATE1
-	call _aesni_dec1
-	movups STATE1, (OUTP)
-	sub $16, LEN
-	add $16, INP
-	add $16, OUTP
-	cmp $16, LEN
-	jge .Lecb_dec_loop1
-.Lecb_dec_ret:
-	ret
-
-/*
- * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *		      size_t len, u8 *iv)
- */
-ENTRY(aesni_cbc_enc)
-	cmp $16, LEN
-	jb .Lcbc_enc_ret
-	mov 480(KEYP), KLEN
-	movups (IVP), STATE	# load iv as initial state
-.align 4
-.Lcbc_enc_loop:
-	movups (INP), IN	# load input
-	pxor IN, STATE
-	call _aesni_enc1
-	movups STATE, (OUTP)	# store output
-	sub $16, LEN
-	add $16, INP
-	add $16, OUTP
-	cmp $16, LEN
-	jge .Lcbc_enc_loop
-	movups STATE, (IVP)
-.Lcbc_enc_ret:
-	ret
-
-/*
- * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *		      size_t len, u8 *iv)
- */
-ENTRY(aesni_cbc_dec)
-	cmp $16, LEN
-	jb .Lcbc_dec_just_ret
-	mov 480(KEYP), KLEN
-	add $240, KEYP
-	movups (IVP), IV
-	cmp $64, LEN
-	jb .Lcbc_dec_loop1
-.align 4
-.Lcbc_dec_loop4:
-	movups (INP), IN1
-	movaps IN1, STATE1
-	movups 0x10(INP), IN2
-	movaps IN2, STATE2
-	movups 0x20(INP), IN3
-	movaps IN3, STATE3
-	movups 0x30(INP), IN4
-	movaps IN4, STATE4
-	call _aesni_dec4
-	pxor IV, STATE1
-	pxor IN1, STATE2
-	pxor IN2, STATE3
-	pxor IN3, STATE4
-	movaps IN4, IV
-	movups STATE1, (OUTP)
-	movups STATE2, 0x10(OUTP)
-	movups STATE3, 0x20(OUTP)
-	movups STATE4, 0x30(OUTP)
-	sub $64, LEN
-	add $64, INP
-	add $64, OUTP
-	cmp $64, LEN
-	jge .Lcbc_dec_loop4
-	cmp $16, LEN
-	jb .Lcbc_dec_ret
-.align 4
-.Lcbc_dec_loop1:
-	movups (INP), IN
-	movaps IN, STATE
-	call _aesni_dec1
-	pxor IV, STATE
-	movups STATE, (OUTP)
-	movaps IN, IV
-	sub $16, LEN
-	add $16, INP
-	add $16, OUTP
-	cmp $16, LEN
-	jge .Lcbc_dec_loop1
-.Lcbc_dec_ret:
-	movups IV, (IVP)
-.Lcbc_dec_just_ret:
-	ret
-
-.align 16
-.Lbswap_mask:
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/*
- * _aesni_inc_init:	internal ABI
- *	setup registers used by _aesni_inc
- * input:
- *	IV
- * output:
- *	CTR:	== IV, in little endian
- *	TCTR_LOW: == lower qword of CTR
- *	INC:	== 1, in little endian
- *	BSWAP_MASK == endian swapping mask
- */
-_aesni_inc_init:
-	movaps .Lbswap_mask, BSWAP_MASK
-	movaps IV, CTR
-	PSHUFB_XMM BSWAP_MASK CTR
-	mov $1, TCTR_LOW
-	MOVQ_R64_XMM TCTR_LOW INC
-	MOVQ_R64_XMM CTR TCTR_LOW
-	ret
-
-/*
- * _aesni_inc:		internal ABI
- *	Increase IV by 1, IV is in big endian
- * input:
- *	IV
- *	CTR:	== IV, in little endian
- *	TCTR_LOW: == lower qword of CTR
- *	INC:	== 1, in little endian
- *	BSWAP_MASK == endian swapping mask
- * output:
- *	IV:	Increase by 1
- * changed:
- *	CTR:	== output IV, in little endian
- *	TCTR_LOW: == lower qword of CTR
- */
-_aesni_inc:
-	paddq INC, CTR
-	add $1, TCTR_LOW
-	jnc .Linc_low
-	pslldq $8, INC
-	paddq INC, CTR
-	psrldq $8, INC
-.Linc_low:
-	movaps CTR, IV
-	PSHUFB_XMM BSWAP_MASK IV
-	ret
-
-/*
- * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- *		      size_t len, u8 *iv)
- */
-ENTRY(aesni_ctr_enc)
-	cmp $16, LEN
-	jb .Lctr_enc_just_ret
-	mov 480(KEYP), KLEN
-	movups (IVP), IV
-	call _aesni_inc_init
-	cmp $64, LEN
-	jb .Lctr_enc_loop1
-.align 4
-.Lctr_enc_loop4:
-	movaps IV, STATE1
-	call _aesni_inc
-	movups (INP), IN1
-	movaps IV, STATE2
-	call _aesni_inc
-	movups 0x10(INP), IN2
-	movaps IV, STATE3
-	call _aesni_inc
-	movups 0x20(INP), IN3
-	movaps IV, STATE4
-	call _aesni_inc
-	movups 0x30(INP), IN4
-	call _aesni_enc4
-	pxor IN1, STATE1
-	movups STATE1, (OUTP)
-	pxor IN2, STATE2
-	movups STATE2, 0x10(OUTP)
-	pxor IN3, STATE3
-	movups STATE3, 0x20(OUTP)
-	pxor IN4, STATE4
-	movups STATE4, 0x30(OUTP)
-	sub $64, LEN
-	add $64, INP
-	add $64, OUTP
-	cmp $64, LEN
-	jge .Lctr_enc_loop4
-	cmp $16, LEN
-	jb .Lctr_enc_ret
-.align 4
-.Lctr_enc_loop1:
-	movaps IV, STATE
-	call _aesni_inc
-	movups (INP), IN
-	call _aesni_enc1
-	pxor IN, STATE
-	movups STATE, (OUTP)
-	sub $16, LEN
-	add $16, INP
-	add $16, OUTP
-	cmp $16, LEN
-	jge .Lctr_enc_loop1
-.Lctr_enc_ret:
-	movups IV, (IVP)
-.Lctr_enc_just_ret:
-	ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..39f6238 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -59,8 +59,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+#endif
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -324,6 +326,7 @@ static struct crypto_alg blk_cbc_alg = {
 	},
 };
 
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
 			    struct blkcipher_walk *walk)
 {
@@ -389,6 +392,7 @@ static struct crypto_alg blk_ctr_alg = {
 		},
 	},
 };
+#endif
 
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
@@ -536,6 +540,7 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +617,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
 	},
 };
 #endif
+#endif
 
 #ifdef HAS_LRW
 static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -746,18 +752,22 @@ static int __init aesni_init(void)
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 	if ((err = crypto_register_alg(&blk_ctr_alg)))
 		goto blk_ctr_err;
+#endif
 	if ((err = crypto_register_alg(&ablk_ecb_alg)))
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
 		goto ablk_ctr_err;
 #ifdef HAS_CTR
 	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
 		goto ablk_rfc3686_ctr_err;
 #endif
+#endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
 		goto ablk_lrw_err;
@@ -784,18 +794,22 @@ ablk_pcbc_err:
 	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 #endif
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 ablk_rfc3686_ctr_err:
 #endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 	crypto_unregister_alg(&blk_ctr_alg);
 blk_ctr_err:
+#endif
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +832,17 @@ static void __exit aesni_exit(void)
 #ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
 	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
+#ifdef CONFIG_CRYPTO_AES_NI_INTEL_X86_64
 	crypto_unregister_alg(&blk_ctr_alg);
+#endif
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..7f917c6 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -537,7 +537,37 @@ config CRYPTO_AES_X86_64
 
 	  See <http://csrc.nist.gov/encryption/aes/> for more information.
 
-config CRYPTO_AES_NI_INTEL
+config CRYPTO_AES_NI_INTEL_586
+	tristate "AES cipher algorithms (AES-NI)"
+	depends on (X86 || UML_X86) && !64BIT
+	select CRYPTO_AES_586
+	select CRYPTO_CRYPTD
+	select CRYPTO_ALGAPI
+	select CRYPTO_FPU
+	help
+	  Use Intel AES-NI instructions for AES algorithm.
+
+	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
+	  algorithm.
+
+	  Rijndael appears to be consistently a very good performer in
+	  both hardware and software across a wide range of computing
+	  environments regardless of its use in feedback or non-feedback
+	  modes. Its key setup time is excellent, and its key agility is
+	  good. Rijndael's very low memory requirements make it very well
+	  suited for restricted-space environments, in which it also
+	  demonstrates excellent performance. Rijndael's operations are
+	  among the easiest to defend against power and timing attacks.
+
+	  The AES specifies three key sizes: 128, 192 and 256 bits
+
+	  See <http://csrc.nist.gov/encryption/aes/> for more information.
+
+	  In addition to AES cipher algorithm support, the
+	  acceleration for some popular block cipher mode is supported
+	  too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+
+config CRYPTO_AES_NI_INTEL_X86_64
 	tristate "AES cipher algorithms (AES-NI)"
 	depends on (X86 || UML_X86) && 64BIT
 	select CRYPTO_AES_X86_64
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] x86, crypto: ported aes-ni implementation to x86
  2010-10-29 21:10 [PATCH] x86, crypto: ported aes-ni implementation to x86 Mathias Krause
@ 2010-10-29 22:15 ` Herbert Xu
  2010-10-29 22:51   ` Mathias Krause
                     ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Herbert Xu @ 2010-10-29 22:15 UTC (permalink / raw)
  To: Mathias Krause; +Cc: linux-crypto, minipli

Mathias Krause <minipli@googlemail.com> wrote:
> The AES-NI instructions are also available in legacy mode so the x86
> architecture may profit from those, too.
> 
> To illustrate the performance gain here's a short summary of the tcrypt
> speed test on a Core i5 M 520 running at 2.40GHz comparing both
> assembler implementations:
> 
>                             aes-i586   aes-ni-i586   delta
> 256 bit, 8kB blocks, ECB:  46.81 MB/s   164.46 MB/s   +251%
> 256 bit, 8kB blocks, CBC:  43.89 MB/s    62.18 MB/s    +41%
> 384 bit, 8kB blocks, LRW:  42.24 MB/s   142.90 MB/s   +238%
> 512 bit, 8kB blocks, XTS:  43.41 MB/s   148.67 MB/s   +242%
> 
> Signed-off-by: Mathias Krause <minipli@googlemail.com>

Nice work :)

I have to say though that I'll love this even more if we could
avoid duplicating those assembly files somehow.  Is this possible?

Oh and those CBC numbers look out of whack.  I'd expect CBC to be
way faster as it's done directly by the hardware unlike the
other modes.  What numbers do you get in 64-bit before/after
your patch?

If the hardware CBC is really so much slower then maybe we should
stop using it.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] x86, crypto: ported aes-ni implementation to x86
  2010-10-29 22:15 ` Herbert Xu
@ 2010-10-29 22:51   ` Mathias Krause
  2010-10-31 19:32   ` Mathias Krause
  2010-11-03 12:47   ` Mathias Krause
  2 siblings, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2010-10-29 22:51 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-crypto, Huang Ying

On 30.10.2010, 00:15 Herbert Xu wrote:
> Mathias Krause <minipli@googlemail.com> wrote:
>> The AES-NI instructions are also available in legacy mode so the x86
>> architecture may profit from those, too.
>> 
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i5 M 520 running at 2.40GHz comparing both
>> assembler implementations:
>> 
>>                            aes-i586   aes-ni-i586   delta
>> 256 bit, 8kB blocks, ECB:  46.81 MB/s   164.46 MB/s   +251%
>> 256 bit, 8kB blocks, CBC:  43.89 MB/s    62.18 MB/s    +41%
>> 384 bit, 8kB blocks, LRW:  42.24 MB/s   142.90 MB/s   +238%
>> 512 bit, 8kB blocks, XTS:  43.41 MB/s   148.67 MB/s   +242%
>> 
>> Signed-off-by: Mathias Krause <minipli@googlemail.com>
> 
> Nice work :)
> 
> I have to say though that I'll love this even more if we could
> avoid duplicating those assembly files somehow.  Is this possible?

I thought about that too but found it easier to split those files.
The different calling conventions of the architectures and the limited 
register set on the 32-bit version made me make some not so nice 
#ifdef-able changes to the code so it'll work with less registers.

> Oh and those CBC numbers look out of whack.  I'd expect CBC to be
> way faster as it's done directly by the hardware unlike the
> other modes.

Well, actually the 32-bit assembler implementation has specialized 
algorithms for ECB and CBC. But the latter must be implemented a 
little different than the 64-bit version because I didn't have enough 
xmm registers to make a 1:1 port. So I reused some registers for 
loading memory values and used direct memory references to make 
aesni_cbc_dec() work with the limited amount of registers.

I'll look into whether we can do better, but if not, maybe leaving this 
one out for the 32-bit version might be the best option. Doing so may
even make it easier to combine the two assembler files again.

Btw., because of the limited register set I wasn't able to port the 
CTR mode version, yet. It uses even more registers -- xmm and general 
purpose. :(

>  What numbers do you get in 64-bit before/after
> your patch?

Haven't yet build a 64-bit kernel but will try that tomorrow.

> If the hardware CBC is really so much slower then maybe we should
> stop using it.

This must be related to the changes I made to the code. I would guess 
it doesn't like the additional memory loads.

There's even more potential for optimization since I've still a 
general purpose register left. ;)

See this version as a first version to get feedback, especially from 
Huang Ying. But it's already quite fast. :)


Regards,
Mathias

> 
> Thanks,
> -- 
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] x86, crypto: ported aes-ni implementation to x86
  2010-10-29 22:15 ` Herbert Xu
  2010-10-29 22:51   ` Mathias Krause
@ 2010-10-31 19:32   ` Mathias Krause
  2010-11-03 12:47   ` Mathias Krause
  2 siblings, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2010-10-31 19:32 UTC (permalink / raw)
  To: Herbert Xu, Huang Ying; +Cc: linux-crypto

On 30.10.2010, 00:15 Herbert Xu wrote:
> Mathias Krause <minipli@googlemail.com> wrote:
>> To illustrate the performance gain here's a short summary of the tcrypt
>> speed test on a Core i5 M 520 running at 2.40GHz comparing both
>> assembler implementations:
>> 
>>                            aes-i586   aes-ni-i586   delta
>> 256 bit, 8kB blocks, ECB:  46.81 MB/s   164.46 MB/s   +251%
>> 256 bit, 8kB blocks, CBC:  43.89 MB/s    62.18 MB/s    +41%
>> 384 bit, 8kB blocks, LRW:  42.24 MB/s   142.90 MB/s   +238%
>> 512 bit, 8kB blocks, XTS:  43.41 MB/s   148.67 MB/s   +242%
>> 
>> Signed-off-by: Mathias Krause <minipli@googlemail.com>
> 
> Oh and those CBC numbers look out of whack.  I'd expect CBC to be
> way faster as it's done directly by the hardware unlike the
> other modes.  What numbers do you get in 64-bit before/after
> your patch?
> 
> If the hardware CBC is really so much slower then maybe we should
> stop using it.

Today I built and measured a 64-bit version without my changes and got 
results for the above tests at around 60 to 66 MB/s which is ridiculous! 
So I ran the test again and again and noticed that _sometimes_ I got 
results for _some_ algorithms at 150 to 160 MB/s. That's weird!

Testing the 32-bit version again (with my patch) I even got 151 MB/s for 
the CBC mode, albeit now other algorithms were down to 58 - 67 MB/s. 
Strange. Looks like I was just lucky with my first measurement. :/

I don't know why the numbers do vary that much. Maybe it's some magic in 
the processor deactivating some cores and the kernel scheduling work to 
the wrong core. Nevertheless my system under test was otherwise idle. I 
booted a minimal initramfs based system with no services at all but the 
ability to load the tcrypt module.

Maybe Huang Ying can give us some insight why the numbers do vary that 
much? My test case was 'modprobe tcrypt mode=200 sec=10' (for later 
tests I reduced the sec parameter to 1 in favor of doing multiple runs). 
If that's an inappropriate test for the Intel AES instructions maybe 
somebody can tell me how to do better? Maybe dd to a cryptoloop device?

Regards,
Mathias

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] x86, crypto: ported aes-ni implementation to x86
  2010-10-29 22:15 ` Herbert Xu
  2010-10-29 22:51   ` Mathias Krause
  2010-10-31 19:32   ` Mathias Krause
@ 2010-11-03 12:47   ` Mathias Krause
  2 siblings, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2010-11-03 12:47 UTC (permalink / raw)
  To: Herbert Xu, Huang Ying; +Cc: linux-crypto

Hi,

I modified the patch so it doesn't introduce a copy of the existing
assembler implementation but modifies the existing one to be usable for
64 and 32 bit. Additionally I added some alignment constraints for
internal functions which resulted in a noticeable speed-up.

I reran the tests on another machine, a Core i7 M620, 2.67GHz. I also
took the "low-end" numbers for the AES-NI variants because I didn't
want to wait for the big numbers to come every now and then any more ;)
So here is the comparison of 5 consecutive tcrypt test runs for some
selected algorithms in MiB/s:

x86-64 (old):       1. run  2. run  3. run  4. run  5. run    mean
ECB, 256 bit, 8kB:  152.49  152.58  152.51  151.80  151.87  152.25
CBC, 256 bit, 8kB:  144.32  144.44  144.35  143.75  143.75  144.12
LRW, 320 bit, 8kB:  159.41  159.21  159.21  158.55  159.28  159.13
XTS, 512 bit, 8kB:  144.87  142.88  144.75  144.11  144.75  144.27

x86-64 (new):       1. run  2. run  3. run  4. run  5. run    mean
ECB, 256 bit, 8kB:  184.07  184.07  183.50  183.50  184.07  183.84
CBC, 256 bit, 8kB:  170.25  170.24  169.71  169.71  170.25  170.03
LRW, 320 bit, 8kB:  169.91  169.91  169.39  169.37  169.91  169.69
XTS, 512 bit, 8kB:  172.39  172.35  171.82  171.82  172.35  172.14

i586:               1. run  2. run  3. run  4. run  5. run    mean
ECB, 256 bit, 8kB:  125.98  126.03  126.03  125.64  126.03  125.94
CBC, 256 bit, 8kB:  118.18  118.19  117.84  117.84  118.19  118.04
LRW, 320 bit, 8kB:  128.37  128.35  127.97  127.98  128.35  128.20
XTS, 512 bit, 8kB:  118.52  118.50  118.14  118.14  118.49  118.35

x86 (AES-NI):       1. run  2. run  3. run  4. run  5. run    mean
ECB, 256 bit, 8kB:  187.33  187.34  187.33  186.75  186.74  187.09
CBC, 256 bit, 8kB:  171.84  171.84  171.84  171.28  171.28  171.61
LRW, 320 bit, 8kB:  168.54  168.54  168.53  168.00  168.02  168.32
XTS, 512 bit, 8kB:  166.61  166.60  166.60  166.08  166.60  166.49

Comparing the mean values gives us:

x86-64:                old     new   delta
ECB, 256 bit, 8kB:  152.25  183.84  +20.7%
CBC, 256 bit, 8kB:  144.12  170.03  +18.0%
LRW, 320 bit, 8kB:  159.13  169.69   +6.6%
XTS, 512 bit, 8kB:  144.27  172.14  +19.3%

x86:                  i586  aes-ni   delta
ECB, 256 bit, 8kB:  125.94  187.09  +48.6%
CBC, 256 bit, 8kB:  118.04  171.61  +45.4%
LRW, 320 bit, 8kB:  128.20  168.32  +31.3%
XTS, 512 bit, 8kB:  118.35  166.49  +40.7%

The funny thing is that the 32 bit implementation is sometimes even
faster than the 64 bit one. Nevertheless the minor optimization of
aligning function entries gave the 64 bit version quite a big
performance gain (up to 20%).

I'll post the new version of the patch in a follow-up email.

Regards,
Mathias

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] x86, crypto: ported aes-ni implementation to x86
@ 2010-11-03 12:53 Mathias Krause
  0 siblings, 0 replies; 6+ messages in thread
From: Mathias Krause @ 2010-11-03 12:53 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu, Huang Ying; +Cc: Mathias Krause

The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.

To illustrate the performance gain here's a short summary of the tcrypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:

x86:                              i586       aes-ni   delta
256 bit, 8kB blocks, ECB:  125.94 MB/s  187.09 MB/s  +48.6%
256 bit, 8kB blocks, CBC:  118.04 MB/s  171.61 MB/s  +45.4%
320 bit, 8kB blocks, LRW:  128.20 MB/s  168.32 MB/s  +31.3%
512 bit, 8kB blocks, XTS:  118.35 MB/s  166.49 MB/s  +40.7%

Additionally, due to some minor optimizations, the 64-bit version also
gets a performance gain of up to 20% as seen below:

x86-64:                      old impl.     new impl.  delta
256 bit, 8kB blocks, ECB:  152.25 MB/s  183.84 MB/s  +20.7%
256 bit, 8kB blocks, CBC:  144.12 MB/s  170.03 MB/s  +18.0%
320 bit, 8kB blocks, LRW:  159.13 MB/s  169.69 MB/s   +6.6%
512 bit, 8kB blocks, XTS:  144.27 MB/s  172.14 MB/s  +19.3%

Signed-off-by: Mathias Krause <minipli@googlemail.com>
---
v2 changes:
* hide almost all register names in macros so the same code base can be shared
  between x86 and x86_64
* unified Kconfig documentation again
* added alignment constraints for internal functions.
---
 arch/x86/crypto/aesni-intel_asm.S  |  149 ++++++++++++++++++++++++++++-------
 arch/x86/crypto/aesni-intel_glue.c |   22 ++++-
 crypto/Kconfig                     |    8 +-
 3 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..bf810b7 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,9 @@
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
  *
+ * Ported x86_64 version to x86:
+ *    Author: Mathias Krause <minipli@googlemail.com>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -32,12 +35,16 @@
 #define IN	IN1
 #define KEY	%xmm2
 #define IV	%xmm3
+
 #define BSWAP_MASK %xmm10
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#ifdef __x86_64__
+#define AREG	%rax
 #define KEYP	%rdi
 #define OUTP	%rsi
+#define UKEYP	OUTP
 #define INP	%rdx
 #define LEN	%rcx
 #define IVP	%r8
@@ -46,6 +53,18 @@
 #define TKEYP	T1
 #define T2	%r11
 #define TCTR_LOW T2
+#else
+#define AREG	%eax
+#define KEYP	%edi
+#define OUTP	AREG
+#define UKEYP	OUTP
+#define INP	%edx
+#define LEN	%esi
+#define IVP	%ebp
+#define KLEN	%ebx
+#define T1	%ecx
+#define TKEYP	T1
+#endif
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -55,10 +74,11 @@ _key_expansion_256a:
 	shufps $0b10001100, %xmm0, %xmm4
 	pxor %xmm4, %xmm0
 	pxor %xmm1, %xmm0
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm0, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
+.align 4
 _key_expansion_192a:
 	pshufd $0b01010101, %xmm1, %xmm1
 	shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +96,13 @@ _key_expansion_192a:
 
 	movaps %xmm0, %xmm1
 	shufps $0b01000100, %xmm0, %xmm6
-	movaps %xmm6, (%rcx)
+	movaps %xmm6, (TKEYP)
 	shufps $0b01001110, %xmm2, %xmm1
-	movaps %xmm1, 16(%rcx)
-	add $0x20, %rcx
+	movaps %xmm1, 0x10(TKEYP)
+	add $0x20, TKEYP
 	ret
 
+.align 4
 _key_expansion_192b:
 	pshufd $0b01010101, %xmm1, %xmm1
 	shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +117,11 @@ _key_expansion_192b:
 	pxor %xmm3, %xmm2
 	pxor %xmm5, %xmm2
 
-	movaps %xmm0, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm0, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
+.align 4
 _key_expansion_256b:
 	pshufd $0b10101010, %xmm1, %xmm1
 	shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +129,8 @@ _key_expansion_256b:
 	shufps $0b10001100, %xmm2, %xmm4
 	pxor %xmm4, %xmm2
 	pxor %xmm1, %xmm2
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
+	movaps %xmm2, (TKEYP)
+	add $0x10, TKEYP
 	ret
 
 /*
@@ -116,17 +138,23 @@ _key_expansion_256b:
  *                   unsigned int key_len)
  */
 ENTRY(aesni_set_key)
-	movups (%rsi), %xmm0		# user key (first 16 bytes)
-	movaps %xmm0, (%rdi)
-	lea 0x10(%rdi), %rcx		# key addr
-	movl %edx, 480(%rdi)
+#ifndef __x86_64__
+	pushl KEYP
+	movl 8(%esp), KEYP		# ctx
+	movl 12(%esp), UKEYP		# in_key
+	movl 16(%esp), %edx		# key_len
+#endif
+	movups (UKEYP), %xmm0		# user key (first 16 bytes)
+	movaps %xmm0, (KEYP)
+	lea 0x10(KEYP), TKEYP		# key addr
+	movl %edx, 480(KEYP)
 	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
 	cmp $24, %dl
 	jb .Lenc_key128
 	je .Lenc_key192
-	movups 0x10(%rsi), %xmm2	# other user key
-	movaps %xmm2, (%rcx)
-	add $0x10, %rcx
+	movups 0x10(UKEYP), %xmm2	# other user key
+	movaps %xmm2, (TKEYP)
+	add $0x10, TKEYP
 	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_256a
 	AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +183,7 @@ ENTRY(aesni_set_key)
 	call _key_expansion_256a
 	jmp .Ldec_key
 .Lenc_key192:
-	movq 0x10(%rsi), %xmm2		# other user key
+	movq 0x10(UKEYP), %xmm2		# other user key
 	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
 	call _key_expansion_192a
 	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
@@ -195,33 +223,47 @@ ENTRY(aesni_set_key)
 	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
 	call _key_expansion_128
 .Ldec_key:
-	sub $0x10, %rcx
-	movaps (%rdi), %xmm0
-	movaps (%rcx), %xmm1
-	movaps %xmm0, 240(%rcx)
-	movaps %xmm1, 240(%rdi)
-	add $0x10, %rdi
-	lea 240-16(%rcx), %rsi
+	sub $0x10, TKEYP
+	movaps (KEYP), %xmm0
+	movaps (TKEYP), %xmm1
+	movaps %xmm0, 240(TKEYP)
+	movaps %xmm1, 240(KEYP)
+	add $0x10, KEYP
+	lea 240-16(TKEYP), UKEYP
 .align 4
 .Ldec_key_loop:
-	movaps (%rdi), %xmm0
+	movaps (KEYP), %xmm0
 	AESIMC %xmm0 %xmm1
-	movaps %xmm1, (%rsi)
-	add $0x10, %rdi
-	sub $0x10, %rsi
-	cmp %rcx, %rdi
+	movaps %xmm1, (UKEYP)
+	add $0x10, KEYP
+	sub $0x10, UKEYP
+	cmp TKEYP, KEYP
 	jb .Ldec_key_loop
-	xor %rax, %rax
+	xor AREG, AREG
+#ifndef __x86_64__
+	popl KEYP
+#endif
 	ret
 
 /*
  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_enc)
+#ifndef __x86_64__
+	pushl KEYP
+	pushl KLEN
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+#endif
 	movl 480(KEYP), KLEN		# key length
 	movups (INP), STATE		# input
 	call _aesni_enc1
 	movups STATE, (OUTP)		# output
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+#endif
 	ret
 
 /*
@@ -236,6 +278,7 @@ ENTRY(aesni_enc)
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_enc1:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -298,6 +341,7 @@ _aesni_enc1:
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_enc4:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -391,11 +435,22 @@ _aesni_enc4:
  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  */
 ENTRY(aesni_dec)
+#ifndef __x86_64__
+	pushl KEYP
+	pushl KLEN
+	movl 12(%esp), KEYP
+	movl 16(%esp), OUTP
+	movl 20(%esp), INP
+#endif
 	mov 480(KEYP), KLEN		# key length
 	add $240, KEYP
 	movups (INP), STATE		# input
 	call _aesni_dec1
 	movups STATE, (OUTP)		#output
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+#endif
 	ret
 
 /*
@@ -410,6 +465,7 @@ ENTRY(aesni_dec)
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_dec1:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -472,6 +528,7 @@ _aesni_dec1:
  *	KEY
  *	TKEYP (T1)
  */
+.align 4
 _aesni_dec4:
 	movaps (KEYP), KEY		# key
 	mov KEYP, TKEYP
@@ -566,6 +623,15 @@ _aesni_dec4:
  *		      size_t len)
  */
 ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+#endif
 	test LEN, LEN		# check length
 	jz .Lecb_enc_ret
 	mov 480(KEYP), KLEN
@@ -609,6 +675,15 @@ ENTRY(aesni_ecb_enc)
  *		      size_t len);
  */
 ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl 16(%esp), KEYP
+	movl 20(%esp), OUTP
+	movl 24(%esp), INP
+	movl 28(%esp), LEN
+#endif
 	test LEN, LEN
 	jz .Lecb_dec_ret
 	mov 480(KEYP), KLEN
@@ -646,6 +721,11 @@ ENTRY(aesni_ecb_dec)
 	cmp $16, LEN
 	jge .Lecb_dec_loop1
 .Lecb_dec_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+#endif
 	ret
 
 /*
@@ -670,8 +750,14 @@ ENTRY(aesni_cbc_enc)
 	jge .Lcbc_enc_loop
 	movups STATE, (IVP)
 .Lcbc_enc_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+#endif
 	ret
 
+#ifdef __x86_64__
 /*
  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
  *		      size_t len, u8 *iv)
@@ -744,6 +830,7 @@ ENTRY(aesni_cbc_dec)
  *	INC:	== 1, in little endian
  *	BSWAP_MASK == endian swapping mask
  */
+.align 4
 _aesni_inc_init:
 	movaps .Lbswap_mask, BSWAP_MASK
 	movaps IV, CTR
@@ -768,6 +855,7 @@ _aesni_inc_init:
  *	CTR:	== output IV, in little endian
  *	TCTR_LOW: == lower qword of CTR
  */
+.align 4
 _aesni_inc:
 	paddq INC, CTR
 	add $1, TCTR_LOW
@@ -839,3 +927,4 @@ ENTRY(aesni_ctr_enc)
 	movups IV, (IVP)
 .Lctr_enc_just_ret:
 	ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..d0f0e7b 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -55,12 +55,14 @@ asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len);
 asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len);
+#ifdef CONFIG_X86_64
 asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+#endif
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -254,6 +256,7 @@ static struct crypto_alg blk_ecb_alg = {
 	},
 };
 
+#ifdef CONFIG_X86_64
 static int cbc_encrypt(struct blkcipher_desc *desc,
 		       struct scatterlist *dst, struct scatterlist *src,
 		       unsigned int nbytes)
@@ -389,6 +392,7 @@ static struct crypto_alg blk_ctr_alg = {
 		},
 	},
 };
+#endif
 
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
@@ -500,6 +504,7 @@ static struct crypto_alg ablk_ecb_alg = {
 	},
 };
 
+#ifdef CONFIG_X86_64
 static int ablk_cbc_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +617,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
 	},
 };
 #endif
+#endif
 
 #ifdef HAS_LRW
 static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -744,12 +750,13 @@ static int __init aesni_init(void)
 		goto __aes_err;
 	if ((err = crypto_register_alg(&blk_ecb_alg)))
 		goto blk_ecb_err;
+	if ((err = crypto_register_alg(&ablk_ecb_alg)))
+		goto ablk_ecb_err;
+#ifdef CONFIG_X86_64
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
 	if ((err = crypto_register_alg(&blk_ctr_alg)))
 		goto blk_ctr_err;
-	if ((err = crypto_register_alg(&ablk_ecb_alg)))
-		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
@@ -758,6 +765,7 @@ static int __init aesni_init(void)
 	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
 		goto ablk_rfc3686_ctr_err;
 #endif
+#endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
 		goto ablk_lrw_err;
@@ -784,6 +792,7 @@ ablk_pcbc_err:
 	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 ablk_rfc3686_ctr_err:
@@ -792,12 +801,13 @@ ablk_rfc3686_ctr_err:
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
-	crypto_unregister_alg(&ablk_ecb_alg);
-ablk_ecb_err:
 	crypto_unregister_alg(&blk_ctr_alg);
 blk_ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
+#endif
+	crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
 	crypto_unregister_alg(&blk_ecb_alg);
 blk_ecb_err:
 	crypto_unregister_alg(&__aesni_alg);
@@ -818,14 +828,16 @@ static void __exit aesni_exit(void)
 #ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
+#ifdef CONFIG_X86_64
 #ifdef HAS_CTR
 	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
-	crypto_unregister_alg(&ablk_ecb_alg);
 	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
+#endif
+	crypto_unregister_alg(&ablk_ecb_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
 	crypto_unregister_alg(&aesni_alg);
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e4bac29..459fd35 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -539,8 +539,9 @@ config CRYPTO_AES_X86_64
 
 config CRYPTO_AES_NI_INTEL
 	tristate "AES cipher algorithms (AES-NI)"
-	depends on (X86 || UML_X86) && 64BIT
-	select CRYPTO_AES_X86_64
+	depends on (X86 || UML_X86)
+	select CRYPTO_AES_X86_64 if 64BIT
+	select CRYPTO_AES_586 if !64BIT
 	select CRYPTO_CRYPTD
 	select CRYPTO_ALGAPI
 	select CRYPTO_FPU
@@ -565,7 +566,8 @@ config CRYPTO_AES_NI_INTEL
 
 	  In addition to AES cipher algorithm support, the
 	  acceleration for some popular block cipher mode is supported
-	  too, including ECB, CBC, CTR, LRW, PCBC, XTS.
+	  too, including ECB, LRW, PCBC, XTS. The 64 bit version has
+	  additional acceleration for CBC and CTR.
 
 config CRYPTO_ANUBIS
 	tristate "Anubis cipher algorithm"
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2010-11-03 12:53 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-29 21:10 [PATCH] x86, crypto: ported aes-ni implementation to x86 Mathias Krause
2010-10-29 22:15 ` Herbert Xu
2010-10-29 22:51   ` Mathias Krause
2010-10-31 19:32   ` Mathias Krause
2010-11-03 12:47   ` Mathias Krause
2010-11-03 12:53 Mathias Krause

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.