linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
@ 2010-09-10 17:36 Hoban, Adrian
  2010-09-20  8:07 ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Hoban, Adrian @ 2010-09-10 17:36 UTC (permalink / raw)
  To: linux-crypto, Herbert Xu; +Cc: linux-kernel

This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports 128-bit AES key size. This leverages the crypto
AEAD interface type to facilitate a combined AES & GCM operation to
be implemented in assembly code. The assembly code leverages Intel(R)
AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Edward Clinton <edward.clinton@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 1112 +++++++++++++++++++++++++++++++++++-
 arch/x86/crypto/aesni-intel_glue.c |  518 +++++++++++++++++-
 2 files changed, 1627 insertions(+), 3 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..3950769 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -8,6 +8,17 @@
  *    Author: Huang Ying <ying.huang@intel.com>
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,9 +27,50 @@
  */

 #include <linux/linkage.h>
-#include <asm/inst.h>
+
+
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+

 .text
+#define        STACK_OFFSET    8*3
+#define        HashKey         16*0    // store HashKey <<1 mod poly here
+#define        HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
+#define        HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
+#define        HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
+#define        HashKey_k       16*4    // store XOR of High 64 bits and Low 64 bits of  HashKey <<1 mod poly here (for Karatsuba purposes)
+#define        HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64 bits of  HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+#define        HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64 bits of  HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+#define        HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64 bits of  HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+#define        VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+

 #define STATE1 %xmm0
 #define STATE2 %xmm4
@@ -47,6 +99,1064 @@
 #define T2     %r11
 #define TCTR_LOW T2

+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+       movdqa    \GH, \TMP1
+       pshufd    $78, \GH, \TMP2
+       pshufd    $78, \HK, \TMP3
+       pxor      \GH, \TMP2            # TMP2 = a1+a0
+       pxor      \HK, \TMP3            # TMP3 = b1+b0
+       pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
+       pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
+       pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+       pxor      \GH, \TMP2
+       pxor      \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3             # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2             # right shift TMP2 2 DWs
+       pxor      \TMP3, \GH
+       pxor      \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+       movdqa    \GH, \TMP2
+       movdqa    \GH, \TMP3
+       movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+       pslld     $31, \TMP2            # packed right shift <<31
+       pslld     $30, \TMP3            # packed right shift <<30
+       pslld     $25, \TMP4            # packed right shift <<25
+       pxor      \TMP3, \TMP2          # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5             # right shift TMP5 1 DW
+       pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+       pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+       movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+       movdqa    \GH,\TMP3
+       movdqa    \GH,\TMP4
+       psrld     $1,\TMP2              # packed left shift >>1
+       psrld     $2,\TMP3              # packed left shift >>2
+       psrld     $7,\TMP4              # packed left shift >>7
+       pxor      \TMP3,\TMP2           # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \GH
+       pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+       mov        arg7, %r10           # %r10 = AAD
+       mov        arg8, %r12           # %r12 = aadLen
+       mov        %r12, %r11
+       pxor       %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+       movd       (%r10), \TMP1
+       pslldq     $12, \TMP1
+       psrldq     $4, %xmm\i
+       pxor       \TMP1, %xmm\i
+       add        $4, %r10
+       sub        $4, %r12
+       jne        _get_AAD_loop\num_initial_blocks\operation
+       cmp        $16, %r11
+       je         _get_AAD_loop2_done\num_initial_blocks\operation
+       mov        $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+       psrldq     $4, %xmm\i
+       sub        $4, %r12
+       cmp        %r11, %r12
+       jne        _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+       pshufb     SHUF_MASK(%rip), %xmm\i          # byte-reflect the AAD data
+       xor        %r11, %r11                       # initialise the data pointer offset as zero
+
+
+        # start AES for num_initial_blocks blocks
+
+       mov        %arg5, %rax                      # %rax = *Y0
+       movdqu     (%rax), \XMM0                    # XMM0 = Y0
+       pshufb     SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+       paddd      ONE(%rip), \XMM0                 # INCR Y0
+       movdqa     \XMM0, %xmm\index
+       pshufb     SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+       pxor       16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+       aesenc     16*1(%rdi), %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+       aesenc     16*2(%arg1), %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       aesenc     16*3(%arg1), %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+       aesenc     16*4(%arg1), %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+       aesenc     16*5(%arg1), %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+       aesenc     16*6(%arg1), %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+       aesenc     16*7(%arg1), %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+       aesenc     16*8(%arg1), %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+       aesenc     16*9(%arg1), %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+       aesenclast 16*10(%arg1), %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+       movdqu     (%arg3 , %r11, 1), \TMP1
+       pxor       \TMP1, %xmm\index
+       movdqu     %xmm\index, (%arg2 , %r11, 1)    # write back plaintext/ciphertext for num_initial_blocks
+       add        $16, %r11
+.if \operation == dec
+       movdqa     \TMP1, %xmm\index
+.endif
+       pshufb     SHUF_MASK(%rip), %xmm\index      # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+       cmp        $64, %r13
+       jl      _initial_blocks_done\num_initial_blocks\operation   # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM1
+       pshufb     SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM2
+       pshufb     SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM3
+       pshufb     SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM4
+       pshufb     SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+       pxor       16*0(%arg1), \XMM1
+       pxor       16*0(%arg1), \XMM2
+       pxor       16*0(%arg1), \XMM3
+       pxor       16*0(%arg1), \XMM4
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%rsp)
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%rsp)                           # HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+       aesenc     16*\index(%arg1), \XMM1
+       aesenc     16*\index(%arg1), \XMM2
+       aesenc     16*\index(%arg1), \XMM3
+       aesenc     16*\index(%arg1), \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+       aesenc     16*\index(%arg1), \XMM1
+       aesenc     16*\index(%arg1), \XMM2
+       aesenc     16*\index(%arg1), \XMM3
+       aesenc     16*\index(%arg1), \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%rsp)
+       aesenclast 160(%arg1), \XMM1
+       aesenclast 160(%arg1), \XMM2
+       aesenclast 160(%arg1), \XMM3
+       aesenclast 160(%arg1), \XMM4
+       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM1
+.if \operation == dec
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM1
+.endif
+       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM2
+.if \operation == dec
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM2
+.endif
+       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM3
+.if \operation == dec
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM3
+.endif
+       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM4
+.if \operation == dec
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM4
+.else
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+       add        $64, %r11
+       pshufb     SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+       pxor       \XMMDst, \XMM1         # combine GHASHed value with the corresponding ciphertext
+       pshufb     SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+       pshufb     SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+       pshufb     SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+       movdqa    \XMM1, \XMM5
+       movdqa    \XMM2, \XMM6
+       movdqa    \XMM3, \XMM7
+       movdqa    \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+       movdqa    \XMM5, \TMP4
+       pshufd    $78, \XMM5, \TMP6
+       pxor      \XMM5, \TMP6
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    HashKey_4(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+       movdqa    \XMM0, \XMM1
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM2
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM3
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM4
+       pshufb    SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+       pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+       pshufb    SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+       pxor      (%arg1), \XMM1
+       pxor      (%arg1), \XMM2
+       pxor      (%arg1), \XMM3
+       pxor      (%arg1), \XMM4
+       movdqa    HashKey_4_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+       aesenc    16(%arg1), \XMM1              # Round 1
+       aesenc    16(%arg1), \XMM2
+       aesenc    16(%arg1), \XMM3
+       aesenc    16(%arg1), \XMM4
+       aesenc    32(%arg1), \XMM1              # Round 2
+       aesenc    32(%arg1), \XMM2
+       aesenc    32(%arg1), \XMM3
+       aesenc    32(%arg1), \XMM4
+       movdqa    \XMM6, \TMP1
+       pshufd    $78, \XMM6, \TMP2
+       pxor      \XMM6, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+       aesenc    48(%arg1), \XMM1              # Round 3
+       aesenc    48(%arg1), \XMM2
+       aesenc    48(%arg1), \XMM3
+       aesenc    48(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+       aesenc    64(%arg1), \XMM1              # Round 4
+       aesenc    64(%arg1), \XMM2
+       aesenc    64(%arg1), \XMM3
+       aesenc    64(%arg1), \XMM4
+       movdqa    HashKey_3_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       aesenc    80(%arg1), \XMM1              # Round 5
+       aesenc    80(%arg1), \XMM2
+       aesenc    80(%arg1), \XMM3
+       aesenc    80(%arg1), \XMM4
+       pxor      \TMP1, \TMP4                  # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM6, \XMM5
+       pxor      \TMP2, \TMP6
+       movdqa    \XMM7, \TMP1
+       pshufd    $78, \XMM7, \TMP2
+       pxor      \XMM7, \TMP2
+       movdqa    HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+       pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+       aesenc    96(%arg1), \XMM1              # Round 6
+       aesenc    96(%arg1), \XMM2
+       aesenc    96(%arg1), \XMM3
+       aesenc    96(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+       aesenc    112(%arg1), \XMM1             # Round 7
+       aesenc    112(%arg1), \XMM2
+       aesenc    112(%arg1), \XMM3
+       aesenc    112(%arg1), \XMM4
+       movdqa    HashKey_2_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       aesenc    128(%arg1), \XMM1             # Round 8
+       aesenc    128(%arg1), \XMM2
+       aesenc    128(%arg1), \XMM3
+       aesenc    128(%arg1), \XMM4
+       pxor      \TMP1, \TMP4                  # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM7, \XMM5
+       pxor      \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+       movdqa    \XMM8, \TMP1
+       pshufd    $78, \XMM8, \TMP2
+       pxor      \XMM8, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+       aesenc    144(%arg1), \XMM1            # Round 9
+       aesenc    144(%arg1), \XMM2
+       aesenc    144(%arg1), \XMM3
+       aesenc    144(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+       aesenclast 160(%arg1), \XMM1           # Round 10
+       aesenclast 160(%arg1), \XMM2
+       aesenclast 160(%arg1), \XMM3
+       aesenclast 160(%arg1), \XMM4
+       movdqa    HashKey_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+       movdqu    (%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+       movdqa    \TMP3, \XMM1
+.endif
+       movdqu    16(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM2
+.endif
+       movdqu    32(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM3
+.endif
+       movdqu    48(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM4
+.else
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+.endif
+       pshufb    SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+       pxor      \TMP4, \TMP1
+       pxor      \XMM8, \XMM5
+       pxor      \TMP6, \TMP2
+       pxor      \TMP1, \TMP2
+       pxor      \XMM5, \TMP2
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
+       pxor      \TMP3, \XMM5
+       pxor      \TMP2, \TMP1                 # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+       movdqa    \XMM5, \TMP2
+       movdqa    \XMM5, \TMP3
+       movdqa    \XMM5, \TMP4                 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+       pslld     $31, \TMP2                   # packed right shift << 31
+       pslld     $30, \TMP3                   # packed right shift << 30
+       pslld     $25, \TMP4                   # packed right shift << 25
+       pxor      \TMP3, \TMP2                 # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5                    # right shift T5 1 DW
+       pslldq    $12, \TMP2                   # left shift T2 3 DWs
+       pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+       movdqa    \XMM5,\TMP2                  # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+       movdqa    \XMM5,\TMP3
+       movdqa    \XMM5,\TMP4
+       psrld     $1, \TMP2                    # packed left shift >>1
+       psrld     $2, \TMP3                    # packed left shift >>2
+       psrld     $7, \TMP4                    # packed left shift >>7
+       pxor      \TMP3,\TMP2                  # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \XMM5
+       pxor      \TMP1, \XMM5                 # result is in TMP1
+
+       pxor      \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+       movdqa    \XMM1, \TMP6
+       pshufd    $78, \XMM1, \TMP2
+       pxor      \XMM1, \TMP2
+       movdqa    HashKey_4(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+       movdqa    HashKey_4_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       movdqa    \XMM1, \XMMDst
+       movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM2, \TMP1
+       pshufd    $78, \XMM2, \TMP2
+       pxor      \XMM2, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+       movdqa    HashKey_3_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM2, \XMMDst
+       pxor      \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM3, \TMP1
+       pshufd    $78, \XMM3, \TMP2
+       pxor      \XMM3, \TMP2
+       movdqa    HashKey_2(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+       movdqa    HashKey_2_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM3, \XMMDst
+       pxor      \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM4, \TMP1
+       pshufd    $78, \XMM4, \TMP2
+       pxor      \XMM4, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+       movdqa    HashKey_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM4, \XMMDst
+       pxor      \XMM1, \TMP2
+       pxor      \TMP6, \TMP2
+       pxor      \XMMDst, \TMP2            # middle section of the temp results combined as in karatsuba algorithm
+       movdqa    \TMP2, \TMP4
+       pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
+       psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
+       pxor      \TMP4, \XMMDst
+       pxor      \TMP2, \TMP6              # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+
+        # first phase of the reduction
+
+       movdqa    \XMMDst, \TMP2
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4            # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+       pslld     $31, \TMP2                # packed right shifting << 31
+       pslld     $30, \TMP3                # packed right shifting << 30
+       pslld     $25, \TMP4                # packed right shifting << 25
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP7
+       psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+       pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+       pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+
+       movdqa    \XMMDst, \TMP2            # make 3 copies of XMMDst for doing 3 shift operations
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4
+       psrld     $1, \TMP2                 # packed left shift >> 1
+       psrld     $2, \TMP3                 # packed left shift >> 2
+       psrld     $7, \TMP4                 # packed left shift >> 7
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       pxor      \TMP7, \TMP2
+       pxor      \TMP2, \XMMDst
+       pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+
+       pxor    (%arg1), \XMM0
+       aesenc  16(%arg1), \XMM0
+       aesenc  32(%arg1), \XMM0
+       aesenc  48(%arg1), \XMM0
+       aesenc  64(%arg1), \XMM0
+       aesenc  80(%arg1), \XMM0
+       aesenc  96(%arg1), \XMM0
+       aesenc  112(%arg1), \XMM0
+       aesenc  128(%arg1), \XMM0
+       aesenc  144(%arg1), \XMM0
+       aesenclast 160(%arg1), \XMM0
+.endm
+
+
+/**********************************************************************************************
+*void aesni_gcm_dec(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,                     // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,                // Ciphertext input
+*                   u64 plaintext_len,           // Length of data in bytes for decryption.
+*                   u8 *iv,                      // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,             // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,               // Additional Authentication Data (AAD)
+*                   u64 aad_len,                 // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,               // Authenticated Tag output. The driver will compare this to the
+*                                                // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len);           // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*******************************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp                        # align rsp to 64 bytes
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13                    # %xmm13 = HashKey
+       pshufb  SHUF_MASK(%rip), %xmm13
+
+        # Precompute HashKey<<1 (mod poly) from the hash key (this is required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # Reduction
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13                   # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+       movdqa  %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+       mov     %arg4, %r13                     # save the number of bytes of plaintext/ciphertext
+       and     $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+       mov     %r13, %r12
+       and     $(3<<4), %r12
+       jz      _initial_num_blocks_is_0_decrypt
+       cmp     $(2<<4), %r12
+       jb      _initial_num_blocks_is_1_decrypt
+       je      _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+       INITIAL_BLOCKS  3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+       sub     $48, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+       sub     $32, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+       sub     $16, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+       cmp     $0, %r13
+       je      _zero_cipher_left_decrypt
+       sub     $64, %r13
+       je      _four_cipher_left_decrypt
+_decrypt_by_4:
+       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _decrypt_by_4
+_four_cipher_left_decrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                               # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+       paddd   ONE(%rip), %xmm0                        # increment CNT to get Yn
+       pshufb  SHUF_MASK(%rip), %xmm0
+       ENCRYPT_SINGLE_BLOCK    %xmm0                   # E(K, Yn)
+       sub     $16, %r11
+       add     %r13, %r11
+       movdqu  (%arg3,%r11,1), %xmm1                   # recieve the last <16 byte block
+       lea     SHIFT_MASK+16(%rip), %r12
+       sub     %r13, %r12                              # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+                                                        # (%r13 is the number of bytes in plaintext mod 16)
+       movdqu  (%r12), %xmm2                           # get the appropriate shuffle mask
+       pshufb  %xmm2, %xmm1                            # right shift 16-%r13 butes
+       movdqa  %xmm1, %xmm2
+       pxor    %xmm1, %xmm0                            # Ciphertext XOR E(K, Yn)
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1           # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+       pand    %xmm1, %xmm0                            # mask out top 16-%r13 bytes of %xmm0
+       pand    %xmm1, %xmm2
+       pshufb  SHUF_MASK(%rip),%xmm2
+       pxor    %xmm2, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6      # GHASH computation for the last <16 byte block
+       sub     %r13, %r11
+       add     $16, %r11
+
+        # output %r13 bytes
+
+       movq    %xmm0, %rax
+       cmp     $8, %r13
+       jle     _less_than_8_bytes_left_decrypt
+       mov     %rax, (%arg2 , %r11, 1)
+       add     $8, %r11
+       psrldq  $8, %xmm0
+       movq    %xmm0, %rax
+       sub     $8, %r13
+_less_than_8_bytes_left_decrypt:
+       mov     %al,  (%arg2, %r11, 1)
+       add     $1, %r11
+       shr     $8, %rax
+       sub     $1, %r13
+       jne     _less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+       mov     arg8, %r12                # %r13 = aadLen (number of bytes)
+       shl     $3, %r12                  # convert into number of bits
+       movd    %r12d, %xmm15             # len(A) in %xmm15
+       shl     $3, %arg4                 # len(C) in bits (*128)
+       movq    %arg4, %xmm1
+       pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6      # final GHASH computation
+       pshufb  SHUF_MASK(%rip), %xmm8
+       mov     %arg5, %rax               # %rax = *Y0
+       movdqu  (%rax), %xmm0             # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0     # E(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_decrypt:
+       mov     arg9, %r10                # %r10 = authTag
+       mov     arg10, %r11               # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_decrypt
+       cmp     $12, %r11
+       je      _T_12_decrypt
+_T_8_decrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_decrypt
+_T_12_decrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_decrypt
+_T_16_decrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_decrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+
+/***************************************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,                    // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,               // Plaintext input
+*                    u64 plaintext_len,          // Length of data in bytes for encryption.
+*                    u8 *iv,                     // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,            // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,              // Additional Authentication Data (AAD)
+*                    u64 aad_len,                // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,               // Authenticated Tag output.
+*                    u64 auth_tag_len);          // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+**************************************************************************************************************************/
+ENTRY(aesni_gcm_enc)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13
+       pshufb  SHUF_MASK(%rip), %xmm13
+
+        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13
+       movdqa  %xmm13, HashKey(%rsp)
+       mov     %arg4, %r13               # %xmm13 holds HashKey<<1 (mod poly)
+       and     $-16, %r13
+       mov     %r13, %r12
+
+        # Encrypt first few blocks
+
+       and     $(3<<4), %r12
+       jz      _initial_num_blocks_is_0_encrypt
+       cmp     $(2<<4), %r12
+       jb      _initial_num_blocks_is_1_encrypt
+       je      _initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+       INITIAL_BLOCKS  3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+       sub     $48, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+       sub     $32, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+       sub     $16, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+       cmp     $0, %r13
+       je      _zero_cipher_left_encrypt
+       sub     $64, %r13
+       je      _four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                       # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+
+       paddd   ONE(%rip), %xmm0                # INCR CNT to get Yn
+       pshufb  SHUF_MASK(%rip), %xmm0
+       ENCRYPT_SINGLE_BLOCK    %xmm0           # Encrypt(K, Yn)
+       sub     $16, %r11
+       add     %r13, %r11
+       movdqu  (%arg3,%r11,1), %xmm1           # recieve the last <16 byte blocks
+       lea     SHIFT_MASK+16(%rip), %r12
+       sub     %r13, %r12                      # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+                                                # (%r13 is the number of bytes in plaintext mod 16)
+       movdqu  (%r12), %xmm2                   # get the appropriate shuffle mask
+       pshufb  %xmm2, %xmm1                    # shift right 16-r13 byte
+       pxor    %xmm1, %xmm0                    # Plaintext XOR Encrypt(K, Yn)
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+       pand    %xmm1, %xmm0                    # mask out top 16-r13 bytes of xmm0
+
+       pshufb  SHUF_MASK(%rip),%xmm0
+       pxor    %xmm0, %xmm8
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6    # GHASH computation for the last <16 byte block
+       sub     %r13, %r11
+       add     $16, %r11
+       pshufb  SHUF_MASK(%rip), %xmm0          # shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+
+       movq    %xmm0, %rax
+       cmp     $8, %r13
+       jle     _less_than_8_bytes_left_encrypt
+       mov     %rax, (%arg2 , %r11, 1)
+       add     $8, %r11
+       psrldq  $8, %xmm0
+       movq    %xmm0, %rax
+       sub     $8, %r13
+_less_than_8_bytes_left_encrypt:
+       mov     %al,  (%arg2, %r11, 1)
+       add     $1, %r11
+       shr     $8, %rax
+       sub     $1, %r13
+       jne     _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+       mov     arg8, %r12                     # %r12 = addLen (number of bytes)
+       shl     $3, %r12
+       movd    %r12d, %xmm15                  # len(A) in %xmm15
+       shl     $3, %arg4                      # len(C) in bits (*128)
+       movq    %arg4, %xmm1
+       pslldq  $8, %xmm15                     # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15                  # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6      # final GHASH computation
+
+       pshufb  SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+       mov     %arg5, %rax                    # %rax  = *Y0
+       movdqu  (%rax), %xmm0                  # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0          # Encrypt(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_encrypt:
+       mov     arg9, %r10                     # %r10 = authTag
+       mov     arg10, %r11                    # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_encrypt
+       cmp     $12, %r11
+       je      _T_12_encrypt
+_T_8_encrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_encrypt
+_T_12_encrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_encrypt
+_T_16_encrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_encrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+#include <asm/inst.h>
+
 _key_expansion_128:
 _key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..f0b0a6f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>

 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
        struct cryptd_ablkcipher *cryptd_tfm;
 };

-#define AESNI_ALIGN    16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+       u8 hash_subkey[16];
+       struct crypto_aes_ctx aes_key_expanded;
+       u8 nonce[4];
+       struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+       int err;
+       struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+       u8 iv[16];
+       struct aesni_gcm_set_hash_subkey_result result;
+       struct scatterlist sg;
+};
+
+#define AESNI_ALIGN    (16)
 #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16

 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
                             unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);

+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+                       const u8 *in, unsigned long plaintext_len, u8 *iv,
+                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+                       u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+                       const u8 *in, unsigned long ciphertext_len, u8 *iv,
+                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+                       u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+       return
+               (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)
+               crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
        unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
 };
 #endif

+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+       struct cryptd_aead *cryptd_tfm;
+       struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+       cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+       if (IS_ERR(cryptd_tfm))
+               return PTR_ERR(cryptd_tfm);
+       ctx->cryptd_tfm = cryptd_tfm;
+       tfm->crt_aead.reqsize = sizeof(struct aead_request)
+               + crypto_aead_reqsize(&cryptd_tfm->base);
+       return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+       struct aesni_rfc4106_gcm_ctx *ctx =
+               (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+       if (!IS_ERR(ctx->cryptd_tfm))
+               cryptd_free_aead(ctx->cryptd_tfm);
+       return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+       struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+       if (err == -EINPROGRESS)
+               return;
+       result->err = err;
+       complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+       struct crypto_ablkcipher *ctr_tfm;
+       struct ablkcipher_request *req;
+       int ret = -EINVAL;
+       struct aesni_hash_subkey_req_data *req_data;
+
+       ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+       if (IS_ERR(ctr_tfm))
+               return PTR_ERR(ctr_tfm);
+
+       crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+       ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+       if (ret) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return ret;
+       }
+
+       req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+       if (!req) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return -EINVAL;
+       }
+
+       req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+       if (!req_data) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return -ENOMEM;
+       }
+       memset(req_data->iv, 0, sizeof(req_data->iv));
+
+       /* Clear the data in the hash sub key container to zero.*/
+       /* We want to cipher all zeros to create the hash sub key. */
+       memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+       init_completion(&req_data->result.completion);
+       sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+       ablkcipher_request_set_tfm(req, ctr_tfm);
+       ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+                                       CRYPTO_TFM_REQ_MAY_BACKLOG,
+                                       rfc4106_set_hash_subkey_done,
+                                       &req_data->result);
+
+       ablkcipher_request_set_crypt(req, &req_data->sg,
+               &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+       ret = crypto_ablkcipher_encrypt(req);
+       if (ret == -EINPROGRESS || ret == -EBUSY) {
+               ret = wait_for_completion_interruptible
+                       (&req_data->result.completion);
+               if (!ret)
+                       ret = req_data->result.err;
+       }
+       ablkcipher_request_free(req);
+       kfree(req_data);
+       crypto_free_ablkcipher(ctr_tfm);
+       return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+                                                  unsigned int key_len)
+{
+       int ret = 0;
+       struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+       u8 *new_key_mem = NULL;
+
+       if (key_len < 4) {
+               crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       /*Account for 4 byte nonce at the end.*/
+       key_len -= 4;
+       if (key_len != AES_KEYSIZE_128) {
+               crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+
+       memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+       /*This must be on a 16 byte boundary!*/
+       if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+               return -EINVAL;
+
+       if ((unsigned long)key % AESNI_ALIGN) {
+               /*key is not aligned: use an auxuliar aligned pointer*/
+               new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+               if (!new_key_mem)
+                       return -ENOMEM;
+
+               new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+               memcpy(new_key_mem, key, key_len);
+               key = new_key_mem;
+       }
+
+       if (!irq_fpu_usable())
+               ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+               key, key_len);
+       else {
+               kernel_fpu_begin();
+               ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+               kernel_fpu_end();
+       }
+       /*This must be on a 16 byte boundary!*/
+       if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+               ret = -EINVAL;
+               goto exit;
+       }
+       ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+       kfree(new_key_mem);
+       return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+                               unsigned int authsize)
+{
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       switch (authsize) {
+       case 8:
+       case 12:
+       case 16:
+               break;
+       default:
+               return -EINVAL;
+       }
+       crypto_aead_crt(parent)->authsize = authsize;
+       crypto_aead_crt(cryptd_child)->authsize = authsize;
+       return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+       int ret;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       if (!irq_fpu_usable()) {
+               struct aead_request *cryptd_req =
+                       (struct aead_request *) aead_request_ctx(req);
+               memcpy(cryptd_req, req, sizeof(*req));
+               aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+               return crypto_aead_encrypt(cryptd_req);
+       } else {
+               kernel_fpu_begin();
+               ret = cryptd_child->base.crt_aead.encrypt(req);
+               kernel_fpu_end();
+               return ret;
+       }
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+       int ret;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       if (!irq_fpu_usable()) {
+               struct aead_request *cryptd_req =
+                       (struct aead_request *) aead_request_ctx(req);
+               memcpy(cryptd_req, req, sizeof(*req));
+               aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+               return crypto_aead_decrypt(cryptd_req);
+       } else {
+               kernel_fpu_begin();
+               ret = cryptd_child->base.crt_aead.decrypt(req);
+               kernel_fpu_end();
+               return ret;
+       }
+}
+
+static struct crypto_alg rfc4106_alg = {
+       .cra_name = "rfc4106(gcm(aes))",
+       .cra_driver_name = "rfc4106-gcm-aesni",
+       .cra_priority = 400,
+       .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+       .cra_blocksize = 1,
+       .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+       .cra_alignmask = 0,
+       .cra_type = &crypto_nivaead_type,
+       .cra_module = THIS_MODULE,
+       .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+       .cra_init = rfc4106_init,
+       .cra_exit = rfc4106_exit,
+       .cra_u = {
+               .aead = {
+                       .setkey = rfc4106_set_key,
+                       .setauthsize = rfc4106_set_authsize,
+                       .encrypt = rfc4106_encrypt,
+                       .decrypt = rfc4106_decrypt,
+                       .geniv = "seqiv",
+                       .ivsize = 8,
+                       .maxauthsize = 16,
+               },
+       },
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+       u8 one_entry_in_sg = 0;
+       u8 *src, *dst, *assoc;
+       __be32 counter = cpu_to_be32(1);
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       void *aes_ctx = &(ctx->aes_key_expanded);
+       unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+       u8 iv_tab[16+AESNI_ALIGN];
+       u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+       struct scatter_walk src_sg_walk;
+       struct scatter_walk assoc_sg_walk;
+       struct scatter_walk dst_sg_walk;
+       unsigned int i;
+
+       /* Assuming we are supporting rfc4106 64-bit extended */
+       /* sequence numbers We need to have the AAD length equal */
+       /* to 8 or 12 bytes */
+       if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+               return -EINVAL;
+       /* IV below built */
+       for (i = 0; i < 4; i++)
+               *(iv+i) = ctx->nonce[i];
+       for (i = 0; i < 8; i++)
+               *(iv+4+i) = req->iv[i];
+       *((__be32 *)(iv+12)) = counter;
+
+       if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+               one_entry_in_sg = 1;
+               scatterwalk_start(&src_sg_walk, req->src);
+               scatterwalk_start(&assoc_sg_walk, req->assoc);
+               src = scatterwalk_map(&src_sg_walk, 0);
+               assoc = scatterwalk_map(&assoc_sg_walk, 0);
+               dst = src;
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_start(&dst_sg_walk, req->dst);
+                       dst = scatterwalk_map(&dst_sg_walk, 0);
+               }
+
+       } else {
+               /* Allocate memory for src, dst, assoc */
+               src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+                       GFP_ATOMIC);
+               if (unlikely(!src))
+                       return -ENOMEM;
+               assoc = (src + req->cryptlen + auth_tag_len);
+               scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+               scatterwalk_map_and_copy(assoc, req->assoc, 0,
+                                       req->assoclen, 0);
+               dst = src;
+       }
+
+       aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+               ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+               + ((unsigned long)req->cryptlen), auth_tag_len);
+
+       /* The authTag (aka the Integrity Check Value) needs to be written
+        * back to the packet. */
+       if (one_entry_in_sg) {
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_unmap(dst, 0);
+                       scatterwalk_done(&dst_sg_walk, 0, 0);
+               }
+               scatterwalk_unmap(src, 0);
+               scatterwalk_unmap(assoc, 0);
+               scatterwalk_done(&src_sg_walk, 0, 0);
+               scatterwalk_done(&assoc_sg_walk, 0, 0);
+       } else {
+               scatterwalk_map_and_copy(dst, req->dst, 0,
+                       req->cryptlen + auth_tag_len, 1);
+               kfree(src);
+       }
+       return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+       u8 one_entry_in_sg = 0;
+       u8 *src, *dst, *assoc;
+       unsigned long tempCipherLen = 0;
+       __be32 counter = cpu_to_be32(1);
+       int retval = 0;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       void *aes_ctx = &(ctx->aes_key_expanded);
+       unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+       u8 iv_and_authTag[32+AESNI_ALIGN];
+       u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+       u8 *authTag = iv + 16;
+       struct scatter_walk src_sg_walk;
+       struct scatter_walk assoc_sg_walk;
+       struct scatter_walk dst_sg_walk;
+       unsigned int i;
+
+       if (unlikely((req->cryptlen < auth_tag_len) ||
+               (req->assoclen != 8 && req->assoclen != 12)))
+               return -EINVAL;
+       /* Assuming we are supporting rfc4106 64-bit extended */
+       /* sequence numbers We need to have the AAD length */
+       /* equal to 8 or 12 bytes */
+
+       tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+       /* IV below built */
+       for (i = 0; i < 4; i++)
+               *(iv+i) = ctx->nonce[i];
+       for (i = 0; i < 8; i++)
+               *(iv+4+i) = req->iv[i];
+       *((__be32 *)(iv+12)) = counter;
+
+       if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+               one_entry_in_sg = 1;
+               scatterwalk_start(&src_sg_walk, req->src);
+               scatterwalk_start(&assoc_sg_walk, req->assoc);
+               src = scatterwalk_map(&src_sg_walk, 0);
+               assoc = scatterwalk_map(&assoc_sg_walk, 0);
+               dst = src;
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_start(&dst_sg_walk, req->dst);
+                       dst = scatterwalk_map(&dst_sg_walk, 0);
+               }
+
+       } else {
+               /* Allocate memory for src, dst, assoc */
+               src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+               if (!src)
+                       return -ENOMEM;
+               assoc = (src + req->cryptlen + auth_tag_len);
+               scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+               scatterwalk_map_and_copy(assoc, req->assoc, 0,
+                       req->assoclen, 0);
+               dst = src;
+       }
+
+       aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+               ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+               authTag, auth_tag_len);
+
+       /* Compare generated tag with passed in tag. */
+       retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+               -EBADMSG : 0;
+
+       if (one_entry_in_sg) {
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_unmap(dst, 0);
+                       scatterwalk_done(&dst_sg_walk, 0, 0);
+               }
+               scatterwalk_unmap(src, 0);
+               scatterwalk_unmap(assoc, 0);
+               scatterwalk_done(&src_sg_walk, 0, 0);
+               scatterwalk_done(&assoc_sg_walk, 0, 0);
+       } else {
+               scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+               kfree(src);
+       }
+       return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+       .cra_name               = "__gcm-aes-aesni",
+       .cra_driver_name        = "__driver-gcm-aes-aesni",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_AEAD,
+       .cra_blocksize          = 1,
+       .cra_ctxsize            = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+       .cra_alignmask          = 0,
+       .cra_type               = &crypto_aead_type,
+       .cra_module             = THIS_MODULE,
+       .cra_list               = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+       .cra_u = {
+               .aead = {
+                       .encrypt        = __driver_rfc4106_encrypt,
+                       .decrypt        = __driver_rfc4106_decrypt,
+               },
+       },
+};
+
 static int __init aesni_init(void)
 {
        int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
                printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
                return -ENODEV;
        }
+
        if ((err = crypto_register_alg(&aesni_alg)))
                goto aes_err;
        if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
        if ((err = crypto_register_alg(&ablk_xts_alg)))
                goto ablk_xts_err;
 #endif
-
+       err = crypto_register_alg(&__rfc4106_alg);
+       if (err)
+               goto __aead_gcm_err;
+       err = crypto_register_alg(&rfc4106_alg);
+       if (err)
+               goto aead_gcm_err;
        return err;

+aead_gcm_err:
+       crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+       crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:

 static void __exit aesni_exit(void)
 {
+       crypto_unregister_alg(&__rfc4106_alg);
+       crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
        crypto_unregister_alg(&ablk_xts_alg);
 #endif
--
1.5.3.rc5



--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-09-10 17:36 [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions Hoban, Adrian
@ 2010-09-20  8:07 ` Herbert Xu
       [not found]   ` <AANLkTim_gVO2tHUw+vFC9vXRh8Dtv7N38HTCMUZ7LmNg@mail.gmail.com>
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2010-09-20  8:07 UTC (permalink / raw)
  To: Hoban, Adrian; +Cc: linux-crypto, linux-kernel

On Fri, Sep 10, 2010 at 06:36:24PM +0100, Hoban, Adrian wrote:
> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
> kernels. It supports 128-bit AES key size. This leverages the crypto
> AEAD interface type to facilitate a combined AES & GCM operation to
> be implemented in assembly code. The assembly code leverages Intel(R)
> AES New Instructions and the PCLMULQDQ instruction.
> 
> Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
> Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
> Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
> Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
> Signed-off-by: James Guilford <james.guilford@intel.com>
> Signed-off-by: Edward Clinton <edward.clinton@intel.com>
> Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>

I've applied patch 1/3 but 2/3 does not apply because you've
converted the tabs to spaces.

Please resend patches 2/3 and 3/3 with a cc to yourself and
double-check that you can still apply the received version.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
       [not found]   ` <AANLkTim_gVO2tHUw+vFC9vXRh8Dtv7N38HTCMUZ7LmNg@mail.gmail.com>
@ 2010-09-20 11:21     ` Struk, Tadeusz
  2010-09-23  7:58       ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Struk, Tadeusz @ 2010-09-20 11:21 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Hoban, Adrian, Paoloni, Gabriele, Struk, Tadeusz, O Mahony,
	Aidan, linux-crypto, linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 65296 bytes --]

Hello Herbert,
I've reworked the patch 2/3 and successfully applied all of them.
The problem was spaces at EOL.
Please find the reworked patch below.

>From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
From: Adrian Hoban <ahoban@allen.ir.intel.com>
Date: Fri, 10 Sep 2010 18:08:45 +0100
Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports 128-bit AES key size. This leverages the crypto
AEAD interface type to facilitate a combined AES & GCM operation to
be implemented in assembly code. The assembly code leverages Intel(R)
AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 1112
+++++++++++++++++++++++++++++++++++-
 arch/x86/crypto/aesni-intel_glue.c |  518 +++++++++++++++++-
 2 files changed, 1627 insertions(+), 3 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S
b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..3950769 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -8,6 +8,17 @@
  *    Author: Huang Ying <ying.huang@intel.com>
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,9 +27,50 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/inst.h>
+
+
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow
ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
 
 .text
+#define	STACK_OFFSET    8*3
+#define	HashKey		16*0	// store HashKey <<1 mod poly here
+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low
64 bits of  HashKey <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low
64 bits of  HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low
64 bits of  HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low
64 bits of  HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+#define	VARIABLE_OFFSET	16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+
 
 #define STATE1	%xmm0
 #define STATE2	%xmm4
@@ -47,6 +99,1064 @@
 #define T2	%r11
 #define TCTR_LOW T2
 
+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as
input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+	movdqa	  \GH, \TMP1
+	pshufd	  $78, \GH, \TMP2
+	pshufd	  $78, \HK, \TMP3
+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
+	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
+	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
+	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+	pxor	  \GH, \TMP2
+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
+	pxor	  \TMP3, \GH
+	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+	movdqa    \GH, \TMP2
+	movdqa    \GH, \TMP3
+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4 in
in order to perform independent shifts
+	pslld     $31, \TMP2            # packed right shift <<31
+	pslld     $30, \TMP3            # packed right shift <<30
+	pslld     $25, \TMP4            # packed right shift <<25
+	pxor      \TMP3, \TMP2          # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+	pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4 in
in order to perform independent shifts
+	movdqa    \GH,\TMP3
+	movdqa    \GH,\TMP4
+	psrld     $1,\TMP2              # packed left shift >>1
+	psrld     $2,\TMP3              # packed left shift >>2
+	psrld     $7,\TMP4              # packed left shift >>7
+	pxor      \TMP3,\TMP2		# xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \GH
+	pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on the
ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are
clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+	pshufb	   SHUF_MASK(%rip), %xmm\i          # byte-reflect the AAD
data
+	xor	   %r11, %r11                       # initialise the data
pointer offset as zero
+
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+	pshufb	   SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	aesenc     16*1(%rdi), %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	aesenc     16*2(%arg1), %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	aesenc     16*3(%arg1), %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+	aesenc     16*4(%arg1), %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+	aesenc     16*5(%arg1), %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+	aesenc     16*6(%arg1), %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+	aesenc     16*7(%arg1), %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+	aesenc     16*8(%arg1), %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+	aesenc     16*9(%arg1), %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+	aesenclast 16*10(%arg1), %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)    # write back
plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+.if \operation == dec
+	movdqa     \TMP1, %xmm\index
+.endif
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # prepare
plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation   # no
need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+	pshufb	   SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+	pshufb	   SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+	pshufb	   SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+	pshufb	   SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 =
HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)                           #
HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 =
HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 =
HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	aesenclast 160(%arg1), \XMM1
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+.if \operation == dec
+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM1
+.endif
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+.if \operation == dec
+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM2
+.endif
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+.if \operation == dec
+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM3
+.endif
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+.if \operation == dec
+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM4
+.else
+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+	add	   $64, %r11
+	pshufb	   SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1         # combine GHASHed value with the
corresponding ciphertext
+	pshufb	   SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 XMM0 XMM1
XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	pshufb	  SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
+	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	pshufb	  SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	aesenc	  16(%arg1), \XMM1              # Round 1
+	aesenc	  16(%arg1), \XMM2
+	aesenc	  16(%arg1), \XMM3
+	aesenc	  16(%arg1), \XMM4
+	aesenc	  32(%arg1), \XMM1              # Round 2
+	aesenc	  32(%arg1), \XMM2
+	aesenc	  32(%arg1), \XMM3
+	aesenc	  32(%arg1), \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	aesenc    48(%arg1), \XMM1              # Round 3
+	aesenc    48(%arg1), \XMM2
+	aesenc    48(%arg1), \XMM3
+	aesenc    48(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	aesenc	  64(%arg1), \XMM1              # Round 4
+	aesenc	  64(%arg1), \XMM2
+	aesenc	  64(%arg1), \XMM3
+	aesenc	  64(%arg1), \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  80(%arg1), \XMM1              # Round 5
+	aesenc	  80(%arg1), \XMM2
+	aesenc	  80(%arg1), \XMM3
+	aesenc	  80(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in
TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	aesenc	  96(%arg1), \XMM1              # Round 6
+	aesenc	  96(%arg1), \XMM2
+	aesenc	  96(%arg1), \XMM3
+	aesenc	  96(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	aesenc	  112(%arg1), \XMM1             # Round 7
+	aesenc	  112(%arg1), \XMM2
+	aesenc	  112(%arg1), \XMM3
+	aesenc	  112(%arg1), \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  128(%arg1), \XMM1             # Round 8
+	aesenc	  128(%arg1), \XMM2
+	aesenc	  128(%arg1), \XMM3
+	aesenc	  128(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in
TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	aesenc	  144(%arg1), \XMM1            # Round 9
+	aesenc	  144(%arg1), \XMM2
+	aesenc	  144(%arg1), \XMM3
+	aesenc	  144(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	aesenclast 160(%arg1), \XMM1           # Round 10
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+	movdqa    \TMP3, \XMM1
+.endif
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM2
+.endif
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM3
+.endif
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM4
+.else
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext
buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext
buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext
buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext
buffer
+.endif
+	pshufb	  SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1		       # accumulate the results in
TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4		       # move XMM5 into TMP2, TMP3,
TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2		       # make 3 copies of XMM5 into
TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 XMM1 XMM2 XMM3 XMM4
XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM1, \TMP6
+	pshufd	  $78, \XMM1, \TMP2
+	pxor	  \XMM1, \TMP2
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+	movdqa	  HashKey_4_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	movdqa	  \XMM1, \XMMDst
+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM2, \TMP1
+	pshufd	  $78, \XMM2, \TMP2
+	pxor	  \XMM2, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+	movdqa	  HashKey_3_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM2, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6,
XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM3, \TMP1
+	pshufd	  $78, \XMM3, \TMP2
+	pxor	  \XMM3, \TMP2
+	movdqa	  HashKey_2(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+	movdqa	  HashKey_2_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM3, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6,
XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM4, \TMP1
+	pshufd	  $78, \XMM4, \TMP2
+	pxor	  \XMM4, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+	movdqa	  HashKey_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM4, \XMMDst
+	pxor	  \XMM1, \TMP2
+	pxor	  \TMP6, \TMP2
+	pxor	  \XMMDst, \TMP2            # middle section of the temp
results combined as in karatsuba algorithm
+	movdqa	  \TMP2, \TMP4
+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
+	pxor	  \TMP4, \XMMDst
+	pxor	  \TMP2, \TMP6              # TMP6:XMMDst holds the result
of the accumulated carry-less multiplications
+
+        # first phase of the reduction
+
+	movdqa    \XMMDst, \TMP2
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4            # move XMMDst into TMP2, TMP3,
TMP4 in order to perform 3 shifts independently
+	pslld     $31, \TMP2                # packed right shifting << 31
+	pslld     $30, \TMP3                # packed right shifting << 30
+	pslld     $25, \TMP4                # packed right shifting << 25
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP7
+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+	pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+
+	movdqa    \XMMDst, \TMP2            # make 3 copies of XMMDst for
doing 3 shift operations
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+	psrld     $1, \TMP2                 # packed left shift >> 1
+	psrld     $2, \TMP3                 # packed left shift >> 2
+	psrld     $7, \TMP4                 # packed left shift >> 7
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	pxor      \TMP7, \TMP2
+	pxor      \TMP2, \XMMDst
+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+
+	pxor	(%arg1), \XMM0
+	aesenc	16(%arg1), \XMM0
+	aesenc	32(%arg1), \XMM0
+	aesenc	48(%arg1), \XMM0
+	aesenc	64(%arg1), \XMM0
+	aesenc	80(%arg1), \XMM0
+	aesenc	96(%arg1), \XMM0
+	aesenc	112(%arg1), \XMM0
+	aesenc	128(%arg1), \XMM0
+	aesenc	144(%arg1), \XMM0
+	aesenclast 160(%arg1), \XMM0
+.endm
+
+
+/**************************************************************************
********************
+*void aesni_gcm_dec(void *aes_ctx,               // AES Key schedule.
Starts on a 16 byte boundary.
+*                   u8 *out,                     // Plaintext output.
Encrypt in-place is allowed.
+*                   const u8 *in,                // Ciphertext input
+*                   u64 plaintext_len,           // Length of data in bytes
for decryption.
+*                   u8 *iv,                      // Pre-counter block j0: 4
byte salt (from Security Association)
+*                                                // concatenated with 8
byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with
0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,             // H, the Hash sub key
input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,               // Additional
Authentication Data (AAD)
+*                   u64 aad_len,                 // Length of AAD in bytes.
With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,               // Authenticated Tag
output. The driver will compare this to the
+*                                                // given authentication
tag and only return the plaintext if they match.
+*                   u64 auth_tag_len);           // Authenticated Tag
Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence
Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended
Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16
bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+***************************************************************************
****************/
+
+ENTRY(aesni_gcm_dec)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp                        # align rsp to 64 bytes
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # Precompute HashKey<<1 (mod poly) from the hash key (this is
required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # Reduction
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13			# %xmm13 holds the
HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+	movdqa	%xmm13, HashKey(%rsp)           # store HashKey<<1 (mod
poly)
+	mov	%arg4, %r13                     # save the number of bytes
of plaintext/ciphertext
+	and	$-16, %r13                      # %r13 = %r13 - (%r13 mod
16)
+	mov	%r13, %r12
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_decrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_decrypt
+	je	_initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+	sub	$48, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+	sub	$32, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+	sub	$16, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+	cmp	$0, %r13
+	je	_zero_cipher_left_decrypt
+	sub	$64, %r13
+	je	_four_cipher_left_decrypt
+_decrypt_by_4:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12,
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, dec
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_decrypt_by_4
+_four_cipher_left_decrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+	mov	%arg4, %r13
+	and	$15, %r13				# %r13 = arg4 (mod
16)
+	je	_multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+	paddd	ONE(%rip), %xmm0                        # increment CNT to
get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK 	%xmm0			# E(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1			# recieve the last
<16 byte block
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12				# adjust the shuffle
mask pointer to be able to shift 16-%r13 bytes
+							# (%r13 is the
number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2				# get the
appropriate shuffle mask
+	pshufb	%xmm2, %xmm1				# right shift
16-%r13 butes
+	movdqa  %xmm1, %xmm2
+	pxor	%xmm1, %xmm0				# Ciphertext XOR
E(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1           # get the
appropriate mask to mask out top 16-%r13 bytes of %xmm0
+	pand	%xmm1, %xmm0				# mask out top
16-%r13 bytes of %xmm0
+	pand    %xmm1, %xmm2
+	pshufb	SHUF_MASK(%rip),%xmm2
+	pxor	%xmm2, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+
+        # output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_decrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_decrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
+	shl	$3, %r12		  # convert into number of bits
+	movd	%r12d, %xmm15		  # len(A) in %xmm15
+	shl	$3, %arg4		  # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15		  # %xmm15 =
len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
+	pshufb	SHUF_MASK(%rip), %xmm8
+	mov	%arg5, %rax		  # %rax = *Y0
+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 	  # E(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_decrypt:
+	mov	arg9, %r10                # %r10 = authTag
+	mov	arg10, %r11               # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_decrypt
+	cmp	$12, %r11
+	je	_T_12_decrypt
+_T_8_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_decrypt
+_T_12_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_decrypt
+_T_16_decrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_decrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+/**************************************************************************
*************************
+* void aesni_gcm_enc(void *aes_ctx,               // AES Key schedule.
Starts on a 16 byte boundary.
+*                    u8 *out,                    // Ciphertext output.
Encrypt in-place is allowed.
+*                    const u8 *in,               // Plaintext input
+*                    u64 plaintext_len,          // Length of data in bytes
for encryption.
+*                    u8 *iv,                     // Pre-counter block j0: 4
byte salt (from Security Association)
+*                                                // concatenated with 8
byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with
0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,            // H, the Hash sub key
input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,              // Additional
Authentication Data (AAD)
+*                    u64 aad_len,                // Length of AAD in bytes.
With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,               // Authenticated Tag
output.
+*                    u64 auth_tag_len);          // Authenticated Tag
Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence
Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended
Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16
bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************
***********************************************/
+ENTRY(aesni_gcm_enc)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # precompute HashKey<<1 mod poly from the HashKey (required for
GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13
+	movdqa	%xmm13, HashKey(%rsp)
+	mov	%arg4, %r13               # %xmm13 holds HashKey<<1 (mod
poly)
+	and	$-16, %r13
+	mov	%r13, %r12
+
+        # Encrypt first few blocks
+
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_encrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_encrypt
+	je	_initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+	sub	$48, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+	sub	$32, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+	sub	$16, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0,
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+	cmp	$0, %r13
+	je	_zero_cipher_left_encrypt
+	sub	$64, %r13
+	je	_four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12,
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, enc
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+	mov	%arg4, %r13
+	and	$15, %r13			# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+
+	paddd	ONE(%rip), %xmm0                # INCR CNT to get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 		# Encrypt(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1		# recieve the last <16 byte
blocks
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12			# adjust the shuffle mask
pointer to be able to shift 16-r13 bytes
+                                                # (%r13 is the number of
bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2			# get the appropriate
shuffle mask
+	pshufb	%xmm2, %xmm1			# shift right 16-r13 byte
+	pxor	%xmm1, %xmm0			# Plaintext XOR Encrypt(K,
Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask
to mask out top 16-r13 bytes of xmm0
+	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes
of xmm0
+
+	pshufb	SHUF_MASK(%rip),%xmm0
+	pxor	%xmm0, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	#
GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+	pshufb	SHUF_MASK(%rip), %xmm0          # shuffle xmm0 back to
output as ciphertext
+
+        # Output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_encrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_encrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+	mov	arg8, %r12	               # %r12 = addLen (number of
bytes)
+	shl	$3, %r12
+	movd	%r12d, %xmm15	               # len(A) in %xmm15
+	shl	$3, %arg4	               # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15	               # %xmm15 =
len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15	               # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# final GHASH computation
+
+	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+	mov	%arg5, %rax		       # %rax  = *Y0
+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0          # Encrypt(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_encrypt:
+	mov	arg9, %r10                     # %r10 = authTag
+	mov	arg10, %r11                    # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_encrypt
+	cmp	$12, %r11
+	je	_T_12_encrypt
+_T_8_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_encrypt
+_T_12_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_encrypt
+_T_16_encrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_encrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+#include <asm/inst.h>
+
 _key_expansion_128:
 _key_expansion_256a:
 	pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c
b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..f0b0a6f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
 
 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
 
-#define AESNI_ALIGN	16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+	u8 hash_subkey[16];
+	struct crypto_aes_ctx aes_key_expanded;
+	u8 nonce[4];
+	struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+	u8 iv[16];
+	struct aesni_gcm_set_hash_subkey_result result;
+	struct scatterlist sg;
+};
+
+#define AESNI_ALIGN	(16)
 #define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx,
u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte
boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long
aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte
boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is
going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long
aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+	return
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)
+		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
 	unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
 };
 #endif
 
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_aead.reqsize = sizeof(struct aead_request)
+		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	if (!IS_ERR(ctx->cryptd_tfm))
+		cryptd_free_aead(ctx->cryptd_tfm);
+	return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+	struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	result->err = err;
+	complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int
key_len)
+{
+	struct crypto_ablkcipher *ctr_tfm;
+	struct ablkcipher_request *req;
+	int ret = -EINVAL;
+	struct aesni_hash_subkey_req_data *req_data;
+
+	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+	if (IS_ERR(ctr_tfm))
+		return PTR_ERR(ctr_tfm);
+
+	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	if (ret) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return ret;
+	}
+
+	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -EINVAL;
+	}
+
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -ENOMEM;
+	}
+	memset(req_data->iv, 0, sizeof(req_data->iv));
+
+	/* Clear the data in the hash sub key container to zero.*/
+	/* We want to cipher all zeros to create the hash sub key. */
+	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+	init_completion(&req_data->result.completion);
+	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+	ablkcipher_request_set_tfm(req, ctr_tfm);
+	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					rfc4106_set_hash_subkey_done,
+					&req_data->result);
+
+	ablkcipher_request_set_crypt(req, &req_data->sg,
+		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+	ret = crypto_ablkcipher_encrypt(req);
+	if (ret == -EINPROGRESS || ret == -EBUSY) {
+		ret = wait_for_completion_interruptible
+			(&req_data->result.completion);
+		if (!ret)
+			ret = req_data->result.err;
+	}
+	ablkcipher_request_free(req);
+	kfree(req_data);
+	crypto_free_ablkcipher(ctr_tfm);
+	return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+						   unsigned int key_len)
+{
+	int ret = 0;
+	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+	struct aesni_rfc4106_gcm_ctx *ctx =
aesni_rfc4106_gcm_ctx_get(parent);
+	u8 *new_key_mem = NULL;
+
+	if (key_len < 4) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	/*Account for 4 byte nonce at the end.*/
+	key_len -= 4;
+	if (key_len != AES_KEYSIZE_128) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) %
AESNI_ALIGN)
+		return -EINVAL;
+
+	if ((unsigned long)key % AESNI_ALIGN) {
+		/*key is not aligned: use an auxuliar aligned pointer*/
+		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+		if (!new_key_mem)
+			return -ENOMEM;
+
+		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_mem, key, key_len);
+		key = new_key_mem;
+	}
+
+	if (!irq_fpu_usable())
+		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+		key, key_len);
+	else {
+		kernel_fpu_begin();
+		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+		kernel_fpu_end();
+	}
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+	kfree(new_key_mem);
+	return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and
can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child =
cryptd_aead_child(ctx->cryptd_tfm);
+
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	crypto_aead_crt(parent)->authsize = authsize;
+	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child =
cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.encrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child =
cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.decrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static struct crypto_alg rfc4106_alg = {
+	.cra_name = "rfc4106(gcm(aes))",
+	.cra_driver_name = "rfc4106-gcm-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask = 0,
+	.cra_type = &crypto_nivaead_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+	.cra_init = rfc4106_init,
+	.cra_exit = rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey = rfc4106_set_key,
+			.setauthsize = rfc4106_set_authsize,
+			.encrypt = rfc4106_encrypt,
+			.decrypt = rfc4106_decrypt,
+			.geniv = "seqiv",
+			.ivsize = 8,
+			.maxauthsize = 16,
+		},
+	},
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	__be32 counter = cpu_to_be32(1);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_tab[16+AESNI_ALIGN];
+	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length equal */
+	/* to 8 or 12 bytes */
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+		return -EINVAL;
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+			GFP_ATOMIC);
+		if (unlikely(!src))
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen,
0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+					req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+		+ ((unsigned long)req->cryptlen), auth_tag_len);
+
+	/* The authTag (aka the Integrity Check Value) needs to be written
+	 * back to the packet. */
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0,
+			req->cryptlen + auth_tag_len, 1);
+		kfree(src);
+	}
+	return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	unsigned long tempCipherLen = 0;
+	__be32 counter = cpu_to_be32(1);
+	int retval = 0;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_and_authTag[32+AESNI_ALIGN];
+	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+	u8 *authTag = iv + 16;
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	if (unlikely((req->cryptlen < auth_tag_len) ||
+		(req->assoclen != 8 && req->assoclen != 12)))
+		return -EINVAL;
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length */
+	/* equal to 8 or 12 bytes */
+
+	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!src)
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen,
0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+			req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+		authTag, auth_tag_len);
+
+	/* Compare generated tag with passed in tag. */
+	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+		-EBADMSG : 0;
+
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen,
1);
+		kfree(src);
+	}
+	return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+	.cra_u = {
+		.aead = {
+			.encrypt	= __driver_rfc4106_encrypt,
+			.decrypt	= __driver_rfc4106_decrypt,
+		},
+	},
+};
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
 		printk(KERN_INFO "Intel AES-NI instructions are not
detected.\n");
 		return -ENODEV;
 	}
+
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
 	if ((err = crypto_register_alg(&ablk_xts_alg)))
 		goto ablk_xts_err;
 #endif
-
+	err = crypto_register_alg(&__rfc4106_alg);
+	if (err)
+		goto __aead_gcm_err;
+	err = crypto_register_alg(&rfc4106_alg);
+	if (err)
+		goto aead_gcm_err;
 	return err;
 
+aead_gcm_err:
+	crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_alg(&__rfc4106_alg);
+	crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 #endif
-- 
1.5.3.rc5

On Mon, Sep 20, 2010 at 9:07 AM, Herbert Xu <herbert@gondor.apana.org.au>
wrote:
> On Fri, Sep 10, 2010 at 06:36:24PM +0100, Hoban, Adrian wrote:
>> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
>> kernels. It supports 128-bit AES key size. This leverages the crypto
>> AEAD interface type to facilitate a combined AES & GCM operation to
>> be implemented in assembly code. The assembly code leverages Intel(R)
>> AES New Instructions and the PCLMULQDQ instruction.
>>
>> Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
>> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
>> Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
>> Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
>> Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
>> Signed-off-by: James Guilford <james.guilford@intel.com>
>> Signed-off-by: Edward Clinton <edward.clinton@intel.com>
>> Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
>
> I've applied patch 1/3 but 2/3 does not apply because you've
> converted the tabs to spaces.
>
> Please resend patches 2/3 and 3/3 with a cc to yourself and
> double-check that you can still apply the received version.
>
> Thanks,
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> --
> To unsubscribe from this list: send the line "unsubscribe linux-crypto" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

[-- Attachment #1.2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 7072 bytes --]

[-- Attachment #2: Type: text/plain, Size: 541 bytes --]

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-09-20 11:21     ` Struk, Tadeusz
@ 2010-09-23  7:58       ` Herbert Xu
  2010-09-24 10:07         ` Paoloni, Gabriele
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2010-09-23  7:58 UTC (permalink / raw)
  To: Struk, Tadeusz
  Cc: Hoban, Adrian, Paoloni, Gabriele, O Mahony, Aidan, linux-crypto,
	linux-kernel

On Mon, Sep 20, 2010 at 12:21:54PM +0100, Struk, Tadeusz wrote:
> Hello Herbert,
> I've reworked the patch 2/3 and successfully applied all of them.
> The problem was spaces at EOL.
> Please find the reworked patch below.
> 
> >From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
> From: Adrian Hoban <ahoban@allen.ir.intel.com>
> Date: Fri, 10 Sep 2010 18:08:45 +0100
> Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
> 
> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
> kernels. It supports 128-bit AES key size. This leverages the crypto
> AEAD interface type to facilitate a combined AES & GCM operation to
> be implemented in assembly code. The assembly code leverages Intel(R)
> AES New Instructions and the PCLMULQDQ instruction.
> 
> Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
> Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
> Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
> Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
> Signed-off-by: James Guilford <james.guilford@intel.com>
> Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>

Sorry, still doesn't work:

$ git apply ~/p
fatal: corrupt patch at line 67
$ patch -s -p1 < ~/p
patch: **** malformed patch at line 66: ALL_F
$

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-09-23  7:58       ` Herbert Xu
@ 2010-09-24 10:07         ` Paoloni, Gabriele
  0 siblings, 0 replies; 19+ messages in thread
From: Paoloni, Gabriele @ 2010-09-24 10:07 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Hoban, Adrian, O Mahony, Aidan, linux-crypto, linux-kernel,
	Struk, Tadeusz

Hi Herbert I think I've fixed the issue, it should work now.

Find below the new patch (between the star lines)

Best Regards

Gabriele Paoloni

********************************************************************
This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports 128-bit AES key size. This leverages the crypto
AEAD interface type to facilitate a combined AES & GCM operation to
be implemented in assembly code. The assembly code leverages Intel(R)
AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>

diff -uNr a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
--- a/arch/x86/crypto/aesni-intel_asm.S 2010-08-20 19:55:55.000000000 +0100
+++ b/arch/x86/crypto/aesni-intel_asm.S 2010-09-24 09:55:40.000000000 +0100
@@ -9,6 +9,17 @@
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -16,9 +27,59 @@
  */

 #include <linux/linkage.h>
-#include <asm/inst.h>
+
+
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+

 .text
+#define        STACK_OFFSET    8*3
+#define        HashKey         16*0    // store HashKey <<1 mod poly here
+#define        HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
+#define        HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
+#define        HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
+#define        HashKey_k       16*4    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey <<1 mod poly here
+                               //(for Karatsuba purposes)
+#define        HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^2 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^3 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
+                               // bits of  HashKey^4 <<1 mod poly here
+                               // (for Karatsuba purposes)
+#define        VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+

 #define STATE1 %xmm0
 #define STATE2 %xmm4
@@ -47,6 +108,1118 @@
 #define T2     %r11
 #define TCTR_LOW T2

+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+       movdqa    \GH, \TMP1
+       pshufd    $78, \GH, \TMP2
+       pshufd    $78, \HK, \TMP3
+       pxor      \GH, \TMP2            # TMP2 = a1+a0
+       pxor      \HK, \TMP3            # TMP3 = b1+b0
+       pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
+       pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
+       pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+       pxor      \GH, \TMP2
+       pxor      \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3             # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2             # right shift TMP2 2 DWs
+       pxor      \TMP3, \GH
+       pxor      \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+       movdqa    \GH, \TMP2
+       movdqa    \GH, \TMP3
+       movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
+                                       # in in order to perform
+                                       # independent shifts
+       pslld     $31, \TMP2            # packed right shift <<31
+       pslld     $30, \TMP3            # packed right shift <<30
+       pslld     $25, \TMP4            # packed right shift <<25
+       pxor      \TMP3, \TMP2          # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5             # right shift TMP5 1 DW
+       pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+       pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+       movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
+                                       # in in order to perform
+                                       # independent shifts
+       movdqa    \GH,\TMP3
+       movdqa    \GH,\TMP4
+       psrld     $1,\TMP2              # packed left shift >>1
+       psrld     $2,\TMP3              # packed left shift >>2
+       psrld     $7,\TMP4              # packed left shift >>7
+       pxor      \TMP3,\TMP2           # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \GH
+       pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+       mov        arg7, %r10           # %r10 = AAD
+       mov        arg8, %r12           # %r12 = aadLen
+       mov        %r12, %r11
+       pxor       %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+       movd       (%r10), \TMP1
+       pslldq     $12, \TMP1
+       psrldq     $4, %xmm\i
+       pxor       \TMP1, %xmm\i
+       add        $4, %r10
+       sub        $4, %r12
+       jne        _get_AAD_loop\num_initial_blocks\operation
+       cmp        $16, %r11
+       je         _get_AAD_loop2_done\num_initial_blocks\operation
+       mov        $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+       psrldq     $4, %xmm\i
+       sub        $4, %r12
+       cmp        %r11, %r12
+       jne        _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+       pshufb     SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
+       xor        %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+       mov        %arg5, %rax                      # %rax = *Y0
+       movdqu     (%rax), \XMM0                    # XMM0 = Y0
+       pshufb     SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+       paddd      ONE(%rip), \XMM0                 # INCR Y0
+       movdqa     \XMM0, %xmm\index
+       pshufb     SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+       pxor       16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+       aesenc     16*1(%rdi), %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+       aesenc     16*2(%arg1), %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+       aesenc     16*3(%arg1), %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+       aesenc     16*4(%arg1), %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+       aesenc     16*5(%arg1), %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+       aesenc     16*6(%arg1), %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+       aesenc     16*7(%arg1), %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+       aesenc     16*8(%arg1), %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+       aesenc     16*9(%arg1), %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+       aesenclast 16*10(%arg1), %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+       movdqu     (%arg3 , %r11, 1), \TMP1
+       pxor       \TMP1, %xmm\index
+       movdqu     %xmm\index, (%arg2 , %r11, 1)
+       # write back plaintext/ciphertext for num_initial_blocks
+       add        $16, %r11
+.if \operation == dec
+       movdqa     \TMP1, %xmm\index
+.endif
+       pshufb     SHUF_MASK(%rip), %xmm\index
+               # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+       cmp        $64, %r13
+       jl      _initial_blocks_done\num_initial_blocks\operation
+       # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM1
+       pshufb     SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM2
+       pshufb     SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM3
+       pshufb     SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+       paddd      ONE(%rip), \XMM0              # INCR Y0
+       movdqa     \XMM0, \XMM4
+       pshufb     SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+       pxor       16*0(%arg1), \XMM1
+       pxor       16*0(%arg1), \XMM2
+       pxor       16*0(%arg1), \XMM3
+       pxor       16*0(%arg1), \XMM4
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%rsp)
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+       aesenc     16*\index(%arg1), \XMM1
+       aesenc     16*\index(%arg1), \XMM2
+       aesenc     16*\index(%arg1), \XMM3
+       aesenc     16*\index(%arg1), \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+       aesenc     16*\index(%arg1), \XMM1
+       aesenc     16*\index(%arg1), \XMM2
+       aesenc     16*\index(%arg1), \XMM3
+       aesenc     16*\index(%arg1), \XMM4
+.endr
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%rsp)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%rsp)
+       aesenclast 160(%arg1), \XMM1
+       aesenclast 160(%arg1), \XMM2
+       aesenclast 160(%arg1), \XMM3
+       aesenclast 160(%arg1), \XMM4
+       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM1
+.if \operation == dec
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM1
+.endif
+       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM2
+.if \operation == dec
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM2
+.endif
+       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM3
+.if \operation == dec
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM3
+.endif
+       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+       pxor       \TMP1, \XMM4
+.if \operation == dec
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+       movdqa     \TMP1, \XMM4
+.else
+       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+       add        $64, %r11
+       pshufb     SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+       pxor       \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+       pshufb     SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+       pshufb     SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+       pshufb     SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+       movdqa    \XMM1, \XMM5
+       movdqa    \XMM2, \XMM6
+       movdqa    \XMM3, \XMM7
+       movdqa    \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+       movdqa    \XMM5, \TMP4
+       pshufd    $78, \XMM5, \TMP6
+       pxor      \XMM5, \TMP6
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    HashKey_4(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+       movdqa    \XMM0, \XMM1
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM2
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM3
+       paddd     ONE(%rip), \XMM0              # INCR CNT
+       movdqa    \XMM0, \XMM4
+       pshufb    SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+       pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+       pshufb    SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+       pxor      (%arg1), \XMM1
+       pxor      (%arg1), \XMM2
+       pxor      (%arg1), \XMM3
+       pxor      (%arg1), \XMM4
+       movdqa    HashKey_4_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+       aesenc    16(%arg1), \XMM1              # Round 1
+       aesenc    16(%arg1), \XMM2
+       aesenc    16(%arg1), \XMM3
+       aesenc    16(%arg1), \XMM4
+       aesenc    32(%arg1), \XMM1              # Round 2
+       aesenc    32(%arg1), \XMM2
+       aesenc    32(%arg1), \XMM3
+       aesenc    32(%arg1), \XMM4
+       movdqa    \XMM6, \TMP1
+       pshufd    $78, \XMM6, \TMP2
+       pxor      \XMM6, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+       aesenc    48(%arg1), \XMM1              # Round 3
+       aesenc    48(%arg1), \XMM2
+       aesenc    48(%arg1), \XMM3
+       aesenc    48(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+       aesenc    64(%arg1), \XMM1              # Round 4
+       aesenc    64(%arg1), \XMM2
+       aesenc    64(%arg1), \XMM3
+       aesenc    64(%arg1), \XMM4
+       movdqa    HashKey_3_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       aesenc    80(%arg1), \XMM1              # Round 5
+       aesenc    80(%arg1), \XMM2
+       aesenc    80(%arg1), \XMM3
+       aesenc    80(%arg1), \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM6, \XMM5
+       pxor      \TMP2, \TMP6
+       movdqa    \XMM7, \TMP1
+       pshufd    $78, \XMM7, \TMP2
+       pxor      \XMM7, \TMP2
+       movdqa    HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+       pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+       aesenc    96(%arg1), \XMM1              # Round 6
+       aesenc    96(%arg1), \XMM2
+       aesenc    96(%arg1), \XMM3
+       aesenc    96(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+       aesenc    112(%arg1), \XMM1             # Round 7
+       aesenc    112(%arg1), \XMM2
+       aesenc    112(%arg1), \XMM3
+       aesenc    112(%arg1), \XMM4
+       movdqa    HashKey_2_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+       aesenc    128(%arg1), \XMM1             # Round 8
+       aesenc    128(%arg1), \XMM2
+       aesenc    128(%arg1), \XMM3
+       aesenc    128(%arg1), \XMM4
+       pxor      \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+       pxor      \XMM7, \XMM5
+       pxor      \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+       movdqa    \XMM8, \TMP1
+       pshufd    $78, \XMM8, \TMP2
+       pxor      \XMM8, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+       aesenc    144(%arg1), \XMM1            # Round 9
+       aesenc    144(%arg1), \XMM2
+       aesenc    144(%arg1), \XMM3
+       aesenc    144(%arg1), \XMM4
+       pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+       aesenclast 160(%arg1), \XMM1           # Round 10
+       aesenclast 160(%arg1), \XMM2
+       aesenclast 160(%arg1), \XMM3
+       aesenclast 160(%arg1), \XMM4
+       movdqa    HashKey_k(%rsp), \TMP5
+       pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+       movdqu    (%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+       movdqa    \TMP3, \XMM1
+.endif
+       movdqu    16(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM2
+.endif
+       movdqu    32(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM3
+.endif
+       movdqu    48(%arg3,%r11,1), \TMP3
+       pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+       movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqa    \TMP3, \XMM4
+.else
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+.endif
+       pshufb    SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+       pshufb    SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+       pxor      \TMP4, \TMP1
+       pxor      \XMM8, \XMM5
+       pxor      \TMP6, \TMP2
+       pxor      \TMP1, \TMP2
+       pxor      \XMM5, \TMP2
+       movdqa    \TMP2, \TMP3
+       pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
+       psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
+       pxor      \TMP3, \XMM5
+       pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+       movdqa    \XMM5, \TMP2
+       movdqa    \XMM5, \TMP3
+       movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+       pslld     $31, \TMP2                   # packed right shift << 31
+       pslld     $30, \TMP3                   # packed right shift << 30
+       pslld     $25, \TMP4                   # packed right shift << 25
+       pxor      \TMP3, \TMP2                 # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP5
+       psrldq    $4, \TMP5                    # right shift T5 1 DW
+       pslldq    $12, \TMP2                   # left shift T2 3 DWs
+       pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+       movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+       movdqa    \XMM5,\TMP3
+       movdqa    \XMM5,\TMP4
+       psrld     $1, \TMP2                    # packed left shift >>1
+       psrld     $2, \TMP3                    # packed left shift >>2
+       psrld     $7, \TMP4                    # packed left shift >>7
+       pxor      \TMP3,\TMP2                  # xor the shifted versions
+       pxor      \TMP4,\TMP2
+       pxor      \TMP5, \TMP2
+       pxor      \TMP2, \XMM5
+       pxor      \TMP1, \XMM5                 # result is in TMP1
+
+       pxor      \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+       movdqa    \XMM1, \TMP6
+       pshufd    $78, \XMM1, \TMP2
+       pxor      \XMM1, \TMP2
+       movdqa    HashKey_4(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+       movdqa    HashKey_4_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       movdqa    \XMM1, \XMMDst
+       movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM2, \TMP1
+       pshufd    $78, \XMM2, \TMP2
+       pxor      \XMM2, \TMP2
+       movdqa    HashKey_3(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+       movdqa    HashKey_3_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM2, \XMMDst
+       pxor      \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+       movdqa    \XMM3, \TMP1
+       pshufd    $78, \XMM3, \TMP2
+       pxor      \XMM3, \TMP2
+       movdqa    HashKey_2(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+       movdqa    HashKey_2_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM3, \XMMDst
+       pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+       movdqa    \XMM4, \TMP1
+       pshufd    $78, \XMM4, \TMP2
+       pxor      \XMM4, \TMP2
+       movdqa    HashKey(%rsp), \TMP5
+       pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+       pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+       movdqa    HashKey_k(%rsp), \TMP4
+       pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+       pxor      \TMP1, \TMP6
+       pxor      \XMM4, \XMMDst
+       pxor      \XMM1, \TMP2
+       pxor      \TMP6, \TMP2
+       pxor      \XMMDst, \TMP2
+       # middle section of the temp results combined as in karatsuba algorithm
+       movdqa    \TMP2, \TMP4
+       pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
+       psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
+       pxor      \TMP4, \XMMDst
+       pxor      \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+       # first phase of the reduction
+       movdqa    \XMMDst, \TMP2
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+       pslld     $31, \TMP2                # packed right shifting << 31
+       pslld     $30, \TMP3                # packed right shifting << 30
+       pslld     $25, \TMP4                # packed right shifting << 25
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       movdqa    \TMP2, \TMP7
+       psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+       pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+       pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+       movdqa    \XMMDst, \TMP2
+       # make 3 copies of XMMDst for doing 3 shift operations
+       movdqa    \XMMDst, \TMP3
+       movdqa    \XMMDst, \TMP4
+       psrld     $1, \TMP2                 # packed left shift >> 1
+       psrld     $2, \TMP3                 # packed left shift >> 2
+       psrld     $7, \TMP4                 # packed left shift >> 7
+       pxor      \TMP3, \TMP2              # xor the shifted versions
+       pxor      \TMP4, \TMP2
+       pxor      \TMP7, \TMP2
+       pxor      \TMP2, \XMMDst
+       pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+
+       pxor    (%arg1), \XMM0
+       aesenc  16(%arg1), \XMM0
+       aesenc  32(%arg1), \XMM0
+       aesenc  48(%arg1), \XMM0
+       aesenc  64(%arg1), \XMM0
+       aesenc  80(%arg1), \XMM0
+       aesenc  96(%arg1), \XMM0
+       aesenc  112(%arg1), \XMM0
+       aesenc  128(%arg1), \XMM0
+       aesenc  144(%arg1), \XMM0
+       aesenclast 160(%arg1), \XMM0
+.endm
+
+
+/*****************************************************************************
+*void aesni_gcm_dec(void *aes_ctx,
+*                           // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,
+*                            // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,
+*                            // Ciphertext input
+*                   u64 plaintext_len,
+*                            // Length of data in bytes for decryption.
+*                   u8 *iv,
+*           // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                    // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,
+*             // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,     // Additional Authentication Data (AAD)
+*                   u64 aad_len,
+*   // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,
+*            // Authenticated Tag output. The driver will compare this to the
+*    // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len);
+* // Authenticated Tag Length in bytes. Valid values are 16
+* // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first
+*       set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                        AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp                        # align rsp to 64 bytes
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13                    # %xmm13 = HashKey
+       pshufb  SHUF_MASK(%rip), %xmm13
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # Reduction
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+       movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+       mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
+       and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+       mov %r13, %r12
+       and $(3<<4), %r12
+       jz _initial_num_blocks_is_0_decrypt
+       cmp $(2<<4), %r12
+       jb _initial_num_blocks_is_1_decrypt
+       je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+       INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+       sub     $48, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+       sub     $32, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+       sub     $16, %r13
+       jmp     _initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+       cmp     $0, %r13
+       je      _zero_cipher_left_decrypt
+       sub     $64, %r13
+       je      _four_cipher_left_decrypt
+_decrypt_by_4:
+       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _decrypt_by_4
+_four_cipher_left_decrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                               # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+       paddd ONE(%rip), %xmm0         # increment CNT to get Yn
+       pshufb SHUF_MASK(%rip), %xmm0
+       ENCRYPT_SINGLE_BLOCK  %xmm0    # E(K, Yn)
+       sub $16, %r11
+       add %r13, %r11
+       movdqu (%arg3,%r11,1), %xmm1   # recieve the last <16 byte block
+       lea SHIFT_MASK+16(%rip), %r12
+       sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+       movdqu (%r12), %xmm2           # get the appropriate shuffle mask
+       pshufb %xmm2, %xmm1            # right shift 16-%r13 butes
+       movdqa  %xmm1, %xmm2
+       pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
+       movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+       pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
+       pand    %xmm1, %xmm2
+       pshufb SHUF_MASK(%rip),%xmm2
+       pxor %xmm2, %xmm8
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+                 # GHASH computation for the last <16 byte block
+       sub %r13, %r11
+       add $16, %r11
+
+        # output %r13 bytes
+       movq    %xmm0, %rax
+       cmp     $8, %r13
+       jle     _less_than_8_bytes_left_decrypt
+       mov     %rax, (%arg2 , %r11, 1)
+       add     $8, %r11
+       psrldq  $8, %xmm0
+       movq    %xmm0, %rax
+       sub     $8, %r13
+_less_than_8_bytes_left_decrypt:
+       mov     %al,  (%arg2, %r11, 1)
+       add     $1, %r11
+       shr     $8, %rax
+       sub     $1, %r13
+       jne     _less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+       mov     arg8, %r12                # %r13 = aadLen (number of bytes)
+       shl     $3, %r12                  # convert into number of bits
+       movd    %r12d, %xmm15             # len(A) in %xmm15
+       shl     $3, %arg4                 # len(C) in bits (*128)
+       movq    %arg4, %xmm1
+       pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+                # final GHASH computation
+       pshufb  SHUF_MASK(%rip), %xmm8
+       mov     %arg5, %rax               # %rax = *Y0
+       movdqu  (%rax), %xmm0             # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0     # E(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_decrypt:
+       mov     arg9, %r10                # %r10 = authTag
+       mov     arg10, %r11               # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_decrypt
+       cmp     $12, %r11
+       je      _T_12_decrypt
+_T_8_decrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_decrypt
+_T_12_decrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_decrypt
+_T_16_decrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_decrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,
+*                           // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,
+*                           // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,               // Plaintext input
+*                    u64 plaintext_len,
+*                                   // Length of data in bytes for encryption.
+*                    u8 *iv,
+*            // Pre-counter block j0: 4 byte salt (from Security Association)
+*  // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                   // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,
+*            // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,   // Additional Authentication Data (AAD)
+*                    u64 aad_len,
+*   // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,               // Authenticated Tag output.
+*                    u64 auth_tag_len);
+* // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+*  12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
+*       first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                 AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                         AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+       push    %r12
+       push    %r13
+       push    %r14
+       mov     %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+       sub     $VARIABLE_OFFSET, %rsp
+       and     $~63, %rsp
+       mov     %arg6, %r12
+       movdqu  (%r12), %xmm13
+       pshufb  SHUF_MASK(%rip), %xmm13
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+       movdqa  %xmm13, %xmm2
+       psllq   $1, %xmm13
+       psrlq   $63, %xmm2
+       movdqa  %xmm2, %xmm1
+       pslldq  $8, %xmm2
+       psrldq  $8, %xmm1
+       por     %xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+       pshufd  $0x24, %xmm1, %xmm2
+       pcmpeqd TWOONE(%rip), %xmm2
+       pand    POLY(%rip), %xmm2
+       pxor    %xmm2, %xmm13
+       movdqa  %xmm13, HashKey(%rsp)
+       mov     %arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
+       and     $-16, %r13
+       mov     %r13, %r12
+
+        # Encrypt first few blocks
+
+       and     $(3<<4), %r12
+       jz      _initial_num_blocks_is_0_encrypt
+       cmp     $(2<<4), %r12
+       jb      _initial_num_blocks_is_1_encrypt
+       je      _initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+       INITIAL_BLOCKS  3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+       sub     $48, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+       INITIAL_BLOCKS  2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+       sub     $32, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+       INITIAL_BLOCKS  1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+       sub     $16, %r13
+       jmp     _initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+       INITIAL_BLOCKS  0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+       cmp     $0, %r13
+       je      _zero_cipher_left_encrypt
+       sub     $64, %r13
+       je      _four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+       GHASH_4_ENCRYPT_4_PARALLEL      %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+       mov     %arg4, %r13
+       and     $15, %r13                       # %r13 = arg4 (mod 16)
+       je      _multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+       paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
+       pshufb SHUF_MASK(%rip), %xmm0
+       ENCRYPT_SINGLE_BLOCK    %xmm0         # Encrypt(K, Yn)
+       sub $16, %r11
+       add %r13, %r11
+       movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
+       lea SHIFT_MASK+16(%rip), %r12
+       sub %r13, %r12
+       # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+       # (%r13 is the number of bytes in plaintext mod 16)
+       movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
+       pshufb  %xmm2, %xmm1            # shift right 16-r13 byte
+       pxor    %xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+       pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+
+       pshufb  SHUF_MASK(%rip),%xmm0
+       pxor    %xmm0, %xmm8
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+       # GHASH computation for the last <16 byte block
+       sub     %r13, %r11
+       add     $16, %r11
+       pshufb SHUF_MASK(%rip), %xmm0
+       # shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+       movq %xmm0, %rax
+       cmp $8, %r13
+       jle _less_than_8_bytes_left_encrypt
+       mov %rax, (%arg2 , %r11, 1)
+       add $8, %r11
+       psrldq $8, %xmm0
+       movq %xmm0, %rax
+       sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+       mov %al,  (%arg2, %r11, 1)
+       add $1, %r11
+       shr $8, %rax
+       sub $1, %r13
+       jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+       mov     arg8, %r12    # %r12 = addLen (number of bytes)
+       shl     $3, %r12
+       movd    %r12d, %xmm15       # len(A) in %xmm15
+       shl     $3, %arg4               # len(C) in bits (*128)
+       movq    %arg4, %xmm1
+       pslldq  $8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15       # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+       # final GHASH computation
+
+       pshufb  SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+       mov     %arg5, %rax                    # %rax  = *Y0
+       movdqu  (%rax), %xmm0                  # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0          # Encrypt(K, Y0)
+       pxor    %xmm8, %xmm0
+_return_T_encrypt:
+       mov     arg9, %r10                     # %r10 = authTag
+       mov     arg10, %r11                    # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_encrypt
+       cmp     $12, %r11
+       je      _T_12_encrypt
+_T_8_encrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       jmp     _return_T_done_encrypt
+_T_12_encrypt:
+       movq    %xmm0, %rax
+       mov     %rax, (%r10)
+       psrldq  $8, %xmm0
+       movd    %xmm0, %eax
+       mov     %eax, 8(%r10)
+       jmp     _return_T_done_encrypt
+_T_16_encrypt:
+       movdqu  %xmm0, (%r10)
+_return_T_done_encrypt:
+       mov     %r14, %rsp
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       ret
+
+#include <asm/inst.h>
+
 _key_expansion_128:
 _key_expansion_256a:
        pshufd $0b11111111, %xmm1, %xmm1
diff -uNr a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
--- a/arch/x86/crypto/aesni-intel_glue.c        2010-08-20 19:55:55.000000000 +0100
+++ b/arch/x86/crypto/aesni-intel_glue.c        2010-09-23 22:43:01.000000000 +0100
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>

 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@
        struct cryptd_ablkcipher *cryptd_tfm;
 };

-#define AESNI_ALIGN    16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+       u8 hash_subkey[16];
+       struct crypto_aes_ctx aes_key_expanded;
+       u8 nonce[4];
+       struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+       int err;
+       struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+       u8 iv[16];
+       struct aesni_gcm_set_hash_subkey_result result;
+       struct scatterlist sg;
+};
+
+#define AESNI_ALIGN    (16)
 #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16

 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
                             unsigned int key_len);
@@ -62,6 +97,57 @@
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
                              const u8 *in, unsigned int len, u8 *iv);

+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+                       const u8 *in, unsigned long plaintext_len, u8 *iv,
+                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+                       u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+                       const u8 *in, unsigned long ciphertext_len, u8 *iv,
+                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+                       u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+       return
+               (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)
+               crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
        unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@
 };
 #endif

+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+       struct cryptd_aead *cryptd_tfm;
+       struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+       cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+       if (IS_ERR(cryptd_tfm))
+               return PTR_ERR(cryptd_tfm);
+       ctx->cryptd_tfm = cryptd_tfm;
+       tfm->crt_aead.reqsize = sizeof(struct aead_request)
+               + crypto_aead_reqsize(&cryptd_tfm->base);
+       return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+       struct aesni_rfc4106_gcm_ctx *ctx =
+               (struct aesni_rfc4106_gcm_ctx *)
+               PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+       if (!IS_ERR(ctx->cryptd_tfm))
+               cryptd_free_aead(ctx->cryptd_tfm);
+       return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+       struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+       if (err == -EINPROGRESS)
+               return;
+       result->err = err;
+       complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+       struct crypto_ablkcipher *ctr_tfm;
+       struct ablkcipher_request *req;
+       int ret = -EINVAL;
+       struct aesni_hash_subkey_req_data *req_data;
+
+       ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+       if (IS_ERR(ctr_tfm))
+               return PTR_ERR(ctr_tfm);
+
+       crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+       ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+       if (ret) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return ret;
+       }
+
+       req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+       if (!req) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return -EINVAL;
+       }
+
+       req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+       if (!req_data) {
+               crypto_free_ablkcipher(ctr_tfm);
+               return -ENOMEM;
+       }
+       memset(req_data->iv, 0, sizeof(req_data->iv));
+
+       /* Clear the data in the hash sub key container to zero.*/
+       /* We want to cipher all zeros to create the hash sub key. */
+       memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+       init_completion(&req_data->result.completion);
+       sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+       ablkcipher_request_set_tfm(req, ctr_tfm);
+       ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+                                       CRYPTO_TFM_REQ_MAY_BACKLOG,
+                                       rfc4106_set_hash_subkey_done,
+                                       &req_data->result);
+
+       ablkcipher_request_set_crypt(req, &req_data->sg,
+               &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+       ret = crypto_ablkcipher_encrypt(req);
+       if (ret == -EINPROGRESS || ret == -EBUSY) {
+               ret = wait_for_completion_interruptible
+                       (&req_data->result.completion);
+               if (!ret)
+                       ret = req_data->result.err;
+       }
+       ablkcipher_request_free(req);
+       kfree(req_data);
+       crypto_free_ablkcipher(ctr_tfm);
+       return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+                                                  unsigned int key_len)
+{
+       int ret = 0;
+       struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+       u8 *new_key_mem = NULL;
+
+       if (key_len < 4) {
+               crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+       /*Account for 4 byte nonce at the end.*/
+       key_len -= 4;
+       if (key_len != AES_KEYSIZE_128) {
+               crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+               return -EINVAL;
+       }
+
+       memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+       /*This must be on a 16 byte boundary!*/
+       if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+               return -EINVAL;
+
+       if ((unsigned long)key % AESNI_ALIGN) {
+               /*key is not aligned: use an auxuliar aligned pointer*/
+               new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+               if (!new_key_mem)
+                       return -ENOMEM;
+
+               new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+               memcpy(new_key_mem, key, key_len);
+               key = new_key_mem;
+       }
+
+       if (!irq_fpu_usable())
+               ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+               key, key_len);
+       else {
+               kernel_fpu_begin();
+               ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+               kernel_fpu_end();
+       }
+       /*This must be on a 16 byte boundary!*/
+       if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+               ret = -EINVAL;
+               goto exit;
+       }
+       ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+       kfree(new_key_mem);
+       return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+                               unsigned int authsize)
+{
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       switch (authsize) {
+       case 8:
+       case 12:
+       case 16:
+               break;
+       default:
+               return -EINVAL;
+       }
+       crypto_aead_crt(parent)->authsize = authsize;
+       crypto_aead_crt(cryptd_child)->authsize = authsize;
+       return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+       int ret;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       if (!irq_fpu_usable()) {
+               struct aead_request *cryptd_req =
+                       (struct aead_request *) aead_request_ctx(req);
+               memcpy(cryptd_req, req, sizeof(*req));
+               aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+               return crypto_aead_encrypt(cryptd_req);
+       } else {
+               kernel_fpu_begin();
+               ret = cryptd_child->base.crt_aead.encrypt(req);
+               kernel_fpu_end();
+               return ret;
+       }
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+       int ret;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+       if (!irq_fpu_usable()) {
+               struct aead_request *cryptd_req =
+                       (struct aead_request *) aead_request_ctx(req);
+               memcpy(cryptd_req, req, sizeof(*req));
+               aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+               return crypto_aead_decrypt(cryptd_req);
+       } else {
+               kernel_fpu_begin();
+               ret = cryptd_child->base.crt_aead.decrypt(req);
+               kernel_fpu_end();
+               return ret;
+       }
+}
+
+static struct crypto_alg rfc4106_alg = {
+       .cra_name = "rfc4106(gcm(aes))",
+       .cra_driver_name = "rfc4106-gcm-aesni",
+       .cra_priority = 400,
+       .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+       .cra_blocksize = 1,
+       .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+       .cra_alignmask = 0,
+       .cra_type = &crypto_nivaead_type,
+       .cra_module = THIS_MODULE,
+       .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+       .cra_init = rfc4106_init,
+       .cra_exit = rfc4106_exit,
+       .cra_u = {
+               .aead = {
+                       .setkey = rfc4106_set_key,
+                       .setauthsize = rfc4106_set_authsize,
+                       .encrypt = rfc4106_encrypt,
+                       .decrypt = rfc4106_decrypt,
+                       .geniv = "seqiv",
+                       .ivsize = 8,
+                       .maxauthsize = 16,
+               },
+       },
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+       u8 one_entry_in_sg = 0;
+       u8 *src, *dst, *assoc;
+       __be32 counter = cpu_to_be32(1);
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       void *aes_ctx = &(ctx->aes_key_expanded);
+       unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+       u8 iv_tab[16+AESNI_ALIGN];
+       u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+       struct scatter_walk src_sg_walk;
+       struct scatter_walk assoc_sg_walk;
+       struct scatter_walk dst_sg_walk;
+       unsigned int i;
+
+       /* Assuming we are supporting rfc4106 64-bit extended */
+       /* sequence numbers We need to have the AAD length equal */
+       /* to 8 or 12 bytes */
+       if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+               return -EINVAL;
+       /* IV below built */
+       for (i = 0; i < 4; i++)
+               *(iv+i) = ctx->nonce[i];
+       for (i = 0; i < 8; i++)
+               *(iv+4+i) = req->iv[i];
+       *((__be32 *)(iv+12)) = counter;
+
+       if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+               one_entry_in_sg = 1;
+               scatterwalk_start(&src_sg_walk, req->src);
+               scatterwalk_start(&assoc_sg_walk, req->assoc);
+               src = scatterwalk_map(&src_sg_walk, 0);
+               assoc = scatterwalk_map(&assoc_sg_walk, 0);
+               dst = src;
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_start(&dst_sg_walk, req->dst);
+                       dst = scatterwalk_map(&dst_sg_walk, 0);
+               }
+
+       } else {
+               /* Allocate memory for src, dst, assoc */
+               src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+                       GFP_ATOMIC);
+               if (unlikely(!src))
+                       return -ENOMEM;
+               assoc = (src + req->cryptlen + auth_tag_len);
+               scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+               scatterwalk_map_and_copy(assoc, req->assoc, 0,
+                                       req->assoclen, 0);
+               dst = src;
+       }
+
+       aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+               ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+               + ((unsigned long)req->cryptlen), auth_tag_len);
+
+       /* The authTag (aka the Integrity Check Value) needs to be written
+        * back to the packet. */
+       if (one_entry_in_sg) {
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_unmap(dst, 0);
+                       scatterwalk_done(&dst_sg_walk, 0, 0);
+               }
+               scatterwalk_unmap(src, 0);
+               scatterwalk_unmap(assoc, 0);
+               scatterwalk_done(&src_sg_walk, 0, 0);
+               scatterwalk_done(&assoc_sg_walk, 0, 0);
+       } else {
+               scatterwalk_map_and_copy(dst, req->dst, 0,
+                       req->cryptlen + auth_tag_len, 1);
+               kfree(src);
+       }
+       return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+       u8 one_entry_in_sg = 0;
+       u8 *src, *dst, *assoc;
+       unsigned long tempCipherLen = 0;
+       __be32 counter = cpu_to_be32(1);
+       int retval = 0;
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+       void *aes_ctx = &(ctx->aes_key_expanded);
+       unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+       u8 iv_and_authTag[32+AESNI_ALIGN];
+       u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+       u8 *authTag = iv + 16;
+       struct scatter_walk src_sg_walk;
+       struct scatter_walk assoc_sg_walk;
+       struct scatter_walk dst_sg_walk;
+       unsigned int i;
+
+       if (unlikely((req->cryptlen < auth_tag_len) ||
+               (req->assoclen != 8 && req->assoclen != 12)))
+               return -EINVAL;
+       /* Assuming we are supporting rfc4106 64-bit extended */
+       /* sequence numbers We need to have the AAD length */
+       /* equal to 8 or 12 bytes */
+
+       tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+       /* IV below built */
+       for (i = 0; i < 4; i++)
+               *(iv+i) = ctx->nonce[i];
+       for (i = 0; i < 8; i++)
+               *(iv+4+i) = req->iv[i];
+       *((__be32 *)(iv+12)) = counter;
+
+       if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+               one_entry_in_sg = 1;
+               scatterwalk_start(&src_sg_walk, req->src);
+               scatterwalk_start(&assoc_sg_walk, req->assoc);
+               src = scatterwalk_map(&src_sg_walk, 0);
+               assoc = scatterwalk_map(&assoc_sg_walk, 0);
+               dst = src;
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_start(&dst_sg_walk, req->dst);
+                       dst = scatterwalk_map(&dst_sg_walk, 0);
+               }
+
+       } else {
+               /* Allocate memory for src, dst, assoc */
+               src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+               if (!src)
+                       return -ENOMEM;
+               assoc = (src + req->cryptlen + auth_tag_len);
+               scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+               scatterwalk_map_and_copy(assoc, req->assoc, 0,
+                       req->assoclen, 0);
+               dst = src;
+       }
+
+       aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+               ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+               authTag, auth_tag_len);
+
+       /* Compare generated tag with passed in tag. */
+       retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+               -EBADMSG : 0;
+
+       if (one_entry_in_sg) {
+               if (unlikely(req->src != req->dst)) {
+                       scatterwalk_unmap(dst, 0);
+                       scatterwalk_done(&dst_sg_walk, 0, 0);
+               }
+               scatterwalk_unmap(src, 0);
+               scatterwalk_unmap(assoc, 0);
+               scatterwalk_done(&src_sg_walk, 0, 0);
+               scatterwalk_done(&assoc_sg_walk, 0, 0);
+       } else {
+               scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+               kfree(src);
+       }
+       return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+       .cra_name               = "__gcm-aes-aesni",
+       .cra_driver_name        = "__driver-gcm-aes-aesni",
+       .cra_priority           = 0,
+       .cra_flags              = CRYPTO_ALG_TYPE_AEAD,
+       .cra_blocksize          = 1,
+       .cra_ctxsize    = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+       .cra_alignmask          = 0,
+       .cra_type               = &crypto_aead_type,
+       .cra_module             = THIS_MODULE,
+       .cra_list               = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+       .cra_u = {
+               .aead = {
+                       .encrypt        = __driver_rfc4106_encrypt,
+                       .decrypt        = __driver_rfc4106_decrypt,
+               },
+       },
+};
+
 static int __init aesni_init(void)
 {
        int err;
@@ -738,6 +1240,7 @@
                printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
                return -ENODEV;
        }
+
        if ((err = crypto_register_alg(&aesni_alg)))
                goto aes_err;
        if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@
        if ((err = crypto_register_alg(&ablk_xts_alg)))
                goto ablk_xts_err;
 #endif
-
+       err = crypto_register_alg(&__rfc4106_alg);
+       if (err)
+               goto __aead_gcm_err;
+       err = crypto_register_alg(&rfc4106_alg);
+       if (err)
+               goto aead_gcm_err;
        return err;

+aead_gcm_err:
+       crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+       crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@

 static void __exit aesni_exit(void)
 {
+       crypto_unregister_alg(&__rfc4106_alg);
+       crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
        crypto_unregister_alg(&ablk_xts_alg);
 #endif
********************************************************************

>-----Original Message-----
>From: Herbert Xu [mailto:herbert@gondor.apana.org.au]
>Sent: Thursday, September 23, 2010 8:59 AM
>To: Struk, Tadeusz
>Cc: Hoban, Adrian; Paoloni, Gabriele; O Mahony, Aidan; linux-crypto@vger.kernel.org; linux-kernel@vger.kernel.org
>Subject: Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

>On Mon, Sep 20, 2010 at 12:21:54PM +0100, Struk, Tadeusz wrote:
>> Hello Herbert,
>> I've reworked the patch 2/3 and successfully applied all of them.
>> The problem was spaces at EOL.
>> Please find the reworked patch below.
>>
>> >From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
>> From: Adrian Hoban <ahoban@allen.ir.intel.com>
>> Date: Fri, 10 Sep 2010 18:08:45 +0100
>> Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
>>
>> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
>> kernels. It supports 128-bit AES key size. This leverages the crypto
>> AEAD interface type to facilitate a combined AES & GCM operation to
>> be implemented in assembly code. The assembly code leverages Intel(R)
>> AES New Instructions and the PCLMULQDQ instruction.
>>
>> Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
>> Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
>> Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
>> Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
>> Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
>> Signed-off-by: James Guilford <james.guilford@intel.com>
>> Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
>
>Sorry, still doesn't work:
>
>$ git apply ~/p
>fatal: corrupt patch at line 67
>$ patch -s -p1 < ~/p
>patch: **** malformed patch at line 66: ALL_F
>$
>
>Cheers,
>--
>Email: Herbert Xu <herbert@gondor.apana.org.au>
>Home Page: http://gondor.apana.org.au/~herbert/
>PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-12-03  1:30           ` Herbert Xu
  2010-12-03 11:05             ` Struk, Tadeusz
@ 2010-12-14  2:21             ` Herbert Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Herbert Xu @ 2010-12-14  2:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Struk, Tadeusz, linux-kernel, linux-crypto, O Mahony, Aidan,
	Paoloni, Gabriele, Hoban, Adrian

On Fri, Dec 03, 2010 at 09:30:21AM +0800, Herbert Xu wrote:
>
> Andrew, I'll just back this out if it doesn't get resolved.

Andrew, the problem should be resolved in the current cryptodev
tree.  Please let me know if it still blows up for you.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-12-03  1:30           ` Herbert Xu
@ 2010-12-03 11:05             ` Struk, Tadeusz
  2010-12-14  2:21             ` Herbert Xu
  1 sibling, 0 replies; 19+ messages in thread
From: Struk, Tadeusz @ 2010-12-03 11:05 UTC (permalink / raw)
  To: Herbert Xu, Andrew Morton
  Cc: linux-kernel, linux-crypto, O Mahony, Aidan, Paoloni, Gabriele,
	Hoban, Adrian

Hi Herbert and Andrew,
Sorry for delay. We have the patch almost ready. We just want to do some more testing before we send it.
We should be ready to send it out next week. Next Friday (10 Dec) will be worst case date.
Thanks,
Tadeusz
 

-----Original Message-----
From: Herbert Xu [mailto:herbert@gondor.apana.org.au] 
Sent: Friday, December 03, 2010 1:30 AM
To: Andrew Morton
Cc: Struk, Tadeusz; linux-kernel@vger.kernel.org; linux-crypto@vger.kernel.org; O Mahony, Aidan; Paoloni, Gabriele; Hoban, Adrian
Subject: Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

On Thu, Dec 02, 2010 at 04:14:03PM -0800, Andrew Morton wrote:
>
> > > Tadeusz, could you please make an incremental patch that converts
> > > all the pshufb instructions to macros?
> > > 
> >
> > Will do.
> > Tadeusz
> > 
> 
> (top-posting repaired)
> 
> (busted attribution/quoting repaired)
> 
> Current mainline still has this failure.

Tadeusz, can you update us on your progress on this?

Andrew, I'll just back this out if it doesn't get resolved.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-12-03  0:14         ` Andrew Morton
@ 2010-12-03  1:30           ` Herbert Xu
  2010-12-03 11:05             ` Struk, Tadeusz
  2010-12-14  2:21             ` Herbert Xu
  0 siblings, 2 replies; 19+ messages in thread
From: Herbert Xu @ 2010-12-03  1:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Struk, Tadeusz, linux-kernel, linux-crypto, O Mahony, Aidan,
	Paoloni, Gabriele, Hoban, Adrian

On Thu, Dec 02, 2010 at 04:14:03PM -0800, Andrew Morton wrote:
>
> > > Tadeusz, could you please make an incremental patch that converts
> > > all the pshufb instructions to macros?
> > > 
> >
> > Will do.
> > Tadeusz
> > 
> 
> (top-posting repaired)
> 
> (busted attribution/quoting repaired)
> 
> Current mainline still has this failure.

Tadeusz, can you update us on your progress on this?

Andrew, I'll just back this out if it doesn't get resolved.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-11-18  9:38       ` Struk, Tadeusz
@ 2010-12-03  0:14         ` Andrew Morton
  2010-12-03  1:30           ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2010-12-03  0:14 UTC (permalink / raw)
  To: Struk, Tadeusz
  Cc: Herbert Xu, linux-kernel, linux-crypto, O Mahony, Aidan, Paoloni,
	Gabriele, Hoban, Adrian

On Thu, 18 Nov 2010 09:38:50 +0000
"Struk, Tadeusz" <tadeusz.struk@intel.com> wrote:

> -----Original Message-----
> > From: Herbert Xu [mailto:herbert@gondor.apana.org.au] 
> > Sent: Thursday, November 18, 2010 9:27 AM
> > To: Andrew Morton
> > Cc: Struk, Tadeusz; linux-kernel@vger.kernel.org; linux-crypto@vger.kernel.org; O Mahony, Aidan; Paoloni, Gabriele; Hoban, Adrian
> > Subject: Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
> > 
> > On Wed, Nov 17, 2010 at 05:23:31PM -0800, Andrew Morton wrote:
> > > On Thu, 4 Nov 2010 14:04:05 -0500
> > > Herbert Xu <herbert@gondor.hengli.com.au> wrote:
> > > 
> > > > On Thu, Oct 28, 2010 at 04:19:09PM +0100, tadeusz.struk@intel.com wrote:
> > > > > Hi Herbert,
> > > > >    We have reworked the assembly to use macros instead of the new aesni instructions.
> > > > 
> > > > Both applied.  Thanks for your efforts!
> > > 
> > > Whatever's in today's linux-next is a big fail with gas-2.16.1.
> > > 
> > > Which was entirely predictable!  How often has this happened recently??
> > > 
> > > 
> > > arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
> > > arch/x86/crypto/aesni-intel_asm.S:834: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm13'
> > > arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm5'
> > > arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
> > > arch/x86/crypto/aesni-intel_asm.S:866: Error: bad expression
> > 
> > Sorry, but I no longer have access to a binutils that is older
> > than 2.18 so I never saw this failure.
> > 
> > Tadeusz, could you please make an incremental patch that converts
> > all the pshufb instructions to macros?
> > 
>
> Will do.
> Tadeusz
> 

(top-posting repaired)

(busted attribution/quoting repaired)

Current mainline still has this failure.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-11-18  9:26     ` Herbert Xu
@ 2010-11-18  9:38       ` Struk, Tadeusz
  2010-12-03  0:14         ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Struk, Tadeusz @ 2010-11-18  9:38 UTC (permalink / raw)
  To: Herbert Xu, Andrew Morton
  Cc: linux-kernel, linux-crypto, O Mahony, Aidan, Paoloni, Gabriele,
	Hoban, Adrian


[-- Attachment #1.1: Type: text/plain, Size: 1704 bytes --]

Will do.
Tadeusz

-----Original Message-----
From: Herbert Xu [mailto:herbert@gondor.apana.org.au] 
Sent: Thursday, November 18, 2010 9:27 AM
To: Andrew Morton
Cc: Struk, Tadeusz; linux-kernel@vger.kernel.org; linux-crypto@vger.kernel.org; O Mahony, Aidan; Paoloni, Gabriele; Hoban, Adrian
Subject: Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

On Wed, Nov 17, 2010 at 05:23:31PM -0800, Andrew Morton wrote:
> On Thu, 4 Nov 2010 14:04:05 -0500
> Herbert Xu <herbert@gondor.hengli.com.au> wrote:
> 
> > On Thu, Oct 28, 2010 at 04:19:09PM +0100, tadeusz.struk@intel.com wrote:
> > > Hi Herbert,
> > >    We have reworked the assembly to use macros instead of the new aesni instructions.
> > 
> > Both applied.  Thanks for your efforts!
> 
> Whatever's in today's linux-next is a big fail with gas-2.16.1.
> 
> Which was entirely predictable!  How often has this happened recently??
> 
> 
> arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
> arch/x86/crypto/aesni-intel_asm.S:834: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm13'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm5'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: bad expression

Sorry, but I no longer have access to a binutils that is older
than 2.18 so I never saw this failure.

Tadeusz, could you please make an incremental patch that converts
all the pshufb instructions to macros?

Thanks!
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

[-- Attachment #1.2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 7072 bytes --]

[-- Attachment #2: Type: text/plain, Size: 541 bytes --]

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-11-18  1:23   ` Andrew Morton
@ 2010-11-18  9:26     ` Herbert Xu
  2010-11-18  9:38       ` Struk, Tadeusz
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2010-11-18  9:26 UTC (permalink / raw)
  To: Andrew Morton
  Cc: tadeusz.struk, linux-kernel, linux-crypto, aidan.o.mahony,
	gabriele.paoloni, adrian.hoban

On Wed, Nov 17, 2010 at 05:23:31PM -0800, Andrew Morton wrote:
> On Thu, 4 Nov 2010 14:04:05 -0500
> Herbert Xu <herbert@gondor.hengli.com.au> wrote:
> 
> > On Thu, Oct 28, 2010 at 04:19:09PM +0100, tadeusz.struk@intel.com wrote:
> > > Hi Herbert,
> > >    We have reworked the assembly to use macros instead of the new aesni instructions.
> > 
> > Both applied.  Thanks for your efforts!
> 
> Whatever's in today's linux-next is a big fail with gas-2.16.1.
> 
> Which was entirely predictable!  How often has this happened recently??
> 
> 
> arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
> arch/x86/crypto/aesni-intel_asm.S:834: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm13'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm5'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
> arch/x86/crypto/aesni-intel_asm.S:866: Error: bad expression

Sorry, but I no longer have access to a binutils that is older
than 2.18 so I never saw this failure.

Tadeusz, could you please make an incremental patch that converts
all the pshufb instructions to macros?

Thanks!
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-11-04 19:04 ` Herbert Xu
@ 2010-11-18  1:23   ` Andrew Morton
  2010-11-18  9:26     ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2010-11-18  1:23 UTC (permalink / raw)
  To: Herbert Xu
  Cc: tadeusz.struk, linux-kernel, linux-crypto, aidan.o.mahony,
	gabriele.paoloni, adrian.hoban

On Thu, 4 Nov 2010 14:04:05 -0500
Herbert Xu <herbert@gondor.hengli.com.au> wrote:

> On Thu, Oct 28, 2010 at 04:19:09PM +0100, tadeusz.struk@intel.com wrote:
> > Hi Herbert,
> >    We have reworked the assembly to use macros instead of the new aesni instructions.
> 
> Both applied.  Thanks for your efforts!

Whatever's in today's linux-next is a big fail with gas-2.16.1.

Which was entirely predictable!  How often has this happened recently??


arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
arch/x86/crypto/aesni-intel_asm.S:834: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm13'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm5'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:866: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:866: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:866: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:866: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:866: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:866: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:871: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:871: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:871: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:871: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:871: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:876: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:876: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:876: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:876: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:881: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:881: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:881: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:881: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:889: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:889: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:889: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:889: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:889: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:905: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:915: Error: no such instruction: `pshufb %xmm2,%xmm1'
arch/x86/crypto/aesni-intel_asm.S:922: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:930: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:936: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:949: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:955: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:968: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:972: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:1084: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm13'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm5'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:1115: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1115: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:1120: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1120: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:1125: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1125: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: bad expression
arch/x86/crypto/aesni-intel_asm.S:1130: Error: junk at end of line, first unrecognized character is `0'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1130: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:15: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1141: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1141: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1141: Error: non-constant expression in ".if" statement
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm2'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm3'
arch/x86/crypto/aesni-intel_asm.S:1141: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm4'
arch/x86/crypto/aesni-intel_asm.S:1156: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1166: Error: no such instruction: `pshufb %xmm2,%xmm1'
arch/x86/crypto/aesni-intel_asm.S:1172: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1178: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm0'
arch/x86/crypto/aesni-intel_asm.S:1182: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:1188: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:1201: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:1208: Error: no such instruction: `pshufb SHUF_MASK(%rip),%xmm8'
arch/x86/crypto/aesni-intel_asm.S:1221: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:1225: Error: suffix or operands invalid for `movq'
make[2]: *** [arch/x86/crypto/aesni-intel_asm.o] Error 1
make[1]: *** [arch/x86/crypto] Error 2
make: *** [arch/x86] Error 2


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-10-28 15:19 tadeusz.struk
@ 2010-11-04 19:04 ` Herbert Xu
  2010-11-18  1:23   ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2010-11-04 19:04 UTC (permalink / raw)
  To: tadeusz.struk
  Cc: linux-kernel, linux-crypto, aidan.o.mahony, gabriele.paoloni,
	adrian.hoban

On Thu, Oct 28, 2010 at 04:19:09PM +0100, tadeusz.struk@intel.com wrote:
> Hi Herbert,
>    We have reworked the assembly to use macros instead of the new aesni instructions.

Both applied.  Thanks for your efforts!

>    Do you think it is possible to get it in into 2.6.37?

Sorry, it needed to have been ready much earlier to make this
merge window.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
@ 2010-10-28 15:19 tadeusz.struk
  2010-11-04 19:04 ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: tadeusz.struk @ 2010-10-28 15:19 UTC (permalink / raw)
  To: herbert
  Cc: linux-kernel, linux-crypto, aidan.o.mahony, gabriele.paoloni,
	adrian.hoban, tadeusz.struk

Hi Herbert,
   We have reworked the assembly to use macros instead of the new aesni instructions.
   Do you think it is possible to get it in into 2.6.37?
   Thanks,
   Tadeusz

>From 0ea40c3b63af3c4c3573cf5247ce855f06b45a9c Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@intel.com>
Date: Thu, 28 Oct 2010 15:01:58 +0100
Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

 This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
 kernels. It supports 128-bit AES key size. This leverages the crypto
 AEAD interface type to facilitate a combined AES & GCM operation to
 be implemented in assembly code. The assembly code leverages Intel(R)
 AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 1192 ++++++++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_glue.c |  518 ++++++++++++++++-
 2 files changed, 1708 insertions(+), 2 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..aafced5 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,17 @@
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +29,60 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
+
 .text
 
+
+#define	STACK_OFFSET    8*3
+#define	HashKey		16*0	// store HashKey <<1 mod poly here
+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey <<1 mod poly here
+				//(for Karatsuba purposes)
+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^2 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^3 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64
+				// bits of  HashKey^4 <<1 mod poly here
+				// (for Karatsuba purposes)
+#define	VARIABLE_OFFSET	16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+
+
 #define STATE1	%xmm0
 #define STATE2	%xmm4
 #define STATE3	%xmm5
@@ -47,6 +110,1135 @@
 #define T2	%r11
 #define TCTR_LOW T2
 
+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+	movdqa	  \GH, \TMP1
+	pshufd	  $78, \GH, \TMP2
+	pshufd	  $78, \HK, \TMP3
+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
+	PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
+	PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+	pxor	  \GH, \TMP2
+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
+	pxor	  \TMP3, \GH
+	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+	movdqa    \GH, \TMP2
+	movdqa    \GH, \TMP3
+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
+					# in in order to perform
+					# independent shifts
+	pslld     $31, \TMP2            # packed right shift <<31
+	pslld     $30, \TMP3            # packed right shift <<30
+	pslld     $25, \TMP4            # packed right shift <<25
+	pxor      \TMP3, \TMP2          # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+	pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
+					# in in order to perform
+					# independent shifts
+	movdqa    \GH,\TMP3
+	movdqa    \GH,\TMP4
+	psrld     $1,\TMP2              # packed left shift >>1
+	psrld     $2,\TMP3              # packed left shift >>2
+	psrld     $7,\TMP4              # packed left shift >>7
+	pxor      \TMP3,\TMP2		# xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \GH
+	pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+	pshufb	   SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
+	xor	   %r11, %r11 # initialise the data pointer offset as zero
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+	pshufb	   SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	movaps 0x10(%rdi), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	movaps 0x20(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x30(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x40(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x50(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x60(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x70(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x80(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0x90(%arg1), \TMP1
+	AESENC     \TMP1, %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	movaps 0xa0(%arg1), \TMP1
+	AESENCLAST \TMP1, %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)
+	# write back plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+.if \operation == dec
+	movdqa     \TMP1, %xmm\index
+.endif
+	pshufb	   SHUF_MASK(%rip), %xmm\index
+		# prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation
+	# no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+	pshufb	   SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+	pshufb	   SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+	pshufb	   SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+	pshufb	   SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	movaps 0x10*\index(%arg1), \TMP1
+	AESENC	   \TMP1, \XMM1
+	AESENC	   \TMP1, \XMM2
+	AESENC	   \TMP1, \XMM3
+	AESENC	   \TMP1, \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	movaps 0xa0(%arg1), \TMP2
+	AESENCLAST \TMP2, \XMM1
+	AESENCLAST \TMP2, \XMM2
+	AESENCLAST \TMP2, \XMM3
+	AESENCLAST \TMP2, \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+.if \operation == dec
+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM1
+.endif
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+.if \operation == dec
+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM2
+.endif
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+.if \operation == dec
+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM3
+.endif
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+.if \operation == dec
+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM4
+.else
+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+	add	   $64, %r11
+	pshufb	   SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+	pshufb	   SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	pshufb	  SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
+	PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	pshufb	  SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	movaps 0x10(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 1
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movaps 0x20(%arg1), \TMP1
+	AESENC	  \TMP1, \XMM1              # Round 2
+	AESENC	  \TMP1, \XMM2
+	AESENC	  \TMP1, \XMM3
+	AESENC	  \TMP1, \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	movaps 0x30(%arg1), \TMP3
+	AESENC    \TMP3, \XMM1              # Round 3
+	AESENC    \TMP3, \XMM2
+	AESENC    \TMP3, \XMM3
+	AESENC    \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	movaps 0x40(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 4
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x50(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 5
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	movaps 0x60(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1              # Round 6
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	movaps 0x70(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 7
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	movaps 0x80(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1             # Round 8
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	pxor	  \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	movaps 0x90(%arg1), \TMP3
+	AESENC	  \TMP3, \XMM1            # Round 9
+	AESENC	  \TMP3, \XMM2
+	AESENC	  \TMP3, \XMM3
+	AESENC	  \TMP3, \XMM4
+	PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	movaps 0xa0(%arg1), \TMP3
+	AESENCLAST \TMP3, \XMM1           # Round 10
+	AESENCLAST \TMP3, \XMM2
+	AESENCLAST \TMP3, \XMM3
+	AESENCLAST \TMP3, \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+	movdqa    \TMP3, \XMM1
+.endif
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM2
+.endif
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM3
+.endif
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM4
+.else
+    movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+    movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+    movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+    movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+.endif
+	pshufb	  SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM1, \TMP6
+	pshufd	  $78, \XMM1, \TMP2
+	pxor	  \XMM1, \TMP2
+	movdqa	  HashKey_4(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+	movdqa	  HashKey_4_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	movdqa	  \XMM1, \XMMDst
+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM2, \TMP1
+	pshufd	  $78, \XMM2, \TMP2
+	pxor	  \XMM2, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+	movdqa	  HashKey_3_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM2, \XMMDst
+	pxor	  \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM3, \TMP1
+	pshufd	  $78, \XMM3, \TMP2
+	pxor	  \XMM3, \TMP2
+	movdqa	  HashKey_2(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+	movdqa	  HashKey_2_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM3, \XMMDst
+	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+	movdqa	  \XMM4, \TMP1
+	pshufd	  $78, \XMM4, \TMP2
+	pxor	  \XMM4, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	PCLMULQDQ 0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
+	PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+	movdqa	  HashKey_k(%rsp), \TMP4
+	PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM4, \XMMDst
+	pxor	  \XMM1, \TMP2
+	pxor	  \TMP6, \TMP2
+	pxor	  \XMMDst, \TMP2
+	# middle section of the temp results combined as in karatsuba algorithm
+	movdqa	  \TMP2, \TMP4
+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
+	pxor	  \TMP4, \XMMDst
+	pxor	  \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+	# first phase of the reduction
+	movdqa    \XMMDst, \TMP2
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+	pslld     $31, \TMP2                # packed right shifting << 31
+	pslld     $30, \TMP3                # packed right shifting << 30
+	pslld     $25, \TMP4                # packed right shifting << 25
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP7
+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+	pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+	movdqa    \XMMDst, \TMP2
+	# make 3 copies of XMMDst for doing 3 shift operations
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+	psrld     $1, \TMP2                 # packed left shift >> 1
+	psrld     $2, \TMP3                 # packed left shift >> 2
+	psrld     $7, \TMP4                 # packed left shift >> 7
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	pxor      \TMP7, \TMP2
+	pxor      \TMP2, \XMMDst
+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+	pxor	(%arg1), \XMM0
+        movaps 16(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 32(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 48(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 64(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 80(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 96(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 112(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 128(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 144(%arg1), \TMP1
+	AESENC	\TMP1, \XMM0
+        movaps 160(%arg1), \TMP1
+	AESENCLAST	\TMP1, \XMM0
+.endm
+
+
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,      // Ciphertext input
+*                   u64 plaintext_len, // Length of data in bytes for decryption.
+*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,     // Additional Authentication Data (AAD)
+*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
+*                                      // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+*                                      // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first
+*       set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                        AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp                        # align rsp to 64 bytes
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # Reduction
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+	movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+	mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
+	and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+	mov %r13, %r12
+	and $(3<<4), %r12
+	jz _initial_num_blocks_is_0_decrypt
+	cmp $(2<<4), %r12
+	jb _initial_num_blocks_is_1_decrypt
+	je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+	INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+	sub	$48, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+	sub	$32, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+	sub	$16, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+	cmp	$0, %r13
+	je	_zero_cipher_left_decrypt
+	sub	$64, %r13
+	je	_four_cipher_left_decrypt
+_decrypt_by_4:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_decrypt_by_4
+_four_cipher_left_decrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+	mov	%arg4, %r13
+	and	$15, %r13				# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+	paddd ONE(%rip), %xmm0         # increment CNT to get Yn
+	pshufb SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
+	sub $16, %r11
+	add %r13, %r11
+	movdqu (%arg3,%r11,1), %xmm1   # recieve the last <16 byte block
+	lea SHIFT_MASK+16(%rip), %r12
+	sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
+	pshufb %xmm2, %xmm1            # right shift 16-%r13 butes
+	movdqa  %xmm1, %xmm2
+	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
+	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
+	pand    %xmm1, %xmm2
+	pshufb SHUF_MASK(%rip),%xmm2
+	pxor %xmm2, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	          # GHASH computation for the last <16 byte block
+	sub %r13, %r11
+	add $16, %r11
+
+        # output %r13 bytes
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_decrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_decrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
+	shl	$3, %r12		  # convert into number of bits
+	movd	%r12d, %xmm15		  # len(A) in %xmm15
+	shl	$3, %arg4		  # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	         # final GHASH computation
+	pshufb	SHUF_MASK(%rip), %xmm8
+	mov	%arg5, %rax		  # %rax = *Y0
+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_decrypt:
+	mov	arg9, %r10                # %r10 = authTag
+	mov	arg10, %r11               # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_decrypt
+	cmp	$12, %r11
+	je	_T_12_decrypt
+_T_8_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_decrypt
+_T_12_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_decrypt
+_T_16_decrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_decrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,       // Plaintext input
+*                    u64 plaintext_len,  // Length of data in bytes for encryption.
+*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,      // Additional Authentication Data (AAD)
+*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,       // Authenticated Tag output.
+*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+*                                        // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the
+*       first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                 AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                         AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
+*       The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+*       For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13
+	movdqa	%xmm13, HashKey(%rsp)
+	mov	%arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
+	and	$-16, %r13
+	mov	%r13, %r12
+
+        # Encrypt first few blocks
+
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_encrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_encrypt
+	je	_initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+	sub	$48, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+	sub	$32, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+	sub	$16, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+	cmp	$0, %r13
+	je	_zero_cipher_left_encrypt
+	sub	$64, %r13
+	je	_four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+	mov	%arg4, %r13
+	and	$15, %r13			# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
+	pshufb SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
+	sub $16, %r11
+	add %r13, %r11
+	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
+	lea SHIFT_MASK+16(%rip), %r12
+	sub %r13, %r12
+	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+	# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
+	pshufb	%xmm2, %xmm1            # shift right 16-r13 byte
+	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
+	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+
+	pshufb	SHUF_MASK(%rip),%xmm0
+	pxor	%xmm0, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+	pshufb SHUF_MASK(%rip), %xmm0
+	# shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+	movq %xmm0, %rax
+	cmp $8, %r13
+	jle _less_than_8_bytes_left_encrypt
+	mov %rax, (%arg2 , %r11, 1)
+	add $8, %r11
+	psrldq $8, %xmm0
+	movq %xmm0, %rax
+	sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+	mov %al,  (%arg2, %r11, 1)
+	add $1, %r11
+	shr $8, %rax
+	sub $1, %r13
+	jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+	mov	arg8, %r12    # %r12 = addLen (number of bytes)
+	shl	$3, %r12
+	movd	%r12d, %xmm15       # len(A) in %xmm15
+	shl	$3, %arg4               # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15       # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+	# final GHASH computation
+
+	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+	mov	%arg5, %rax		       # %rax  = *Y0
+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15         # Encrypt(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_encrypt:
+	mov	arg9, %r10                     # %r10 = authTag
+	mov	arg10, %r11                    # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_encrypt
+	cmp	$12, %r11
+	je	_T_12_encrypt
+_T_8_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_encrypt
+_T_12_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_encrypt
+_T_16_encrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_encrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+
 _key_expansion_128:
 _key_expansion_256a:
 	pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..02d349d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
 
 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
 
-#define AESNI_ALIGN	16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+	u8 hash_subkey[16];
+	struct crypto_aes_ctx aes_key_expanded;
+	u8 nonce[4];
+	struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+	u8 iv[16];
+	struct aesni_gcm_set_hash_subkey_result result;
+	struct scatterlist sg;
+};
+
+#define AESNI_ALIGN	(16)
 #define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+	return
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)
+		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
 	unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
 };
 #endif
 
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_aead.reqsize = sizeof(struct aead_request)
+		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	if (!IS_ERR(ctx->cryptd_tfm))
+		cryptd_free_aead(ctx->cryptd_tfm);
+	return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+	struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	result->err = err;
+	complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+	struct crypto_ablkcipher *ctr_tfm;
+	struct ablkcipher_request *req;
+	int ret = -EINVAL;
+	struct aesni_hash_subkey_req_data *req_data;
+
+	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+	if (IS_ERR(ctr_tfm))
+		return PTR_ERR(ctr_tfm);
+
+	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	if (ret) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return ret;
+	}
+
+	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -EINVAL;
+	}
+
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -ENOMEM;
+	}
+	memset(req_data->iv, 0, sizeof(req_data->iv));
+
+	/* Clear the data in the hash sub key container to zero.*/
+	/* We want to cipher all zeros to create the hash sub key. */
+	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+	init_completion(&req_data->result.completion);
+	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+	ablkcipher_request_set_tfm(req, ctr_tfm);
+	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					rfc4106_set_hash_subkey_done,
+					&req_data->result);
+
+	ablkcipher_request_set_crypt(req, &req_data->sg,
+		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+	ret = crypto_ablkcipher_encrypt(req);
+	if (ret == -EINPROGRESS || ret == -EBUSY) {
+		ret = wait_for_completion_interruptible
+			(&req_data->result.completion);
+		if (!ret)
+			ret = req_data->result.err;
+	}
+	ablkcipher_request_free(req);
+	kfree(req_data);
+	crypto_free_ablkcipher(ctr_tfm);
+	return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+						   unsigned int key_len)
+{
+	int ret = 0;
+	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	u8 *new_key_mem = NULL;
+
+	if (key_len < 4) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	/*Account for 4 byte nonce at the end.*/
+	key_len -= 4;
+	if (key_len != AES_KEYSIZE_128) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+		return -EINVAL;
+
+	if ((unsigned long)key % AESNI_ALIGN) {
+		/*key is not aligned: use an auxuliar aligned pointer*/
+		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+		if (!new_key_mem)
+			return -ENOMEM;
+
+		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_mem, key, key_len);
+		key = new_key_mem;
+	}
+
+	if (!irq_fpu_usable())
+		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+		key, key_len);
+	else {
+		kernel_fpu_begin();
+		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+		kernel_fpu_end();
+	}
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+	kfree(new_key_mem);
+	return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	crypto_aead_crt(parent)->authsize = authsize;
+	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.encrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.decrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static struct crypto_alg rfc4106_alg = {
+	.cra_name = "rfc4106(gcm(aes))",
+	.cra_driver_name = "rfc4106-gcm-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask = 0,
+	.cra_type = &crypto_nivaead_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+	.cra_init = rfc4106_init,
+	.cra_exit = rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey = rfc4106_set_key,
+			.setauthsize = rfc4106_set_authsize,
+			.encrypt = rfc4106_encrypt,
+			.decrypt = rfc4106_decrypt,
+			.geniv = "seqiv",
+			.ivsize = 8,
+			.maxauthsize = 16,
+		},
+	},
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	__be32 counter = cpu_to_be32(1);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_tab[16+AESNI_ALIGN];
+	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length equal */
+	/* to 8 or 12 bytes */
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+		return -EINVAL;
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+			GFP_ATOMIC);
+		if (unlikely(!src))
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+					req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+		+ ((unsigned long)req->cryptlen), auth_tag_len);
+
+	/* The authTag (aka the Integrity Check Value) needs to be written
+	 * back to the packet. */
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0,
+			req->cryptlen + auth_tag_len, 1);
+		kfree(src);
+	}
+	return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	unsigned long tempCipherLen = 0;
+	__be32 counter = cpu_to_be32(1);
+	int retval = 0;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_and_authTag[32+AESNI_ALIGN];
+	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+	u8 *authTag = iv + 16;
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	if (unlikely((req->cryptlen < auth_tag_len) ||
+		(req->assoclen != 8 && req->assoclen != 12)))
+		return -EINVAL;
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length */
+	/* equal to 8 or 12 bytes */
+
+	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!src)
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+			req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+		authTag, auth_tag_len);
+
+	/* Compare generated tag with passed in tag. */
+	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+		-EBADMSG : 0;
+
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		kfree(src);
+	}
+	return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize	= sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+	.cra_u = {
+		.aead = {
+			.encrypt	= __driver_rfc4106_encrypt,
+			.decrypt	= __driver_rfc4106_decrypt,
+		},
+	},
+};
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
 		printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
+
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
 	if ((err = crypto_register_alg(&ablk_xts_alg)))
 		goto ablk_xts_err;
 #endif
-
+	err = crypto_register_alg(&__rfc4106_alg);
+	if (err)
+		goto __aead_gcm_err;
+	err = crypto_register_alg(&rfc4106_alg);
+	if (err)
+		goto aead_gcm_err;
 	return err;
 
+aead_gcm_err:
+	crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_alg(&__rfc4106_alg);
+	crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 #endif
-- 
1.5.3.rc5

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-10-20 10:49       ` Hoban, Adrian
@ 2010-10-20 13:02         ` Herbert Xu
  0 siblings, 0 replies; 19+ messages in thread
From: Herbert Xu @ 2010-10-20 13:02 UTC (permalink / raw)
  To: Hoban, Adrian
  Cc: linux-crypto, linux-kernel, Struk, Tadeusz, O Mahony, Aidan,
	Paoloni, Gabriele

On Wed, Oct 20, 2010 at 11:49:10AM +0100, Hoban, Adrian wrote:
> Hi Herbert,
> 
> We have been using binutils v2.19. 
> 
> Can you please recommend the version(s) of binutils (gas) that we should test with before submitting a patch? V2.16.1, or perhaps the older version (2.12) described in /Documentation/Changes? 
> 
> We will retest with the recommended version, update the code and resubmit the patch ASAP. 

I was using 2.18 so if you can make it work with 2.16 I think it
should be fine as it wouldn't require any of the new AES instructions.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
       [not found]     ` <24483BA4C9F69C43A7379639D35543D7C2F99F27@irsmsx502.ger.corp.intel.com>
@ 2010-10-20 10:49       ` Hoban, Adrian
  2010-10-20 13:02         ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: Hoban, Adrian @ 2010-10-20 10:49 UTC (permalink / raw)
  To: Herbert Xu, linux-crypto
  Cc: linux-kernel, Struk, Tadeusz, O Mahony, Aidan, Paoloni, Gabriele

Hi Herbert,

We have been using binutils v2.19. 

Can you please recommend the version(s) of binutils (gas) that we should test with before submitting a patch? V2.16.1, or perhaps the older version (2.12) described in /Documentation/Changes? 

We will retest with the recommended version, update the code and resubmit the patch ASAP. 

Cheers,
Adrian


-----Original Message-----
From: Herbert Xu [mailto:herbert@gondor.apana.org.au] 
Sent: Tuesday, October 19, 2010 1:49 PM
To: Struk, Tadeusz
Cc: linux-kernel@vger.kernel.org; linux-crypto@vger.kernel.org; O Mahony, Aidan; Paoloni, Gabriele; Hoban, Adrian
Subject: Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

On Tue, Oct 12, 2010 at 09:31:41AM +0100, tadeusz.struk@intel.com wrote:
> Hi Herbert,
> Resubmitting the two other patches as requested.
> Thanks,
> Tadeusz
> 
> >From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
> From: Adrian Hoban <ahoban@allen.ir.intel.com>
> Date: Fri, 10 Sep 2010 18:08:45 +0100
> Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
> 
> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
> kernels. It supports 128-bit AES key size. This leverages the crypto
> AEAD interface type to facilitate a combined AES & GCM operation to
> be implemented in assembly code. The assembly code leverages Intel(R)
> AES New Instructions and the PCLMULQDQ instruction.

OK this patch applies now at least.  However it doesn't build:

  AS [M]  arch/x86/crypto/aesni-intel_asm.o
arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm8'
...

I suppose you can't rely on these new binutils instructions just
yet.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
  2010-10-12  8:31 tadeusz.struk
@ 2010-10-19 12:49 ` Herbert Xu
       [not found]   ` <44813D8942D35947805CBEEA94195BB001B135B69E@irsmsx505.ger.corp.intel.com>
  0 siblings, 1 reply; 19+ messages in thread
From: Herbert Xu @ 2010-10-19 12:49 UTC (permalink / raw)
  To: tadeusz.struk
  Cc: linux-kernel, linux-crypto, aidan.o.mahony, gabriele.paoloni,
	adrian.hoban

On Tue, Oct 12, 2010 at 09:31:41AM +0100, tadeusz.struk@intel.com wrote:
> Hi Herbert,
> Resubmitting the two other patches as requested.
> Thanks,
> Tadeusz
> 
> >From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
> From: Adrian Hoban <ahoban@allen.ir.intel.com>
> Date: Fri, 10 Sep 2010 18:08:45 +0100
> Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
> 
> This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
> kernels. It supports 128-bit AES key size. This leverages the crypto
> AEAD interface type to facilitate a combined AES & GCM operation to
> be implemented in assembly code. The assembly code leverages Intel(R)
> AES New Instructions and the PCLMULQDQ instruction.

OK this patch applies now at least.  However it doesn't build:

  AS [M]  arch/x86/crypto/aesni-intel_asm.o
arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm6'
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm7'
arch/x86/crypto/aesni-intel_asm.S:803: Error: no such instruction: `aesenc 16*1(%rdi),%xmm8'
...

I suppose you can't rely on these new binutils instructions just
yet.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
@ 2010-10-12  8:31 tadeusz.struk
  2010-10-19 12:49 ` Herbert Xu
  0 siblings, 1 reply; 19+ messages in thread
From: tadeusz.struk @ 2010-10-12  8:31 UTC (permalink / raw)
  To: herbert
  Cc: linux-kernel, linux-crypto, aidan.o.mahony, gabriele.paoloni,
	adrian.hoban, tadeusz.struk

Hi Herbert,
Resubmitting the two other patches as requested.
Thanks,
Tadeusz

>From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
From: Adrian Hoban <ahoban@allen.ir.intel.com>
Date: Fri, 10 Sep 2010 18:08:45 +0100
Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports 128-bit AES key size. This leverages the crypto
AEAD interface type to facilitate a combined AES & GCM operation to
be implemented in assembly code. The assembly code leverages Intel(R)
AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 1112 +++++++++++++++++++++++++++++++++++-
 arch/x86/crypto/aesni-intel_glue.c |  518 +++++++++++++++++-
 2 files changed, 1627 insertions(+), 3 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..3950769 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -8,6 +8,17 @@
  *    Author: Huang Ying <ying.huang@intel.com>
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,9 +27,50 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/inst.h>
+
+
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
 
 .text
+#define	STACK_OFFSET    8*3
+#define	HashKey		16*0	// store HashKey <<1 mod poly here
+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64 bits of  HashKey <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64 bits of  HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64 bits of  HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64 bits of  HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+#define	VARIABLE_OFFSET	16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+
 
 #define STATE1	%xmm0
 #define STATE2	%xmm4
@@ -47,6 +99,1064 @@
 #define T2	%r11
 #define TCTR_LOW T2
 
+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+	movdqa	  \GH, \TMP1
+	pshufd	  $78, \GH, \TMP2
+	pshufd	  $78, \HK, \TMP3
+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
+	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
+	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
+	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+	pxor	  \GH, \TMP2
+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
+	pxor	  \TMP3, \GH
+	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+	movdqa    \GH, \TMP2
+	movdqa    \GH, \TMP3
+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+	pslld     $31, \TMP2            # packed right shift <<31
+	pslld     $30, \TMP3            # packed right shift <<30
+	pslld     $25, \TMP4            # packed right shift <<25
+	pxor      \TMP3, \TMP2          # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+	pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+	movdqa    \GH,\TMP3
+	movdqa    \GH,\TMP4
+	psrld     $1,\TMP2              # packed left shift >>1
+	psrld     $2,\TMP3              # packed left shift >>2
+	psrld     $7,\TMP4              # packed left shift >>7
+	pxor      \TMP3,\TMP2		# xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \GH
+	pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+	pshufb	   SHUF_MASK(%rip), %xmm\i          # byte-reflect the AAD data
+	xor	   %r11, %r11                       # initialise the data pointer offset as zero
+
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+	pshufb	   SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	aesenc     16*1(%rdi), %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	aesenc     16*2(%arg1), %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	aesenc     16*3(%arg1), %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+	aesenc     16*4(%arg1), %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+	aesenc     16*5(%arg1), %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+	aesenc     16*6(%arg1), %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+	aesenc     16*7(%arg1), %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+	aesenc     16*8(%arg1), %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+	aesenc     16*9(%arg1), %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+	aesenclast 16*10(%arg1), %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)    # write back plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+.if \operation == dec
+	movdqa     \TMP1, %xmm\index
+.endif
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation   # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+	pshufb	   SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+	pshufb	   SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+	pshufb	   SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+	pshufb	   SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)                           # HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	aesenclast 160(%arg1), \XMM1
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+.if \operation == dec
+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM1
+.endif
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+.if \operation == dec
+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM2
+.endif
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+.if \operation == dec
+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM3
+.endif
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+.if \operation == dec
+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM4
+.else
+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+	add	   $64, %r11
+	pshufb	   SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1         # combine GHASHed value with the corresponding ciphertext
+	pshufb	   SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	pshufb	  SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
+	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	pshufb	  SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	aesenc	  16(%arg1), \XMM1              # Round 1
+	aesenc	  16(%arg1), \XMM2
+	aesenc	  16(%arg1), \XMM3
+	aesenc	  16(%arg1), \XMM4
+	aesenc	  32(%arg1), \XMM1              # Round 2
+	aesenc	  32(%arg1), \XMM2
+	aesenc	  32(%arg1), \XMM3
+	aesenc	  32(%arg1), \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	aesenc    48(%arg1), \XMM1              # Round 3
+	aesenc    48(%arg1), \XMM2
+	aesenc    48(%arg1), \XMM3
+	aesenc    48(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	aesenc	  64(%arg1), \XMM1              # Round 4
+	aesenc	  64(%arg1), \XMM2
+	aesenc	  64(%arg1), \XMM3
+	aesenc	  64(%arg1), \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  80(%arg1), \XMM1              # Round 5
+	aesenc	  80(%arg1), \XMM2
+	aesenc	  80(%arg1), \XMM3
+	aesenc	  80(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	aesenc	  96(%arg1), \XMM1              # Round 6
+	aesenc	  96(%arg1), \XMM2
+	aesenc	  96(%arg1), \XMM3
+	aesenc	  96(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	aesenc	  112(%arg1), \XMM1             # Round 7
+	aesenc	  112(%arg1), \XMM2
+	aesenc	  112(%arg1), \XMM3
+	aesenc	  112(%arg1), \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  128(%arg1), \XMM1             # Round 8
+	aesenc	  128(%arg1), \XMM2
+	aesenc	  128(%arg1), \XMM3
+	aesenc	  128(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	aesenc	  144(%arg1), \XMM1            # Round 9
+	aesenc	  144(%arg1), \XMM2
+	aesenc	  144(%arg1), \XMM3
+	aesenc	  144(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	aesenclast 160(%arg1), \XMM1           # Round 10
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+	movdqa    \TMP3, \XMM1
+.endif
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM2
+.endif
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM3
+.endif
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM4
+.else
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+.endif
+	pshufb	  SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1		       # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4		       # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2		       # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM1, \TMP6
+	pshufd	  $78, \XMM1, \TMP2
+	pxor	  \XMM1, \TMP2
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+	movdqa	  HashKey_4_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	movdqa	  \XMM1, \XMMDst
+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM2, \TMP1
+	pshufd	  $78, \XMM2, \TMP2
+	pxor	  \XMM2, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+	movdqa	  HashKey_3_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM2, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM3, \TMP1
+	pshufd	  $78, \XMM3, \TMP2
+	pxor	  \XMM3, \TMP2
+	movdqa	  HashKey_2(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+	movdqa	  HashKey_2_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM3, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM4, \TMP1
+	pshufd	  $78, \XMM4, \TMP2
+	pxor	  \XMM4, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+	movdqa	  HashKey_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM4, \XMMDst
+	pxor	  \XMM1, \TMP2
+	pxor	  \TMP6, \TMP2
+	pxor	  \XMMDst, \TMP2            # middle section of the temp results combined as in karatsuba algorithm
+	movdqa	  \TMP2, \TMP4
+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
+	pxor	  \TMP4, \XMMDst
+	pxor	  \TMP2, \TMP6              # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+
+        # first phase of the reduction
+
+	movdqa    \XMMDst, \TMP2
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4            # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+	pslld     $31, \TMP2                # packed right shifting << 31
+	pslld     $30, \TMP3                # packed right shifting << 30
+	pslld     $25, \TMP4                # packed right shifting << 25
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP7
+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+	pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+
+	movdqa    \XMMDst, \TMP2            # make 3 copies of XMMDst for doing 3 shift operations
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+	psrld     $1, \TMP2                 # packed left shift >> 1
+	psrld     $2, \TMP3                 # packed left shift >> 2
+	psrld     $7, \TMP4                 # packed left shift >> 7
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	pxor      \TMP7, \TMP2
+	pxor      \TMP2, \XMMDst
+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+
+	pxor	(%arg1), \XMM0
+	aesenc	16(%arg1), \XMM0
+	aesenc	32(%arg1), \XMM0
+	aesenc	48(%arg1), \XMM0
+	aesenc	64(%arg1), \XMM0
+	aesenc	80(%arg1), \XMM0
+	aesenc	96(%arg1), \XMM0
+	aesenc	112(%arg1), \XMM0
+	aesenc	128(%arg1), \XMM0
+	aesenc	144(%arg1), \XMM0
+	aesenclast 160(%arg1), \XMM0
+.endm
+
+
+/**********************************************************************************************
+*void aesni_gcm_dec(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,                     // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,                // Ciphertext input
+*                   u64 plaintext_len,           // Length of data in bytes for decryption.
+*                   u8 *iv,                      // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,             // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,               // Additional Authentication Data (AAD)
+*                   u64 aad_len,                 // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,               // Authenticated Tag output. The driver will compare this to the
+*                                                // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len);           // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*******************************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp                        # align rsp to 64 bytes
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # Precompute HashKey<<1 (mod poly) from the hash key (this is required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # Reduction
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13			# %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+	movdqa	%xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+	mov	%arg4, %r13                     # save the number of bytes of plaintext/ciphertext
+	and	$-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+	mov	%r13, %r12
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_decrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_decrypt
+	je	_initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+	sub	$48, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+	sub	$32, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+	sub	$16, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+	cmp	$0, %r13
+	je	_zero_cipher_left_decrypt
+	sub	$64, %r13
+	je	_four_cipher_left_decrypt
+_decrypt_by_4:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_decrypt_by_4
+_four_cipher_left_decrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+	mov	%arg4, %r13
+	and	$15, %r13				# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+	paddd	ONE(%rip), %xmm0                        # increment CNT to get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK 	%xmm0			# E(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1			# recieve the last <16 byte block
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12				# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+							# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2				# get the appropriate shuffle mask
+	pshufb	%xmm2, %xmm1				# right shift 16-%r13 butes
+	movdqa  %xmm1, %xmm2
+	pxor	%xmm1, %xmm0				# Ciphertext XOR E(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1           # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+	pand	%xmm1, %xmm0				# mask out top 16-%r13 bytes of %xmm0
+	pand    %xmm1, %xmm2
+	pshufb	SHUF_MASK(%rip),%xmm2
+	pxor	%xmm2, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+
+        # output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_decrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_decrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
+	shl	$3, %r12		  # convert into number of bits
+	movd	%r12d, %xmm15		  # len(A) in %xmm15
+	shl	$3, %arg4		  # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
+	pshufb	SHUF_MASK(%rip), %xmm8
+	mov	%arg5, %rax		  # %rax = *Y0
+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 	  # E(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_decrypt:
+	mov	arg9, %r10                # %r10 = authTag
+	mov	arg10, %r11               # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_decrypt
+	cmp	$12, %r11
+	je	_T_12_decrypt
+_T_8_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_decrypt
+_T_12_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_decrypt
+_T_16_decrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_decrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+/***************************************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,                    // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,               // Plaintext input
+*                    u64 plaintext_len,          // Length of data in bytes for encryption.
+*                    u8 *iv,                     // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,            // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,              // Additional Authentication Data (AAD)
+*                    u64 aad_len,                // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,               // Authenticated Tag output.
+*                    u64 auth_tag_len);          // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+**************************************************************************************************************************/
+ENTRY(aesni_gcm_enc)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13
+	movdqa	%xmm13, HashKey(%rsp)
+	mov	%arg4, %r13               # %xmm13 holds HashKey<<1 (mod poly)
+	and	$-16, %r13
+	mov	%r13, %r12
+
+        # Encrypt first few blocks
+
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_encrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_encrypt
+	je	_initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+	sub	$48, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+	sub	$32, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+	sub	$16, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+	cmp	$0, %r13
+	je	_zero_cipher_left_encrypt
+	sub	$64, %r13
+	je	_four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+	mov	%arg4, %r13
+	and	$15, %r13			# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+
+	paddd	ONE(%rip), %xmm0                # INCR CNT to get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 		# Encrypt(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1		# recieve the last <16 byte blocks
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12			# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+                                                # (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
+	pshufb	%xmm2, %xmm1			# shift right 16-r13 byte
+	pxor	%xmm1, %xmm0			# Plaintext XOR Encrypt(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
+
+	pshufb	SHUF_MASK(%rip),%xmm0
+	pxor	%xmm0, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+	pshufb	SHUF_MASK(%rip), %xmm0          # shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_encrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_encrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+	mov	arg8, %r12	               # %r12 = addLen (number of bytes)
+	shl	$3, %r12
+	movd	%r12d, %xmm15	               # len(A) in %xmm15
+	shl	$3, %arg4	               # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15	               # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15	               # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
+
+	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+	mov	%arg5, %rax		       # %rax  = *Y0
+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0          # Encrypt(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_encrypt:
+	mov	arg9, %r10                     # %r10 = authTag
+	mov	arg10, %r11                    # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_encrypt
+	cmp	$12, %r11
+	je	_T_12_encrypt
+_T_8_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_encrypt
+_T_12_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_encrypt
+_T_16_encrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_encrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+#include <asm/inst.h>
+
 _key_expansion_128:
 _key_expansion_256a:
 	pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..f0b0a6f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
 
 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
 
-#define AESNI_ALIGN	16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+	u8 hash_subkey[16];
+	struct crypto_aes_ctx aes_key_expanded;
+	u8 nonce[4];
+	struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+	u8 iv[16];
+	struct aesni_gcm_set_hash_subkey_result result;
+	struct scatterlist sg;
+};
+
+#define AESNI_ALIGN	(16)
 #define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+	return
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)
+		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
 	unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
 };
 #endif
 
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_aead.reqsize = sizeof(struct aead_request)
+		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	if (!IS_ERR(ctx->cryptd_tfm))
+		cryptd_free_aead(ctx->cryptd_tfm);
+	return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+	struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	result->err = err;
+	complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+	struct crypto_ablkcipher *ctr_tfm;
+	struct ablkcipher_request *req;
+	int ret = -EINVAL;
+	struct aesni_hash_subkey_req_data *req_data;
+
+	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+	if (IS_ERR(ctr_tfm))
+		return PTR_ERR(ctr_tfm);
+
+	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	if (ret) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return ret;
+	}
+
+	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -EINVAL;
+	}
+
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -ENOMEM;
+	}
+	memset(req_data->iv, 0, sizeof(req_data->iv));
+
+	/* Clear the data in the hash sub key container to zero.*/
+	/* We want to cipher all zeros to create the hash sub key. */
+	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+	init_completion(&req_data->result.completion);
+	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+	ablkcipher_request_set_tfm(req, ctr_tfm);
+	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					rfc4106_set_hash_subkey_done,
+					&req_data->result);
+
+	ablkcipher_request_set_crypt(req, &req_data->sg,
+		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+	ret = crypto_ablkcipher_encrypt(req);
+	if (ret == -EINPROGRESS || ret == -EBUSY) {
+		ret = wait_for_completion_interruptible
+			(&req_data->result.completion);
+		if (!ret)
+			ret = req_data->result.err;
+	}
+	ablkcipher_request_free(req);
+	kfree(req_data);
+	crypto_free_ablkcipher(ctr_tfm);
+	return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+						   unsigned int key_len)
+{
+	int ret = 0;
+	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	u8 *new_key_mem = NULL;
+
+	if (key_len < 4) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	/*Account for 4 byte nonce at the end.*/
+	key_len -= 4;
+	if (key_len != AES_KEYSIZE_128) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+		return -EINVAL;
+
+	if ((unsigned long)key % AESNI_ALIGN) {
+		/*key is not aligned: use an auxuliar aligned pointer*/
+		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+		if (!new_key_mem)
+			return -ENOMEM;
+
+		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_mem, key, key_len);
+		key = new_key_mem;
+	}
+
+	if (!irq_fpu_usable())
+		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+		key, key_len);
+	else {
+		kernel_fpu_begin();
+		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+		kernel_fpu_end();
+	}
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+	kfree(new_key_mem);
+	return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	crypto_aead_crt(parent)->authsize = authsize;
+	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.encrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.decrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static struct crypto_alg rfc4106_alg = {
+	.cra_name = "rfc4106(gcm(aes))",
+	.cra_driver_name = "rfc4106-gcm-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask = 0,
+	.cra_type = &crypto_nivaead_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+	.cra_init = rfc4106_init,
+	.cra_exit = rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey = rfc4106_set_key,
+			.setauthsize = rfc4106_set_authsize,
+			.encrypt = rfc4106_encrypt,
+			.decrypt = rfc4106_decrypt,
+			.geniv = "seqiv",
+			.ivsize = 8,
+			.maxauthsize = 16,
+		},
+	},
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	__be32 counter = cpu_to_be32(1);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_tab[16+AESNI_ALIGN];
+	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length equal */
+	/* to 8 or 12 bytes */
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+		return -EINVAL;
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+			GFP_ATOMIC);
+		if (unlikely(!src))
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+					req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+		+ ((unsigned long)req->cryptlen), auth_tag_len);
+
+	/* The authTag (aka the Integrity Check Value) needs to be written
+	 * back to the packet. */
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0,
+			req->cryptlen + auth_tag_len, 1);
+		kfree(src);
+	}
+	return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	unsigned long tempCipherLen = 0;
+	__be32 counter = cpu_to_be32(1);
+	int retval = 0;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_and_authTag[32+AESNI_ALIGN];
+	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+	u8 *authTag = iv + 16;
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	if (unlikely((req->cryptlen < auth_tag_len) ||
+		(req->assoclen != 8 && req->assoclen != 12)))
+		return -EINVAL;
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length */
+	/* equal to 8 or 12 bytes */
+
+	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!src)
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+			req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+		authTag, auth_tag_len);
+
+	/* Compare generated tag with passed in tag. */
+	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+		-EBADMSG : 0;
+
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		kfree(src);
+	}
+	return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+	.cra_u = {
+		.aead = {
+			.encrypt	= __driver_rfc4106_encrypt,
+			.decrypt	= __driver_rfc4106_decrypt,
+		},
+	},
+};
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
 		printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
+
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
 	if ((err = crypto_register_alg(&ablk_xts_alg)))
 		goto ablk_xts_err;
 #endif
-
+	err = crypto_register_alg(&__rfc4106_alg);
+	if (err)
+		goto __aead_gcm_err;
+	err = crypto_register_alg(&rfc4106_alg);
+	if (err)
+		goto aead_gcm_err;
 	return err;
 
+aead_gcm_err:
+	crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_alg(&__rfc4106_alg);
+	crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 #endif
-- 
1.5.3.rc5

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions
@ 2010-10-11 15:48 tadeusz.struk
  0 siblings, 0 replies; 19+ messages in thread
From: tadeusz.struk @ 2010-10-11 15:48 UTC (permalink / raw)
  To: herbert
  Cc: linux-kernel, linux-crypto, aidan.o.mahony, gabriele.paoloni,
	adrian.hoban, tadeusz.struk

>From 06444d8a95458d807ae14699e557739281d0b026 Mon Sep 17 00:00:00 2001
From: Adrian Hoban <ahoban@allen.ir.intel.com>
Date: Fri, 10 Sep 2010 18:08:45 +0100
Subject: [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions

This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports 128-bit AES key size. This leverages the crypto
AEAD interface type to facilitate a combined AES & GCM operation to
be implemented in assembly code. The assembly code leverages Intel(R)
AES New Instructions and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
---
 arch/x86/crypto/aesni-intel_asm.S  | 1112 +++++++++++++++++++++++++++++++++++-
 arch/x86/crypto/aesni-intel_glue.c |  518 +++++++++++++++++-
 2 files changed, 1627 insertions(+), 3 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756..3950769 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -8,6 +8,17 @@
  *    Author: Huang Ying <ying.huang@intel.com>
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *             Adrian Hoban <adrian.hoban@intel.com>
+ *             James Guilford (james.guilford@intel.com)
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Wajdi Feghali (wajdi.k.feghali@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -16,9 +27,50 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/inst.h>
+
+
+.data
+POLY:   .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1:      .octa 0x0000000000000000ffffffffffffffff
+MASK2:      .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+ZERO:       .octa 0x00000000000000000000000000000000
+ONE:        .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec:        .octa 0x1
+enc:        .octa 0x2
+
 
 .text
+#define	STACK_OFFSET    8*3
+#define	HashKey		16*0	// store HashKey <<1 mod poly here
+#define	HashKey_2	16*1	// store HashKey^2 <<1 mod poly here
+#define	HashKey_3	16*2	// store HashKey^3 <<1 mod poly here
+#define	HashKey_4	16*3	// store HashKey^4 <<1 mod poly here
+#define	HashKey_k	16*4	// store XOR of High 64 bits and Low 64 bits of  HashKey <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_2_k	16*5	// store XOR of High 64 bits and Low 64 bits of  HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_3_k	16*6	// store XOR of High 64 bits and Low 64 bits of  HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+#define	HashKey_4_k	16*7	// store XOR of High 64 bits and Low 64 bits of  HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+#define	VARIABLE_OFFSET	16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+
 
 #define STATE1	%xmm0
 #define STATE2	%xmm4
@@ -47,6 +99,1064 @@
 #define T2	%r11
 #define TCTR_LOW T2
 
+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro	GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+	movdqa	  \GH, \TMP1
+	pshufd	  $78, \GH, \TMP2
+	pshufd	  $78, \HK, \TMP3
+	pxor	  \GH, \TMP2            # TMP2 = a1+a0
+	pxor	  \HK, \TMP3            # TMP3 = b1+b0
+	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
+	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
+	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
+	pxor	  \GH, \TMP2
+	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
+	pxor	  \TMP3, \GH
+	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
+
+        # first phase of the reduction
+
+	movdqa    \GH, \TMP2
+	movdqa    \GH, \TMP3
+	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+	pslld     $31, \TMP2            # packed right shift <<31
+	pslld     $30, \TMP3            # packed right shift <<30
+	pslld     $25, \TMP4            # packed right shift <<25
+	pxor      \TMP3, \TMP2          # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5             # right shift TMP5 1 DW
+	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
+	pxor      \TMP2, \GH
+
+        # second phase of the reduction
+
+	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4 in in order to perform independent shifts
+	movdqa    \GH,\TMP3
+	movdqa    \GH,\TMP4
+	psrld     $1,\TMP2              # packed left shift >>1
+	psrld     $2,\TMP3              # packed left shift >>2
+	psrld     $7,\TMP4              # packed left shift >>7
+	pxor      \TMP3,\TMP2		# xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \GH
+	pxor      \TMP1, \GH            # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+
+	mov	   arg7, %r10           # %r10 = AAD
+	mov	   arg8, %r12           # %r12 = aadLen
+	mov	   %r12, %r11
+	pxor	   %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+	movd	   (%r10), \TMP1
+	pslldq	   $12, \TMP1
+	psrldq	   $4, %xmm\i
+	pxor	   \TMP1, %xmm\i
+	add	   $4, %r10
+	sub	   $4, %r12
+	jne	   _get_AAD_loop\num_initial_blocks\operation
+	cmp	   $16, %r11
+	je	   _get_AAD_loop2_done\num_initial_blocks\operation
+	mov	   $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+	psrldq	   $4, %xmm\i
+	sub	   $4, %r12
+	cmp	   %r11, %r12
+	jne	   _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+	pshufb	   SHUF_MASK(%rip), %xmm\i          # byte-reflect the AAD data
+	xor	   %r11, %r11                       # initialise the data pointer offset as zero
+
+
+        # start AES for num_initial_blocks blocks
+
+	mov	   %arg5, %rax                      # %rax = *Y0
+	movdqu	   (%rax), \XMM0                    # XMM0 = Y0
+	pshufb	   SHUF_MASK(%rip), \XMM0
+.if \i_seq != 0
+.irpc index, \i_seq
+	paddd	   ONE(%rip), \XMM0                 # INCR Y0
+	movdqa	   \XMM0, %xmm\index
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # perform a 16 byte swap
+.endr
+.irpc index, \i_seq
+	pxor	   16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+	aesenc     16*1(%rdi), %xmm\index          # Round 1
+.endr
+.irpc index, \i_seq
+	aesenc     16*2(%arg1), %xmm\index          # Round 2
+.endr
+.irpc index, \i_seq
+	aesenc     16*3(%arg1), %xmm\index          # Round 3
+.endr
+.irpc index, \i_seq
+	aesenc     16*4(%arg1), %xmm\index          # Round 4
+.endr
+.irpc index, \i_seq
+	aesenc     16*5(%arg1), %xmm\index          # Round 5
+.endr
+.irpc index, \i_seq
+	aesenc     16*6(%arg1), %xmm\index          # Round 6
+.endr
+.irpc index, \i_seq
+	aesenc     16*7(%arg1), %xmm\index          # Round 7
+.endr
+.irpc index, \i_seq
+	aesenc     16*8(%arg1), %xmm\index          # Round 8
+.endr
+.irpc index, \i_seq
+	aesenc     16*9(%arg1), %xmm\index          # Round 9
+.endr
+.irpc index, \i_seq
+	aesenclast 16*10(%arg1), %xmm\index         # Round 10
+.endr
+.irpc index, \i_seq
+	movdqu	   (%arg3 , %r11, 1), \TMP1
+	pxor	   \TMP1, %xmm\index
+	movdqu	   %xmm\index, (%arg2 , %r11, 1)    # write back plaintext/ciphertext for num_initial_blocks
+	add	   $16, %r11
+.if \operation == dec
+	movdqa     \TMP1, %xmm\index
+.endif
+	pshufb	   SHUF_MASK(%rip), %xmm\index      # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
+        # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+        pxor       %xmm5, %xmm6
+	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+        pxor       %xmm6, %xmm7
+	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+        pxor       %xmm7, %xmm8
+	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+	cmp	   $64, %r13
+	jl	_initial_blocks_done\num_initial_blocks\operation   # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM1
+	pshufb	   SHUF_MASK(%rip), \XMM1        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM2
+	pshufb	   SHUF_MASK(%rip), \XMM2        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM3
+	pshufb	   SHUF_MASK(%rip), \XMM3        # perform a 16 byte swap
+	paddd	   ONE(%rip), \XMM0              # INCR Y0
+	movdqa	   \XMM0, \XMM4
+	pshufb	   SHUF_MASK(%rip), \XMM4        # perform a 16 byte swap
+	pxor	   16*0(%arg1), \XMM1
+	pxor	   16*0(%arg1), \XMM2
+	pxor	   16*0(%arg1), \XMM3
+	pxor	   16*0(%arg1), \XMM4
+	movdqa	   \TMP3, \TMP5
+	pshufd	   $78, \TMP3, \TMP1
+	pxor	   \TMP3, \TMP1
+	movdqa	   \TMP1, HashKey_k(%rsp)
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^2<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_2(%rsp)                           # HashKey_2 = HashKey^2<<1 (mod poly)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_3(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+	aesenc	   16*\index(%arg1), \XMM1
+	aesenc	   16*\index(%arg1), \XMM2
+	aesenc	   16*\index(%arg1), \XMM3
+	aesenc	   16*\index(%arg1), \XMM4
+.endr
+	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7  # TMP5 = HashKey^3<<1 (mod poly)
+	movdqa	   \TMP5, HashKey_4(%rsp)
+	pshufd	   $78, \TMP5, \TMP1
+	pxor	   \TMP5, \TMP1
+	movdqa	   \TMP1, HashKey_4_k(%rsp)
+	aesenclast 160(%arg1), \XMM1
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM1
+.if \operation == dec
+	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM1
+.endif
+	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM2
+.if \operation == dec
+	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM2
+.endif
+	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM3
+.if \operation == dec
+	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM3
+.endif
+	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
+	pxor	   \TMP1, \XMM4
+.if \operation == dec
+	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
+	movdqa     \TMP1, \XMM4
+.else
+	movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
+	movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
+	movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
+	movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.endif
+	add	   $64, %r11
+	pshufb	   SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
+	pxor	   \XMMDst, \XMM1         # combine GHASHed value with the corresponding ciphertext
+	pshufb	   SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
+	pshufb	   SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
+_initial_blocks_done\num_initial_blocks\operation:
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+	movdqa	  \XMM1, \XMM5
+	movdqa	  \XMM2, \XMM6
+	movdqa	  \XMM3, \XMM7
+	movdqa	  \XMM4, \XMM8
+
+        # multiply TMP5 * HashKey using karatsuba
+
+	movdqa	  \XMM5, \TMP4
+	pshufd	  $78, \XMM5, \TMP6
+	pxor	  \XMM5, \TMP6
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
+	movdqa    \XMM0, \XMM1
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM2
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM3
+	paddd     ONE(%rip), \XMM0		# INCR CNT
+	movdqa    \XMM0, \XMM4
+	pshufb	  SHUF_MASK(%rip), \XMM1	# perform a 16 byte swap
+	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
+	pshufb	  SHUF_MASK(%rip), \XMM2	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3	# perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
+	pxor	  (%arg1), \XMM1
+	pxor	  (%arg1), \XMM2
+	pxor	  (%arg1), \XMM3
+	pxor	  (%arg1), \XMM4
+	movdqa	  HashKey_4_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
+	aesenc	  16(%arg1), \XMM1              # Round 1
+	aesenc	  16(%arg1), \XMM2
+	aesenc	  16(%arg1), \XMM3
+	aesenc	  16(%arg1), \XMM4
+	aesenc	  32(%arg1), \XMM1              # Round 2
+	aesenc	  32(%arg1), \XMM2
+	aesenc	  32(%arg1), \XMM3
+	aesenc	  32(%arg1), \XMM4
+	movdqa	  \XMM6, \TMP1
+	pshufd	  $78, \XMM6, \TMP2
+	pxor	  \XMM6, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
+	aesenc    48(%arg1), \XMM1              # Round 3
+	aesenc    48(%arg1), \XMM2
+	aesenc    48(%arg1), \XMM3
+	aesenc    48(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM6           # XMM6 = a0*b0
+	aesenc	  64(%arg1), \XMM1              # Round 4
+	aesenc	  64(%arg1), \XMM2
+	aesenc	  64(%arg1), \XMM3
+	aesenc	  64(%arg1), \XMM4
+	movdqa	  HashKey_3_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  80(%arg1), \XMM1              # Round 5
+	aesenc	  80(%arg1), \XMM2
+	aesenc	  80(%arg1), \XMM3
+	aesenc	  80(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM6, \XMM5
+	pxor	  \TMP2, \TMP6
+	movdqa	  \XMM7, \TMP1
+	pshufd	  $78, \XMM7, \TMP2
+	pxor	  \XMM7, \TMP2
+	movdqa	  HashKey_2(%rsp ), \TMP5
+
+        # Multiply TMP5 * HashKey using karatsuba
+
+	pclmulqdq $0x11, \TMP5, \TMP1           # TMP1 = a1*b1
+	aesenc	  96(%arg1), \XMM1              # Round 6
+	aesenc	  96(%arg1), \XMM2
+	aesenc	  96(%arg1), \XMM3
+	aesenc	  96(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM7           # XMM7 = a0*b0
+	aesenc	  112(%arg1), \XMM1             # Round 7
+	aesenc	  112(%arg1), \XMM2
+	aesenc	  112(%arg1), \XMM3
+	aesenc	  112(%arg1), \XMM4
+	movdqa	  HashKey_2_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
+	aesenc	  128(%arg1), \XMM1             # Round 8
+	aesenc	  128(%arg1), \XMM2
+	aesenc	  128(%arg1), \XMM3
+	aesenc	  128(%arg1), \XMM4
+	pxor	  \TMP1, \TMP4			# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+	pxor	  \XMM7, \XMM5
+	pxor	  \TMP2, \TMP6
+
+        # Multiply XMM8 * HashKey
+        # XMM8 and TMP5 hold the values for the two operands
+
+	movdqa	  \XMM8, \TMP1
+	pshufd	  $78, \XMM8, \TMP2
+	pxor	  \XMM8, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1          # TMP1 = a1*b1
+	aesenc	  144(%arg1), \XMM1            # Round 9
+	aesenc	  144(%arg1), \XMM2
+	aesenc	  144(%arg1), \XMM3
+	aesenc	  144(%arg1), \XMM4
+	pclmulqdq $0x00, \TMP5, \XMM8          # XMM8 = a0*b0
+	aesenclast 160(%arg1), \XMM1           # Round 10
+	aesenclast 160(%arg1), \XMM2
+	aesenclast 160(%arg1), \XMM3
+	aesenclast 160(%arg1), \XMM4
+	movdqa    HashKey_k(%rsp), \TMP5
+	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
+	movdqu	  (%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+	movdqa    \TMP3, \XMM1
+.endif
+	movdqu	  16(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM2
+.endif
+	movdqu	  32(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM3
+.endif
+	movdqu	  48(%arg3,%r11,1), \TMP3
+	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
+.if \operation == dec
+	movdqu	  \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+	movdqa    \TMP3, \XMM4
+.else
+        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+.endif
+	pshufb	  SHUF_MASK(%rip), \XMM1       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM2       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM3       # perform a 16 byte swap
+	pshufb	  SHUF_MASK(%rip), \XMM4       # perform a 16 byte sway
+
+	pxor	  \TMP4, \TMP1
+	pxor	  \XMM8, \XMM5
+	pxor	  \TMP6, \TMP2
+	pxor	  \TMP1, \TMP2
+	pxor	  \XMM5, \TMP2
+	movdqa	  \TMP2, \TMP3
+	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
+	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
+	pxor	  \TMP3, \XMM5
+	pxor	  \TMP2, \TMP1		       # accumulate the results in TMP1:XMM5
+
+        # first phase of reduction
+
+	movdqa    \XMM5, \TMP2
+	movdqa    \XMM5, \TMP3
+	movdqa    \XMM5, \TMP4		       # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+	pslld     $31, \TMP2                   # packed right shift << 31
+	pslld     $30, \TMP3                   # packed right shift << 30
+	pslld     $25, \TMP4                   # packed right shift << 25
+	pxor      \TMP3, \TMP2	               # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP5
+	psrldq    $4, \TMP5                    # right shift T5 1 DW
+	pslldq    $12, \TMP2                   # left shift T2 3 DWs
+	pxor      \TMP2, \XMM5
+
+        # second phase of reduction
+
+	movdqa    \XMM5,\TMP2		       # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+	movdqa    \XMM5,\TMP3
+	movdqa    \XMM5,\TMP4
+	psrld     $1, \TMP2                    # packed left shift >>1
+	psrld     $2, \TMP3                    # packed left shift >>2
+	psrld     $7, \TMP4                    # packed left shift >>7
+	pxor      \TMP3,\TMP2		       # xor the shifted versions
+	pxor      \TMP4,\TMP2
+	pxor      \TMP5, \TMP2
+	pxor      \TMP2, \XMM5
+	pxor      \TMP1, \XMM5                 # result is in TMP1
+
+	pxor	  \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+        # Multiply TMP6 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM1, \TMP6
+	pshufd	  $78, \XMM1, \TMP2
+	pxor	  \XMM1, \TMP2
+	movdqa	  HashKey_4(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
+	movdqa	  HashKey_4_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	movdqa	  \XMM1, \XMMDst
+	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM2, \TMP1
+	pshufd	  $78, \XMM2, \TMP2
+	pxor	  \XMM2, \TMP2
+	movdqa	  HashKey_3(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
+	movdqa	  HashKey_3_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM2, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM3, \TMP1
+	pshufd	  $78, \XMM3, \TMP2
+	pxor	  \XMM3, \TMP2
+	movdqa	  HashKey_2(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
+	movdqa	  HashKey_2_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM3, \XMMDst
+	pxor	  \TMP2, \XMM1              # results accumulated in TMP6, XMMDst, XMM1
+
+        # Multiply TMP1 * HashKey (using Karatsuba)
+
+	movdqa	  \XMM4, \TMP1
+	pshufd	  $78, \XMM4, \TMP2
+	pxor	  \XMM4, \TMP2
+	movdqa	  HashKey(%rsp), \TMP5
+	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
+	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
+	movdqa	  HashKey_k(%rsp), \TMP4
+	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
+	pxor	  \TMP1, \TMP6
+	pxor	  \XMM4, \XMMDst
+	pxor	  \XMM1, \TMP2
+	pxor	  \TMP6, \TMP2
+	pxor	  \XMMDst, \TMP2            # middle section of the temp results combined as in karatsuba algorithm
+	movdqa	  \TMP2, \TMP4
+	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
+	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
+	pxor	  \TMP4, \XMMDst
+	pxor	  \TMP2, \TMP6              # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+
+        # first phase of the reduction
+
+	movdqa    \XMMDst, \TMP2
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4            # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+	pslld     $31, \TMP2                # packed right shifting << 31
+	pslld     $30, \TMP3                # packed right shifting << 30
+	pslld     $25, \TMP4                # packed right shifting << 25
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	movdqa    \TMP2, \TMP7
+	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
+	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
+	pxor      \TMP2, \XMMDst
+
+        # second phase of the reduction
+
+	movdqa    \XMMDst, \TMP2            # make 3 copies of XMMDst for doing 3 shift operations
+	movdqa    \XMMDst, \TMP3
+	movdqa    \XMMDst, \TMP4
+	psrld     $1, \TMP2                 # packed left shift >> 1
+	psrld     $2, \TMP3                 # packed left shift >> 2
+	psrld     $7, \TMP4                 # packed left shift >> 7
+	pxor      \TMP3, \TMP2              # xor the shifted versions
+	pxor      \TMP4, \TMP2
+	pxor      \TMP7, \TMP2
+	pxor      \TMP2, \XMMDst
+	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+
+	pxor	(%arg1), \XMM0
+	aesenc	16(%arg1), \XMM0
+	aesenc	32(%arg1), \XMM0
+	aesenc	48(%arg1), \XMM0
+	aesenc	64(%arg1), \XMM0
+	aesenc	80(%arg1), \XMM0
+	aesenc	96(%arg1), \XMM0
+	aesenc	112(%arg1), \XMM0
+	aesenc	128(%arg1), \XMM0
+	aesenc	144(%arg1), \XMM0
+	aesenclast 160(%arg1), \XMM0
+.endm
+
+
+/**********************************************************************************************
+*void aesni_gcm_dec(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                   u8 *out,                     // Plaintext output. Encrypt in-place is allowed.
+*                   const u8 *in,                // Ciphertext input
+*                   u64 plaintext_len,           // Length of data in bytes for decryption.
+*                   u8 *iv,                      // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                   u8 *hash_subkey,             // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                   const u8 *aad,               // Additional Authentication Data (AAD)
+*                   u64 aad_len,                 // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                   u8  *auth_tag,               // Authenticated Tag output. The driver will compare this to the
+*                                                // given authentication tag and only return the plaintext if they match.
+*                   u64 auth_tag_len);           // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*******************************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp                        # align rsp to 64 bytes
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13			  # %xmm13 = HashKey
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # Precompute HashKey<<1 (mod poly) from the hash key (this is required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # Reduction
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13			# %xmm13 holds the HashKey<<1 (mod poly)
+
+
+        # Decrypt first few blocks
+
+	movdqa	%xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
+	mov	%arg4, %r13                     # save the number of bytes of plaintext/ciphertext
+	and	$-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
+	mov	%r13, %r12
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_decrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_decrypt
+	je	_initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+	sub	$48, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+	sub	$32, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+	sub	$16, %r13
+	jmp	_initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+	cmp	$0, %r13
+	je	_zero_cipher_left_decrypt
+	sub	$64, %r13
+	je	_four_cipher_left_decrypt
+_decrypt_by_4:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_decrypt_by_4
+_four_cipher_left_decrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+	mov	%arg4, %r13
+	and	$15, %r13				# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_decrypt
+
+        # Handle the last <16 byte block seperately
+
+	paddd	ONE(%rip), %xmm0                        # increment CNT to get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK 	%xmm0			# E(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1			# recieve the last <16 byte block
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12				# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+							# (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2				# get the appropriate shuffle mask
+	pshufb	%xmm2, %xmm1				# right shift 16-%r13 butes
+	movdqa  %xmm1, %xmm2
+	pxor	%xmm1, %xmm0				# Ciphertext XOR E(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1           # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+	pand	%xmm1, %xmm0				# mask out top 16-%r13 bytes of %xmm0
+	pand    %xmm1, %xmm2
+	pshufb	SHUF_MASK(%rip),%xmm2
+	pxor	%xmm2, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+
+        # output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_decrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_decrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+	mov	arg8, %r12		  # %r13 = aadLen (number of bytes)
+	shl	$3, %r12		  # convert into number of bits
+	movd	%r12d, %xmm15		  # len(A) in %xmm15
+	shl	$3, %arg4		  # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
+	pshufb	SHUF_MASK(%rip), %xmm8
+	mov	%arg5, %rax		  # %rax = *Y0
+	movdqu	(%rax), %xmm0		  # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 	  # E(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_decrypt:
+	mov	arg9, %r10                # %r10 = authTag
+	mov	arg10, %r11               # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_decrypt
+	cmp	$12, %r11
+	je	_T_12_decrypt
+_T_8_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_decrypt
+_T_12_decrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_decrypt
+_T_16_decrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_decrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+
+/***************************************************************************************************
+* void aesni_gcm_enc(void *aes_ctx,               // AES Key schedule. Starts on a 16 byte boundary.
+*                    u8 *out,                    // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,               // Plaintext input
+*                    u64 plaintext_len,          // Length of data in bytes for encryption.
+*                    u8 *iv,                     // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                                // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                                // concatenated with 0x00000001. 16-byte aligned pointer.
+*                    u8 *hash_subkey,            // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                    const u8 *aad,              // Additional Authentication Data (AAD)
+*                    u64 aad_len,                // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+*                    u8 *auth_tag,               // Authenticated Tag output.
+*                    u64 auth_tag_len);          // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+*
+*
+*
+* Assumptions:
+*
+* keys:
+*       keys are pre-expanded and aligned to 16 bytes. we are using the first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                             Salt  (From the SA)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     Initialization Vector                     |
+*       |         (This is the sequence number from IPSec header)       |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x1                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+*       AAD padded to 128 bits with 0
+*       for example, assume AAD is a u32 vector
+*
+*       if AAD is 8 bytes:
+*       AAD[3] = {A0, A1};
+*       padded AAD in xmm register = {A1 A0 0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A1)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                     32-bit Sequence Number (A0)               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 32-bit Sequence Number
+*
+*       if AAD is 12 bytes:
+*       AAD[3] = {A0, A1, A2};
+*       padded AAD in xmm register = {A2 A1 A0 0}
+*
+*       0                   1                   2                   3
+*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                               SPI (A2)                        |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                 64-bit Extended Sequence Number {A1,A0}       |
+*       |                                                               |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*       |                              0x0                              |
+*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*                                       AAD Format with 64-bit Extended Sequence Number
+*
+*
+* aadLen:
+*       from the definition of the spec, aadLen can only be 8 or 12 bytes. The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes. For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+**************************************************************************************************************************/
+ENTRY(aesni_gcm_enc)
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	%rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+	sub	$VARIABLE_OFFSET, %rsp
+	and	$~63, %rsp
+	mov	%arg6, %r12
+	movdqu	(%r12), %xmm13
+	pshufb	SHUF_MASK(%rip), %xmm13
+
+        # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+	movdqa	%xmm13, %xmm2
+	psllq	$1, %xmm13
+	psrlq	$63, %xmm2
+	movdqa	%xmm2, %xmm1
+	pslldq	$8, %xmm2
+	psrldq	$8, %xmm1
+	por	%xmm2, %xmm13
+
+        # reduce HashKey<<1
+
+	pshufd	$0x24, %xmm1, %xmm2
+	pcmpeqd TWOONE(%rip), %xmm2
+	pand	POLY(%rip), %xmm2
+	pxor	%xmm2, %xmm13
+	movdqa	%xmm13, HashKey(%rsp)
+	mov	%arg4, %r13               # %xmm13 holds HashKey<<1 (mod poly)
+	and	$-16, %r13
+	mov	%r13, %r12
+
+        # Encrypt first few blocks
+
+	and	$(3<<4), %r12
+	jz	_initial_num_blocks_is_0_encrypt
+	cmp	$(2<<4), %r12
+	jb	_initial_num_blocks_is_1_encrypt
+	je	_initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+	INITIAL_BLOCKS	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+	sub	$48, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+	INITIAL_BLOCKS	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+	sub	$32, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+	INITIAL_BLOCKS	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+	sub	$16, %r13
+	jmp	_initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+	INITIAL_BLOCKS	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+        # Main loop - Encrypt remaining blocks
+
+	cmp	$0, %r13
+	je	_zero_cipher_left_encrypt
+	sub	$64, %r13
+	je	_four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+	GHASH_4_ENCRYPT_4_PARALLEL	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+	add	$64, %r11
+	sub	$64, %r13
+	jne	_encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+	mov	%arg4, %r13
+	and	$15, %r13			# %r13 = arg4 (mod 16)
+	je	_multiple_of_16_bytes_encrypt
+
+         # Handle the last <16 Byte block seperately
+
+	paddd	ONE(%rip), %xmm0                # INCR CNT to get Yn
+	pshufb	SHUF_MASK(%rip), %xmm0
+	ENCRYPT_SINGLE_BLOCK	%xmm0 		# Encrypt(K, Yn)
+	sub	$16, %r11
+	add	%r13, %r11
+	movdqu	(%arg3,%r11,1), %xmm1		# recieve the last <16 byte blocks
+	lea	SHIFT_MASK+16(%rip), %r12
+	sub	%r13, %r12			# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+                                                # (%r13 is the number of bytes in plaintext mod 16)
+	movdqu	(%r12), %xmm2			# get the appropriate shuffle mask
+	pshufb	%xmm2, %xmm1			# shift right 16-r13 byte
+	pxor	%xmm1, %xmm0			# Plaintext XOR Encrypt(K, Yn)
+	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
+
+	pshufb	SHUF_MASK(%rip),%xmm0
+	pxor	%xmm0, %xmm8
+	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# GHASH computation for the last <16 byte block
+	sub	%r13, %r11
+	add	$16, %r11
+	pshufb	SHUF_MASK(%rip), %xmm0          # shuffle xmm0 back to output as ciphertext
+
+        # Output %r13 bytes
+
+	movq	%xmm0, %rax
+	cmp	$8, %r13
+	jle	_less_than_8_bytes_left_encrypt
+	mov	%rax, (%arg2 , %r11, 1)
+	add	$8, %r11
+	psrldq	$8, %xmm0
+	movq	%xmm0, %rax
+	sub	$8, %r13
+_less_than_8_bytes_left_encrypt:
+	mov	%al,  (%arg2, %r11, 1)
+	add	$1, %r11
+	shr	$8, %rax
+	sub	$1, %r13
+	jne	_less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+	mov	arg8, %r12	               # %r12 = addLen (number of bytes)
+	shl	$3, %r12
+	movd	%r12d, %xmm15	               # len(A) in %xmm15
+	shl	$3, %arg4	               # len(C) in bits (*128)
+	movq	%arg4, %xmm1
+	pslldq	$8, %xmm15	               # %xmm15 = len(A)||0x0000000000000000
+	pxor	%xmm1, %xmm15	               # %xmm15 = len(A)||len(C)
+	pxor	%xmm15, %xmm8
+	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
+
+	pshufb	SHUF_MASK(%rip), %xmm8         # perform a 16 byte swap
+	mov	%arg5, %rax		       # %rax  = *Y0
+	movdqu	(%rax), %xmm0		       # %xmm0 = Y0
+	ENCRYPT_SINGLE_BLOCK	%xmm0          # Encrypt(K, Y0)
+	pxor	%xmm8, %xmm0
+_return_T_encrypt:
+	mov	arg9, %r10                     # %r10 = authTag
+	mov	arg10, %r11                    # %r11 = auth_tag_len
+	cmp	$16, %r11
+	je	_T_16_encrypt
+	cmp	$12, %r11
+	je	_T_12_encrypt
+_T_8_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	jmp	_return_T_done_encrypt
+_T_12_encrypt:
+	movq	%xmm0, %rax
+	mov	%rax, (%r10)
+	psrldq	$8, %xmm0
+	movd	%xmm0, %eax
+	mov	%eax, 8(%r10)
+	jmp	_return_T_done_encrypt
+_T_16_encrypt:
+	movdqu	%xmm0, (%r10)
+_return_T_done_encrypt:
+	mov	%r14, %rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	ret
+
+#include <asm/inst.h>
+
 _key_expansion_128:
 _key_expansion_256a:
 	pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc..f0b0a6f 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
  *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
 #include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
 
 #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
 #define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
 
-#define AESNI_ALIGN	16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+	u8 hash_subkey[16];
+	struct crypto_aes_ctx aes_key_expanded;
+	u8 nonce[4];
+	struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+	int err;
+	struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+	u8 iv[16];
+	struct aesni_gcm_set_hash_subkey_result result;
+	struct scatterlist sg;
+};
+
+#define AESNI_ALIGN	(16)
 #define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
 
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 			     unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ *          is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ *          Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+			const u8 *in, unsigned long plaintext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ *         concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ *         Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+			const u8 *in, unsigned long ciphertext_len, u8 *iv,
+			u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+			u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+	return
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)
+		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
 	unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
 };
 #endif
 
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_aead.reqsize = sizeof(struct aead_request)
+		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx =
+		(struct aesni_rfc4106_gcm_ctx *)
+		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+	if (!IS_ERR(ctx->cryptd_tfm))
+		cryptd_free_aead(ctx->cryptd_tfm);
+	return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+	struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+	result->err = err;
+	complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+	struct crypto_ablkcipher *ctr_tfm;
+	struct ablkcipher_request *req;
+	int ret = -EINVAL;
+	struct aesni_hash_subkey_req_data *req_data;
+
+	ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+	if (IS_ERR(ctr_tfm))
+		return PTR_ERR(ctr_tfm);
+
+	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+	if (ret) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return ret;
+	}
+
+	req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+	if (!req) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -EINVAL;
+	}
+
+	req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+	if (!req_data) {
+		crypto_free_ablkcipher(ctr_tfm);
+		return -ENOMEM;
+	}
+	memset(req_data->iv, 0, sizeof(req_data->iv));
+
+	/* Clear the data in the hash sub key container to zero.*/
+	/* We want to cipher all zeros to create the hash sub key. */
+	memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+	init_completion(&req_data->result.completion);
+	sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+	ablkcipher_request_set_tfm(req, ctr_tfm);
+	ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+					CRYPTO_TFM_REQ_MAY_BACKLOG,
+					rfc4106_set_hash_subkey_done,
+					&req_data->result);
+
+	ablkcipher_request_set_crypt(req, &req_data->sg,
+		&req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+	ret = crypto_ablkcipher_encrypt(req);
+	if (ret == -EINPROGRESS || ret == -EBUSY) {
+		ret = wait_for_completion_interruptible
+			(&req_data->result.completion);
+		if (!ret)
+			ret = req_data->result.err;
+	}
+	ablkcipher_request_free(req);
+	kfree(req_data);
+	crypto_free_ablkcipher(ctr_tfm);
+	return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+						   unsigned int key_len)
+{
+	int ret = 0;
+	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	u8 *new_key_mem = NULL;
+
+	if (key_len < 4) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	/*Account for 4 byte nonce at the end.*/
+	key_len -= 4;
+	if (key_len != AES_KEYSIZE_128) {
+		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+		return -EINVAL;
+
+	if ((unsigned long)key % AESNI_ALIGN) {
+		/*key is not aligned: use an auxuliar aligned pointer*/
+		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+		if (!new_key_mem)
+			return -ENOMEM;
+
+		new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+		memcpy(new_key_mem, key, key_len);
+		key = new_key_mem;
+	}
+
+	if (!irq_fpu_usable())
+		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+		key, key_len);
+	else {
+		kernel_fpu_begin();
+		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+		kernel_fpu_end();
+	}
+	/*This must be on a 16 byte boundary!*/
+	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+		ret = -EINVAL;
+		goto exit;
+	}
+	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+	kfree(new_key_mem);
+	return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
+{
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+	crypto_aead_crt(parent)->authsize = authsize;
+	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.encrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		return crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = cryptd_child->base.crt_aead.decrypt(req);
+		kernel_fpu_end();
+		return ret;
+	}
+}
+
+static struct crypto_alg rfc4106_alg = {
+	.cra_name = "rfc4106(gcm(aes))",
+	.cra_driver_name = "rfc4106-gcm-aesni",
+	.cra_priority = 400,
+	.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+	.cra_blocksize = 1,
+	.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask = 0,
+	.cra_type = &crypto_nivaead_type,
+	.cra_module = THIS_MODULE,
+	.cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+	.cra_init = rfc4106_init,
+	.cra_exit = rfc4106_exit,
+	.cra_u = {
+		.aead = {
+			.setkey = rfc4106_set_key,
+			.setauthsize = rfc4106_set_authsize,
+			.encrypt = rfc4106_encrypt,
+			.decrypt = rfc4106_decrypt,
+			.geniv = "seqiv",
+			.ivsize = 8,
+			.maxauthsize = 16,
+		},
+	},
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	__be32 counter = cpu_to_be32(1);
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_tab[16+AESNI_ALIGN];
+	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length equal */
+	/* to 8 or 12 bytes */
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+		return -EINVAL;
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+			GFP_ATOMIC);
+		if (unlikely(!src))
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+					req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+		+ ((unsigned long)req->cryptlen), auth_tag_len);
+
+	/* The authTag (aka the Integrity Check Value) needs to be written
+	 * back to the packet. */
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0,
+			req->cryptlen + auth_tag_len, 1);
+		kfree(src);
+	}
+	return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+	u8 one_entry_in_sg = 0;
+	u8 *src, *dst, *assoc;
+	unsigned long tempCipherLen = 0;
+	__be32 counter = cpu_to_be32(1);
+	int retval = 0;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	void *aes_ctx = &(ctx->aes_key_expanded);
+	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+	u8 iv_and_authTag[32+AESNI_ALIGN];
+	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+	u8 *authTag = iv + 16;
+	struct scatter_walk src_sg_walk;
+	struct scatter_walk assoc_sg_walk;
+	struct scatter_walk dst_sg_walk;
+	unsigned int i;
+
+	if (unlikely((req->cryptlen < auth_tag_len) ||
+		(req->assoclen != 8 && req->assoclen != 12)))
+		return -EINVAL;
+	/* Assuming we are supporting rfc4106 64-bit extended */
+	/* sequence numbers We need to have the AAD length */
+	/* equal to 8 or 12 bytes */
+
+	tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+	/* IV below built */
+	for (i = 0; i < 4; i++)
+		*(iv+i) = ctx->nonce[i];
+	for (i = 0; i < 8; i++)
+		*(iv+4+i) = req->iv[i];
+	*((__be32 *)(iv+12)) = counter;
+
+	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+		one_entry_in_sg = 1;
+		scatterwalk_start(&src_sg_walk, req->src);
+		scatterwalk_start(&assoc_sg_walk, req->assoc);
+		src = scatterwalk_map(&src_sg_walk, 0);
+		assoc = scatterwalk_map(&assoc_sg_walk, 0);
+		dst = src;
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_start(&dst_sg_walk, req->dst);
+			dst = scatterwalk_map(&dst_sg_walk, 0);
+		}
+
+	} else {
+		/* Allocate memory for src, dst, assoc */
+		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!src)
+			return -ENOMEM;
+		assoc = (src + req->cryptlen + auth_tag_len);
+		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+		scatterwalk_map_and_copy(assoc, req->assoc, 0,
+			req->assoclen, 0);
+		dst = src;
+	}
+
+	aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+		authTag, auth_tag_len);
+
+	/* Compare generated tag with passed in tag. */
+	retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+		-EBADMSG : 0;
+
+	if (one_entry_in_sg) {
+		if (unlikely(req->src != req->dst)) {
+			scatterwalk_unmap(dst, 0);
+			scatterwalk_done(&dst_sg_walk, 0, 0);
+		}
+		scatterwalk_unmap(src, 0);
+		scatterwalk_unmap(assoc, 0);
+		scatterwalk_done(&src_sg_walk, 0, 0);
+		scatterwalk_done(&assoc_sg_walk, 0, 0);
+	} else {
+		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		kfree(src);
+	}
+	return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+	.cra_name		= "__gcm-aes-aesni",
+	.cra_driver_name	= "__driver-gcm-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+	.cra_u = {
+		.aead = {
+			.encrypt	= __driver_rfc4106_encrypt,
+			.decrypt	= __driver_rfc4106_decrypt,
+		},
+	},
+};
+
 static int __init aesni_init(void)
 {
 	int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
 		printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
+
 	if ((err = crypto_register_alg(&aesni_alg)))
 		goto aes_err;
 	if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
 	if ((err = crypto_register_alg(&ablk_xts_alg)))
 		goto ablk_xts_err;
 #endif
-
+	err = crypto_register_alg(&__rfc4106_alg);
+	if (err)
+		goto __aead_gcm_err;
+	err = crypto_register_alg(&rfc4106_alg);
+	if (err)
+		goto aead_gcm_err;
 	return err;
 
+aead_gcm_err:
+	crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
 #ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 #endif
 #ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_alg(&__rfc4106_alg);
+	crypto_unregister_alg(&rfc4106_alg);
 #ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 #endif
-- 
1.5.3.rc5

--------------------------------------------------------------
Intel Shannon Limited
Registered in Ireland
Registered Office: Collinstown Industrial Park, Leixlip, County Kildare
Registered Number: 308263
Business address: Dromore House, East Park, Shannon, Co. Clare

This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). Any review or distribution by others is strictly prohibited. If you are not the intended recipient, please contact the sender and delete all copies.



^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2010-12-14  2:21 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-10 17:36 [PATCH 2/3] RFC4106 AES-GCM Driver Using Intel New Instructions Hoban, Adrian
2010-09-20  8:07 ` Herbert Xu
     [not found]   ` <AANLkTim_gVO2tHUw+vFC9vXRh8Dtv7N38HTCMUZ7LmNg@mail.gmail.com>
2010-09-20 11:21     ` Struk, Tadeusz
2010-09-23  7:58       ` Herbert Xu
2010-09-24 10:07         ` Paoloni, Gabriele
2010-10-11 15:48 tadeusz.struk
2010-10-12  8:31 tadeusz.struk
2010-10-19 12:49 ` Herbert Xu
     [not found]   ` <44813D8942D35947805CBEEA94195BB001B135B69E@irsmsx505.ger.corp.intel.com>
     [not found]     ` <24483BA4C9F69C43A7379639D35543D7C2F99F27@irsmsx502.ger.corp.intel.com>
2010-10-20 10:49       ` Hoban, Adrian
2010-10-20 13:02         ` Herbert Xu
2010-10-28 15:19 tadeusz.struk
2010-11-04 19:04 ` Herbert Xu
2010-11-18  1:23   ` Andrew Morton
2010-11-18  9:26     ` Herbert Xu
2010-11-18  9:38       ` Struk, Tadeusz
2010-12-03  0:14         ` Andrew Morton
2010-12-03  1:30           ` Herbert Xu
2010-12-03 11:05             ` Struk, Tadeusz
2010-12-14  2:21             ` Herbert Xu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).