All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode
@ 2018-10-09 12:55 Daniel P. Berrangé
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
                   ` (6 more replies)
  0 siblings, 7 replies; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

The XTS cipher mode is significantly slower than CBC mode. This series
approximately doubles the XTS performance which will improve the I/O
rate for LUKS disks.

Daniel P. Berrangé (6):
  crypto: expand algorithm coverage for cipher benchmark
  crypto: remove code duplication in tweak encrypt/decrypt
  crypto: introduce a xts_uint128 data type
  crypto: convert xts_tweak_encdec to use xts_uint128 type
  crypto: convert xts_mult_x to use xts_uint128 type
  crypto: annotate xts_tweak_encdec as inlineable

 crypto/xts.c                    | 147 ++++++++++++++-----------------
 tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----
 2 files changed, 191 insertions(+), 105 deletions(-)

-- 
2.17.1

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 14:04   ` Marc-André Lureau
  2018-10-10 11:45   ` Alberto Garcia
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt Daniel P. Berrangé
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

Add testing coverage for AES with XTS, ECB and CTR modes

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----
 1 file changed, 126 insertions(+), 23 deletions(-)

diff --git a/tests/benchmark-crypto-cipher.c b/tests/benchmark-crypto-cipher.c
index f5a0d0bc32..a8325a9510 100644
--- a/tests/benchmark-crypto-cipher.c
+++ b/tests/benchmark-crypto-cipher.c
@@ -15,17 +15,27 @@
 #include "crypto/init.h"
 #include "crypto/cipher.h"
 
-static void test_cipher_speed(const void *opaque)
+static void test_cipher_speed(size_t chunk_size,
+                              QCryptoCipherMode mode,
+                              QCryptoCipherAlgorithm alg)
 {
     QCryptoCipher *cipher;
     Error *err = NULL;
     double total = 0.0;
-    size_t chunk_size = (size_t)opaque;
     uint8_t *key = NULL, *iv = NULL;
     uint8_t *plaintext = NULL, *ciphertext = NULL;
-    size_t nkey = qcrypto_cipher_get_key_len(QCRYPTO_CIPHER_ALG_AES_128);
-    size_t niv = qcrypto_cipher_get_iv_len(QCRYPTO_CIPHER_ALG_AES_128,
-                                           QCRYPTO_CIPHER_MODE_CBC);
+    size_t nkey;
+    size_t niv;
+
+    if (!qcrypto_cipher_supports(alg, mode)) {
+        return;
+    }
+
+    nkey = qcrypto_cipher_get_key_len(alg);
+    niv = qcrypto_cipher_get_iv_len(alg, mode);
+    if (mode == QCRYPTO_CIPHER_MODE_XTS) {
+        nkey *= 2;
+    }
 
     key = g_new0(uint8_t, nkey);
     memset(key, g_test_rand_int(), nkey);
@@ -38,14 +48,14 @@ static void test_cipher_speed(const void *opaque)
     plaintext = g_new0(uint8_t, chunk_size);
     memset(plaintext, g_test_rand_int(), chunk_size);
 
-    cipher = qcrypto_cipher_new(QCRYPTO_CIPHER_ALG_AES_128,
-                                QCRYPTO_CIPHER_MODE_CBC,
+    cipher = qcrypto_cipher_new(alg, mode,
                                 key, nkey, &err);
     g_assert(cipher != NULL);
 
-    g_assert(qcrypto_cipher_setiv(cipher,
-                                  iv, niv,
-                                  &err) == 0);
+    if (mode != QCRYPTO_CIPHER_MODE_ECB)
+        g_assert(qcrypto_cipher_setiv(cipher,
+                                      iv, niv,
+                                      &err) == 0);
 
     g_test_timer_start();
     do {
@@ -55,13 +65,26 @@ static void test_cipher_speed(const void *opaque)
                                         chunk_size,
                                         &err) == 0);
         total += chunk_size;
-    } while (g_test_timer_elapsed() < 5.0);
+    } while (g_test_timer_elapsed() < 1.0);
 
     total /= MiB;
-    g_print("cbc(aes128): ");
-    g_print("Testing chunk_size %zu bytes ", chunk_size);
-    g_print("done: %.2f MB in %.2f secs: ", total, g_test_timer_last());
-    g_print("%.2f MB/sec\n", total / g_test_timer_last());
+    g_print("Enc chunk %zu bytes ", chunk_size);
+    g_print("%.2f MB/sec ", total / g_test_timer_last());
+
+    total = 0.0;
+    g_test_timer_start();
+    do {
+        g_assert(qcrypto_cipher_decrypt(cipher,
+                                        plaintext,
+                                        ciphertext,
+                                        chunk_size,
+                                        &err) == 0);
+        total += chunk_size;
+    } while (g_test_timer_elapsed() < 1.0);
+
+    total /= MiB;
+    g_print("Dec chunk %zu bytes ", chunk_size);
+    g_print("%.2f MB/sec ", total / g_test_timer_last());
 
     qcrypto_cipher_free(cipher);
     g_free(plaintext);
@@ -70,19 +93,99 @@ static void test_cipher_speed(const void *opaque)
     g_free(key);
 }
 
-int main(int argc, char **argv)
+
+static void test_cipher_speed_ecb_aes_128(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_ECB,
+                      QCRYPTO_CIPHER_ALG_AES_128);
+}
+
+static void test_cipher_speed_ecb_aes_256(const void *opaque)
 {
-    size_t i;
-    char name[64];
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_ECB,
+                      QCRYPTO_CIPHER_ALG_AES_256);
+}
+
+static void test_cipher_speed_cbc_aes_128(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_CBC,
+                      QCRYPTO_CIPHER_ALG_AES_128);
+}
 
+static void test_cipher_speed_cbc_aes_256(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_CBC,
+                      QCRYPTO_CIPHER_ALG_AES_256);
+}
+
+static void test_cipher_speed_ctr_aes_128(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_CTR,
+                      QCRYPTO_CIPHER_ALG_AES_128);
+}
+
+static void test_cipher_speed_ctr_aes_256(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_CTR,
+                      QCRYPTO_CIPHER_ALG_AES_256);
+}
+
+static void test_cipher_speed_xts_aes_128(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_XTS,
+                      QCRYPTO_CIPHER_ALG_AES_128);
+}
+
+static void test_cipher_speed_xts_aes_256(const void *opaque)
+{
+    size_t chunk_size = (size_t)opaque;
+    test_cipher_speed(chunk_size,
+                      QCRYPTO_CIPHER_MODE_XTS,
+                      QCRYPTO_CIPHER_ALG_AES_256);
+}
+
+
+int main(int argc, char **argv)
+{
     g_test_init(&argc, &argv, NULL);
     g_assert(qcrypto_init(NULL) == 0);
 
-    for (i = 512; i <= 64 * KiB; i *= 2) {
-        memset(name, 0 , sizeof(name));
-        snprintf(name, sizeof(name), "/crypto/cipher/speed-%zu", i);
-        g_test_add_data_func(name, (void *)i, test_cipher_speed);
-    }
+#define ADD_TEST(mode, cipher, keysize, chunk)                          \
+    g_test_add_data_func(                                               \
+        "/crypto/cipher/" #mode "-" #cipher "-" #keysize "/chunk-" #chunk, \
+        (void *)chunk,                                                  \
+        test_cipher_speed_ ## mode ## _ ## cipher ## _ ## keysize)
+
+#define ADD_TESTS(chunk)                        \
+    do {                                        \
+        ADD_TEST(ecb, aes, 128, chunk);         \
+        ADD_TEST(ecb, aes, 256, chunk);         \
+        ADD_TEST(cbc, aes, 128, chunk);         \
+        ADD_TEST(cbc, aes, 256, chunk);         \
+        ADD_TEST(ctr, aes, 128, chunk);         \
+        ADD_TEST(ctr, aes, 256, chunk);         \
+        ADD_TEST(xts, aes, 128, chunk);         \
+        ADD_TEST(xts, aes, 256, chunk); \
+    } while (0)
+
+    ADD_TESTS(512);
+    ADD_TESTS(4096);
+    ADD_TESTS(16384);
+    ADD_TESTS(65536);
 
     return g_test_run();
 }
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 13:43   ` Alberto Garcia
  2018-10-09 13:51   ` Marc-André Lureau
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type Daniel P. Berrangé
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

The tweak encrypt/decrypt functions are identical except for the
comments, so can be merged. Profiling data shows that the compiler is
in fact already merging the two merges in the object files.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 64 ++++++++++++----------------------------------------
 1 file changed, 15 insertions(+), 49 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index 95212341f6..3c1a92f01d 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -43,20 +43,20 @@ static void xts_mult_x(uint8_t *I)
 
 
 /**
- * xts_tweak_uncrypt:
+ * xts_tweak_encdec:
  * @param ctxt: the cipher context
  * @param func: the cipher function
- * @src: buffer providing the cipher text of XTS_BLOCK_SIZE bytes
- * @dst: buffer to output the plain text of XTS_BLOCK_SIZE bytes
+ * @src: buffer providing the input text of XTS_BLOCK_SIZE bytes
+ * @dst: buffer to output the output text of XTS_BLOCK_SIZE bytes
  * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
  *
- * Decrypt data with a tweak
+ * Encrypt/decrypt data with a tweak
  */
-static void xts_tweak_decrypt(const void *ctx,
-                              xts_cipher_func *func,
-                              const uint8_t *src,
-                              uint8_t *dst,
-                              uint8_t *iv)
+static void xts_tweak_encdec(const void *ctx,
+                             xts_cipher_func *func,
+                             const uint8_t *src,
+                             uint8_t *dst,
+                             uint8_t *iv)
 {
     unsigned long x;
 
@@ -105,7 +105,7 @@ void xts_decrypt(const void *datactx,
     encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_decrypt(datactx, decfunc, src, dst, T);
+        xts_tweak_encdec(datactx, decfunc, src, dst, T);
 
         src += XTS_BLOCK_SIZE;
         dst += XTS_BLOCK_SIZE;
@@ -117,7 +117,7 @@ void xts_decrypt(const void *datactx,
         xts_mult_x(CC);
 
         /* PP = tweak decrypt block m-1 */
-        xts_tweak_decrypt(datactx, decfunc, src, PP, CC);
+        xts_tweak_encdec(datactx, decfunc, src, PP, CC);
 
         /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
         for (i = 0; i < mo; i++) {
@@ -129,7 +129,7 @@ void xts_decrypt(const void *datactx,
         }
 
         /* Pm-1 = Tweak uncrypt CC */
-        xts_tweak_decrypt(datactx, decfunc, CC, dst, T);
+        xts_tweak_encdec(datactx, decfunc, CC, dst, T);
     }
 
     /* Decrypt the iv back */
@@ -137,40 +137,6 @@ void xts_decrypt(const void *datactx,
 }
 
 
-/**
- * xts_tweak_crypt:
- * @param ctxt: the cipher context
- * @param func: the cipher function
- * @src: buffer providing the plain text of XTS_BLOCK_SIZE bytes
- * @dst: buffer to output the cipher text of XTS_BLOCK_SIZE bytes
- * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
- *
- * Encrypt data with a tweak
- */
-static void xts_tweak_encrypt(const void *ctx,
-                              xts_cipher_func *func,
-                              const uint8_t *src,
-                              uint8_t *dst,
-                              uint8_t *iv)
-{
-    unsigned long x;
-
-    /* tweak encrypt block i */
-    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
-        dst[x] = src[x] ^ iv[x];
-    }
-
-    func(ctx, XTS_BLOCK_SIZE, dst, dst);
-
-    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
-        dst[x] = dst[x] ^ iv[x];
-    }
-
-    /* LFSR the tweak */
-    xts_mult_x(iv);
-}
-
-
 void xts_encrypt(const void *datactx,
                  const void *tweakctx,
                  xts_cipher_func *encfunc,
@@ -200,7 +166,7 @@ void xts_encrypt(const void *datactx,
     encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encrypt(datactx, encfunc, src, dst, T);
+        xts_tweak_encdec(datactx, encfunc, src, dst, T);
 
         dst += XTS_BLOCK_SIZE;
         src += XTS_BLOCK_SIZE;
@@ -209,7 +175,7 @@ void xts_encrypt(const void *datactx,
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
         /* CC = tweak encrypt block m-1 */
-        xts_tweak_encrypt(datactx, encfunc, src, CC, T);
+        xts_tweak_encdec(datactx, encfunc, src, CC, T);
 
         /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
         for (i = 0; i < mo; i++) {
@@ -222,7 +188,7 @@ void xts_encrypt(const void *datactx,
         }
 
         /* Cm-1 = Tweak encrypt PP */
-        xts_tweak_encrypt(datactx, encfunc, PP, dst, T);
+        xts_tweak_encdec(datactx, encfunc, PP, dst, T);
     }
 
     /* Decrypt the iv back */
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 14:40   ` Alberto Garcia
  2018-10-09 14:50   ` Alberto Garcia
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type Daniel P. Berrangé
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

The new type is designed to allow use of 64-bit arithmetic instead
of operating 1-byte at a time. The following patches will use this to
improve performance.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index 3c1a92f01d..ded4365191 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,6 +26,11 @@
 #include "qemu/osdep.h"
 #include "crypto/xts.h"
 
+typedef struct {
+    uint64_t a;
+    uint64_t b;
+} xts_uint128;
+
 static void xts_mult_x(uint8_t *I)
 {
     int x;
@@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
                  uint8_t *dst,
                  const uint8_t *src)
 {
-    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    xts_uint128 PP, CC, T;
     unsigned long i, m, mo, lim;
 
     /* get number of blocks */
@@ -102,10 +107,10 @@ void xts_decrypt(const void *datactx,
     }
 
     /* encrypt the iv */
-    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+    encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, decfunc, src, dst, T);
+        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
 
         src += XTS_BLOCK_SIZE;
         dst += XTS_BLOCK_SIZE;
@@ -113,27 +118,27 @@ void xts_decrypt(const void *datactx,
 
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
-        memcpy(CC, T, XTS_BLOCK_SIZE);
-        xts_mult_x(CC);
+        memcpy(&CC, &T, XTS_BLOCK_SIZE);
+        xts_mult_x((uint8_t *)&CC);
 
         /* PP = tweak decrypt block m-1 */
-        xts_tweak_encdec(datactx, decfunc, src, PP, CC);
+        xts_tweak_encdec(datactx, decfunc, src, (uint8_t *)&PP, (uint8_t *)&CC);
 
         /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
         for (i = 0; i < mo; i++) {
-            CC[i] = src[XTS_BLOCK_SIZE + i];
-            dst[XTS_BLOCK_SIZE + i] = PP[i];
+            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
         }
         for (; i < XTS_BLOCK_SIZE; i++) {
-            CC[i] = PP[i];
+            ((uint8_t *)&CC)[i] = ((uint8_t *)&PP)[i];
         }
 
         /* Pm-1 = Tweak uncrypt CC */
-        xts_tweak_encdec(datactx, decfunc, CC, dst, T);
+        xts_tweak_encdec(datactx, decfunc, (uint8_t *)&CC, dst, (uint8_t *)&T);
     }
 
     /* Decrypt the iv back */
-    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, (uint8_t *)&T);
 }
 
 
@@ -146,7 +151,7 @@ void xts_encrypt(const void *datactx,
                  uint8_t *dst,
                  const uint8_t *src)
 {
-    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    xts_uint128 PP, CC, T;
     unsigned long i, m, mo, lim;
 
     /* get number of blocks */
@@ -163,10 +168,10 @@ void xts_encrypt(const void *datactx,
     }
 
     /* encrypt the iv */
-    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+    encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, encfunc, src, dst, T);
+        xts_tweak_encdec(datactx, encfunc, src, dst, (uint8_t *)&T);
 
         dst += XTS_BLOCK_SIZE;
         src += XTS_BLOCK_SIZE;
@@ -175,22 +180,22 @@ void xts_encrypt(const void *datactx,
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
         /* CC = tweak encrypt block m-1 */
-        xts_tweak_encdec(datactx, encfunc, src, CC, T);
+        xts_tweak_encdec(datactx, encfunc, src, (uint8_t *)&CC, (uint8_t *)&T);
 
         /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
         for (i = 0; i < mo; i++) {
-            PP[i] = src[XTS_BLOCK_SIZE + i];
-            dst[XTS_BLOCK_SIZE + i] = CC[i];
+            ((uint8_t *)&PP)[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&CC)[i];
         }
 
         for (; i < XTS_BLOCK_SIZE; i++) {
-            PP[i] = CC[i];
+            ((uint8_t *)&PP)[i] = ((uint8_t *)&CC)[i];
         }
 
         /* Cm-1 = Tweak encrypt PP */
-        xts_tweak_encdec(datactx, encfunc, PP, dst, T);
+        xts_tweak_encdec(datactx, encfunc, (uint8_t *)&PP, dst, (uint8_t *)&T);
     }
 
     /* Decrypt the iv back */
-    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, (uint8_t *)&T);
 }
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
                   ` (2 preceding siblings ...)
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 15:02   ` Alberto Garcia
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x " Daniel P. Berrangé
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

Using 64-bit arithmetic increases the performance for xts-aes-128
when built with gcrypt:

  Encrypt: 235 MB/s -> 320 MB/s
  Decrypt: 245 MB/s -> 325 MB/s

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index ded4365191..f109c8a3ee 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -31,6 +31,12 @@ typedef struct {
     uint64_t b;
 } xts_uint128;
 
+#define xts_uint128_xor(D, S1, S2)              \
+    do {                                        \
+        (D)->a = (S1)->a ^ (S2)->a;             \
+        (D)->b = (S1)->b ^ (S2)->b;             \
+    } while (0)
+
 static void xts_mult_x(uint8_t *I)
 {
     int x;
@@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
  */
 static void xts_tweak_encdec(const void *ctx,
                              xts_cipher_func *func,
-                             const uint8_t *src,
-                             uint8_t *dst,
-                             uint8_t *iv)
+                             const xts_uint128 *src,
+                             xts_uint128 *dst,
+                             xts_uint128 *iv)
 {
-    unsigned long x;
-
     /* tweak encrypt block i */
-    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
-        dst[x] = src[x] ^ iv[x];
-    }
+    xts_uint128_xor(dst, src, iv);
 
-    func(ctx, XTS_BLOCK_SIZE, dst, dst);
+    func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
 
-    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
-        dst[x] = dst[x] ^ iv[x];
-    }
+    xts_uint128_xor(dst, dst, iv);
 
     /* LFSR the tweak */
-    xts_mult_x(iv);
+    xts_mult_x((uint8_t *)iv);
 }
 
 
@@ -110,7 +110,11 @@ void xts_decrypt(const void *datactx,
     encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
+        xts_uint128 S, D;
+
+        memcpy(&S, src, XTS_BLOCK_SIZE);
+        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
+        memcpy(dst, &D, XTS_BLOCK_SIZE);
 
         src += XTS_BLOCK_SIZE;
         dst += XTS_BLOCK_SIZE;
@@ -118,11 +122,13 @@ void xts_decrypt(const void *datactx,
 
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
+        xts_uint128 S, D;
         memcpy(&CC, &T, XTS_BLOCK_SIZE);
         xts_mult_x((uint8_t *)&CC);
 
         /* PP = tweak decrypt block m-1 */
-        xts_tweak_encdec(datactx, decfunc, src, (uint8_t *)&PP, (uint8_t *)&CC);
+        memcpy(&S, src, XTS_BLOCK_SIZE);
+        xts_tweak_encdec(datactx, decfunc, &S, &PP, &CC);
 
         /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
         for (i = 0; i < mo; i++) {
@@ -134,7 +140,8 @@ void xts_decrypt(const void *datactx,
         }
 
         /* Pm-1 = Tweak uncrypt CC */
-        xts_tweak_encdec(datactx, decfunc, (uint8_t *)&CC, dst, (uint8_t *)&T);
+        xts_tweak_encdec(datactx, decfunc, &CC, &D, &T);
+        memcpy(dst, &D, XTS_BLOCK_SIZE);
     }
 
     /* Decrypt the iv back */
@@ -171,7 +178,11 @@ void xts_encrypt(const void *datactx,
     encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, encfunc, src, dst, (uint8_t *)&T);
+        xts_uint128 S, D;
+
+        memcpy(&S, src, XTS_BLOCK_SIZE);
+        xts_tweak_encdec(datactx, encfunc, &S, &D, &T);
+        memcpy(dst, &D, XTS_BLOCK_SIZE);
 
         dst += XTS_BLOCK_SIZE;
         src += XTS_BLOCK_SIZE;
@@ -179,8 +190,10 @@ void xts_encrypt(const void *datactx,
 
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
+        xts_uint128 S, D;
         /* CC = tweak encrypt block m-1 */
-        xts_tweak_encdec(datactx, encfunc, src, (uint8_t *)&CC, (uint8_t *)&T);
+        memcpy(&S, src, XTS_BLOCK_SIZE);
+        xts_tweak_encdec(datactx, encfunc, &S, &CC, &T);
 
         /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
         for (i = 0; i < mo; i++) {
@@ -193,7 +206,8 @@ void xts_encrypt(const void *datactx,
         }
 
         /* Cm-1 = Tweak encrypt PP */
-        xts_tweak_encdec(datactx, encfunc, (uint8_t *)&PP, dst, (uint8_t *)&T);
+        xts_tweak_encdec(datactx, encfunc, &PP, &D, &T);
+        memcpy(dst, &D, XTS_BLOCK_SIZE);
     }
 
     /* Decrypt the iv back */
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x to use xts_uint128 type
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
                   ` (3 preceding siblings ...)
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 13:52   ` Alberto Garcia
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable Daniel P. Berrangé
  2018-10-09 13:59 ` [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Marc-André Lureau
  6 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

Using 64-bit arithmetic increases the performance for xts-aes-128
when built with gcrypt:

  Encrypt: 320 MB/s -> 460 MB/s
  Decrypt: 325 MB/s -> 485 MB/s

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index f109c8a3ee..bba3280746 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -37,19 +37,17 @@ typedef struct {
         (D)->b = (S1)->b ^ (S2)->b;             \
     } while (0)
 
-static void xts_mult_x(uint8_t *I)
+static void xts_mult_x(xts_uint128 *I)
 {
-    int x;
-    uint8_t t, tt;
+    uint64_t tt;
 
-    for (x = t = 0; x < 16; x++) {
-        tt = I[x] >> 7;
-        I[x] = ((I[x] << 1) | t) & 0xFF;
-        t = tt;
-    }
-    if (tt) {
-        I[0] ^= 0x87;
+    tt = I->a >> 63;
+    I->a = I->a << 1;
+
+    if (I->b >> 63) {
+        I->a ^= 0x87;
     }
+    I->b = (I->b << 1) | tt;
 }
 
 
@@ -77,7 +75,7 @@ static void xts_tweak_encdec(const void *ctx,
     xts_uint128_xor(dst, dst, iv);
 
     /* LFSR the tweak */
-    xts_mult_x((uint8_t *)iv);
+    xts_mult_x(iv);
 }
 
 
@@ -124,7 +122,7 @@ void xts_decrypt(const void *datactx,
     if (mo > 0) {
         xts_uint128 S, D;
         memcpy(&CC, &T, XTS_BLOCK_SIZE);
-        xts_mult_x((uint8_t *)&CC);
+        xts_mult_x(&CC);
 
         /* PP = tweak decrypt block m-1 */
         memcpy(&S, src, XTS_BLOCK_SIZE);
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
                   ` (4 preceding siblings ...)
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x " Daniel P. Berrangé
@ 2018-10-09 12:55 ` Daniel P. Berrangé
  2018-10-09 15:37   ` Alberto Garcia
  2018-10-09 13:59 ` [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Marc-André Lureau
  6 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 12:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Daniel P. Berrangé, Alberto Garcia

Encouraging the compiler to inline xts_tweak_encdec increases the
performance for xts-aes-128 when built with gcrypt:

  Encrypt: 460 MB/s -> 485 MB/s
  Decrypt: 485 MB/s -> 505 MB/s

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index bba3280746..02d3bc3f16 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -61,11 +61,11 @@ static void xts_mult_x(xts_uint128 *I)
  *
  * Encrypt/decrypt data with a tweak
  */
-static void xts_tweak_encdec(const void *ctx,
-                             xts_cipher_func *func,
-                             const xts_uint128 *src,
-                             xts_uint128 *dst,
-                             xts_uint128 *iv)
+static inline void xts_tweak_encdec(const void *ctx,
+                                    xts_cipher_func *func,
+                                    const xts_uint128 *src,
+                                    xts_uint128 *dst,
+                                    xts_uint128 *iv)
 {
     /* tweak encrypt block i */
     xts_uint128_xor(dst, src, iv);
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt Daniel P. Berrangé
@ 2018-10-09 13:43   ` Alberto Garcia
  2018-10-09 13:51   ` Marc-André Lureau
  1 sibling, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 13:43 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:37 PM CEST, Daniel P. Berrangé wrote:
> The tweak encrypt/decrypt functions are identical except for the
> comments, so can be merged. Profiling data shows that the compiler is
> in fact already merging the two merges in the object files.
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt Daniel P. Berrangé
  2018-10-09 13:43   ` Alberto Garcia
@ 2018-10-09 13:51   ` Marc-André Lureau
  1 sibling, 0 replies; 27+ messages in thread
From: Marc-André Lureau @ 2018-10-09 13:51 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: QEMU, Alberto Garcia

Hi

On Tue, Oct 9, 2018 at 4:56 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> The tweak encrypt/decrypt functions are identical except for the
> comments, so can be merged. Profiling data shows that the compiler is
> in fact already merging the two merges in the object files.
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>

> ---
>  crypto/xts.c | 64 ++++++++++++----------------------------------------
>  1 file changed, 15 insertions(+), 49 deletions(-)
>
> diff --git a/crypto/xts.c b/crypto/xts.c
> index 95212341f6..3c1a92f01d 100644
> --- a/crypto/xts.c
> +++ b/crypto/xts.c
> @@ -43,20 +43,20 @@ static void xts_mult_x(uint8_t *I)
>
>
>  /**
> - * xts_tweak_uncrypt:
> + * xts_tweak_encdec:
>   * @param ctxt: the cipher context
>   * @param func: the cipher function
> - * @src: buffer providing the cipher text of XTS_BLOCK_SIZE bytes
> - * @dst: buffer to output the plain text of XTS_BLOCK_SIZE bytes
> + * @src: buffer providing the input text of XTS_BLOCK_SIZE bytes
> + * @dst: buffer to output the output text of XTS_BLOCK_SIZE bytes
>   * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
>   *
> - * Decrypt data with a tweak
> + * Encrypt/decrypt data with a tweak
>   */
> -static void xts_tweak_decrypt(const void *ctx,
> -                              xts_cipher_func *func,
> -                              const uint8_t *src,
> -                              uint8_t *dst,
> -                              uint8_t *iv)
> +static void xts_tweak_encdec(const void *ctx,
> +                             xts_cipher_func *func,
> +                             const uint8_t *src,
> +                             uint8_t *dst,
> +                             uint8_t *iv)
>  {
>      unsigned long x;
>
> @@ -105,7 +105,7 @@ void xts_decrypt(const void *datactx,
>      encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
>
>      for (i = 0; i < lim; i++) {
> -        xts_tweak_decrypt(datactx, decfunc, src, dst, T);
> +        xts_tweak_encdec(datactx, decfunc, src, dst, T);
>
>          src += XTS_BLOCK_SIZE;
>          dst += XTS_BLOCK_SIZE;
> @@ -117,7 +117,7 @@ void xts_decrypt(const void *datactx,
>          xts_mult_x(CC);
>
>          /* PP = tweak decrypt block m-1 */
> -        xts_tweak_decrypt(datactx, decfunc, src, PP, CC);
> +        xts_tweak_encdec(datactx, decfunc, src, PP, CC);
>
>          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
>          for (i = 0; i < mo; i++) {
> @@ -129,7 +129,7 @@ void xts_decrypt(const void *datactx,
>          }
>
>          /* Pm-1 = Tweak uncrypt CC */
> -        xts_tweak_decrypt(datactx, decfunc, CC, dst, T);
> +        xts_tweak_encdec(datactx, decfunc, CC, dst, T);
>      }
>
>      /* Decrypt the iv back */
> @@ -137,40 +137,6 @@ void xts_decrypt(const void *datactx,
>  }
>
>
> -/**
> - * xts_tweak_crypt:
> - * @param ctxt: the cipher context
> - * @param func: the cipher function
> - * @src: buffer providing the plain text of XTS_BLOCK_SIZE bytes
> - * @dst: buffer to output the cipher text of XTS_BLOCK_SIZE bytes
> - * @iv: the initialization vector tweak of XTS_BLOCK_SIZE bytes
> - *
> - * Encrypt data with a tweak
> - */
> -static void xts_tweak_encrypt(const void *ctx,
> -                              xts_cipher_func *func,
> -                              const uint8_t *src,
> -                              uint8_t *dst,
> -                              uint8_t *iv)
> -{
> -    unsigned long x;
> -
> -    /* tweak encrypt block i */
> -    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> -        dst[x] = src[x] ^ iv[x];
> -    }
> -
> -    func(ctx, XTS_BLOCK_SIZE, dst, dst);
> -
> -    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> -        dst[x] = dst[x] ^ iv[x];
> -    }
> -
> -    /* LFSR the tweak */
> -    xts_mult_x(iv);
> -}
> -
> -
>  void xts_encrypt(const void *datactx,
>                   const void *tweakctx,
>                   xts_cipher_func *encfunc,
> @@ -200,7 +166,7 @@ void xts_encrypt(const void *datactx,
>      encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
>
>      for (i = 0; i < lim; i++) {
> -        xts_tweak_encrypt(datactx, encfunc, src, dst, T);
> +        xts_tweak_encdec(datactx, encfunc, src, dst, T);
>
>          dst += XTS_BLOCK_SIZE;
>          src += XTS_BLOCK_SIZE;
> @@ -209,7 +175,7 @@ void xts_encrypt(const void *datactx,
>      /* if length is not a multiple of XTS_BLOCK_SIZE then */
>      if (mo > 0) {
>          /* CC = tweak encrypt block m-1 */
> -        xts_tweak_encrypt(datactx, encfunc, src, CC, T);
> +        xts_tweak_encdec(datactx, encfunc, src, CC, T);
>
>          /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
>          for (i = 0; i < mo; i++) {
> @@ -222,7 +188,7 @@ void xts_encrypt(const void *datactx,
>          }
>
>          /* Cm-1 = Tweak encrypt PP */
> -        xts_tweak_encrypt(datactx, encfunc, PP, dst, T);
> +        xts_tweak_encdec(datactx, encfunc, PP, dst, T);
>      }
>
>      /* Decrypt the iv back */
> --
> 2.17.1
>
>


--
Marc-André Lureau

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x to use xts_uint128 type
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x " Daniel P. Berrangé
@ 2018-10-09 13:52   ` Alberto Garcia
  2018-10-09 13:55     ` Daniel P. Berrangé
  0 siblings, 1 reply; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 13:52 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:40 PM CEST, Daniel P. Berrangé wrote:
> -static void xts_mult_x(uint8_t *I)
> +static void xts_mult_x(xts_uint128 *I)
>  {
> -    int x;
> -    uint8_t t, tt;
> +    uint64_t tt;
>  
> -    for (x = t = 0; x < 16; x++) {
> -        tt = I[x] >> 7;
> -        I[x] = ((I[x] << 1) | t) & 0xFF;
> -        t = tt;
> -    }
> -    if (tt) {
> -        I[0] ^= 0x87;
> +    tt = I->a >> 63;
> +    I->a = I->a << 1;
> +
> +    if (I->b >> 63) {
> +        I->a ^= 0x87;
>      }
> +    I->b = (I->b << 1) | tt;
>  }

Does this work fine in big-endian CPUs?

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x to use xts_uint128 type
  2018-10-09 13:52   ` Alberto Garcia
@ 2018-10-09 13:55     ` Daniel P. Berrangé
  2018-10-09 14:25       ` Alberto Garcia
  0 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 13:55 UTC (permalink / raw)
  To: Alberto Garcia; +Cc: qemu-devel

On Tue, Oct 09, 2018 at 03:52:53PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:40 PM CEST, Daniel P. Berrangé wrote:
> > -static void xts_mult_x(uint8_t *I)
> > +static void xts_mult_x(xts_uint128 *I)
> >  {
> > -    int x;
> > -    uint8_t t, tt;
> > +    uint64_t tt;
> >  
> > -    for (x = t = 0; x < 16; x++) {
> > -        tt = I[x] >> 7;
> > -        I[x] = ((I[x] << 1) | t) & 0xFF;
> > -        t = tt;
> > -    }
> > -    if (tt) {
> > -        I[0] ^= 0x87;
> > +    tt = I->a >> 63;
> > +    I->a = I->a << 1;
> > +
> > +    if (I->b >> 63) {
> > +        I->a ^= 0x87;
> >      }
> > +    I->b = (I->b << 1) | tt;
> >  }
> 
> Does this work fine in big-endian CPUs?

Hmm, that's a good question. I'd expect tests/test-crypto-xts to crash
and burn if it doesn't, so guess I'll need to find somewhere to validate
that.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode
  2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
                   ` (5 preceding siblings ...)
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable Daniel P. Berrangé
@ 2018-10-09 13:59 ` Marc-André Lureau
  2018-10-09 14:13   ` Daniel P. Berrangé
  6 siblings, 1 reply; 27+ messages in thread
From: Marc-André Lureau @ 2018-10-09 13:59 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: QEMU, Alberto Garcia

Hi

On Tue, Oct 9, 2018 at 4:57 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> The XTS cipher mode is significantly slower than CBC mode. This series
> approximately doubles the XTS performance which will improve the I/O
> rate for LUKS disks.
>
> Daniel P. Berrangé (6):
>   crypto: expand algorithm coverage for cipher benchmark
>   crypto: remove code duplication in tweak encrypt/decrypt
>   crypto: introduce a xts_uint128 data type
>   crypto: convert xts_tweak_encdec to use xts_uint128 type
>   crypto: convert xts_mult_x to use xts_uint128 type
>   crypto: annotate xts_tweak_encdec as inlineable
>
>  crypto/xts.c                    | 147 ++++++++++++++-----------------
>  tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----
>  2 files changed, 191 insertions(+), 105 deletions(-)

By using a constant amount of data to process, it's easier to measure
perfomance with perf stat:

diff --git a/tests/benchmark-crypto-cipher.c b/tests/benchmark-crypto-cipher.c
index a8325a9510..32a19987e6 100644
--- a/tests/benchmark-crypto-cipher.c
+++ b/tests/benchmark-crypto-cipher.c
@@ -65,7 +65,7 @@ static void test_cipher_speed(size_t chunk_size,
                                         chunk_size,
                                         &err) == 0);
         total += chunk_size;
-    } while (g_test_timer_elapsed() < 1.0);
+    } while (total / MiB < 500);

     total /= MiB;
     g_print("Enc chunk %zu bytes ", chunk_size);
@@ -80,7 +80,7 @@ static void test_cipher_speed(size_t chunk_size,
                                         chunk_size,
                                         &err) == 0);
         total += chunk_size;
-    } while (g_test_timer_elapsed() < 1.0);
+    } while (total / MiB < 500);

On my laptop: before your series:
       3701.625051      task-clock:u (msec)       #    0.997 CPUs
utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               438      page-faults:u             #    0.118 K/sec
    10,823,305,761      cycles:u                  #    2.924 GHz
    29,774,419,538      instructions:u            #    2.75  insn per
cycle
     4,919,267,782      branches:u                # 1328.948 M/sec
        32,923,105      branch-misses:u           #    0.67% of all
branches

       3.712998264 seconds time elapsed

Ater:
       2151.201355      task-clock:u (msec)       #    1.000 CPUs
utilized
                 0      context-switches:u        #    0.000 K/sec
                 0      cpu-migrations:u          #    0.000 K/sec
               431      page-faults:u             #    0.200 K/sec
     7,073,869,618      cycles:u                  #    3.288 GHz
     8,573,595,534      instructions:u            #    1.21  insn per
cycle
     1,576,926,668      branches:u                #  733.045 M/sec
           148,987      branch-misses:u           #    0.01% of all
branches

       2.151520872 seconds time elapsed


-- 
Marc-André Lureau

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
@ 2018-10-09 14:04   ` Marc-André Lureau
  2018-10-10 11:45   ` Alberto Garcia
  1 sibling, 0 replies; 27+ messages in thread
From: Marc-André Lureau @ 2018-10-09 14:04 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: QEMU, Alberto Garcia

Hi

On Tue, Oct 9, 2018 at 4:57 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> Add testing coverage for AES with XTS, ECB and CTR modes
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> ---
>  tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>

>  1 file changed, 126 insertions(+), 23 deletions(-)
>
> diff --git a/tests/benchmark-crypto-cipher.c b/tests/benchmark-crypto-cipher.c
> index f5a0d0bc32..a8325a9510 100644
> --- a/tests/benchmark-crypto-cipher.c
> +++ b/tests/benchmark-crypto-cipher.c
> @@ -15,17 +15,27 @@
>  #include "crypto/init.h"
>  #include "crypto/cipher.h"
>
> -static void test_cipher_speed(const void *opaque)
> +static void test_cipher_speed(size_t chunk_size,
> +                              QCryptoCipherMode mode,
> +                              QCryptoCipherAlgorithm alg)
>  {
>      QCryptoCipher *cipher;
>      Error *err = NULL;
>      double total = 0.0;
> -    size_t chunk_size = (size_t)opaque;
>      uint8_t *key = NULL, *iv = NULL;
>      uint8_t *plaintext = NULL, *ciphertext = NULL;
> -    size_t nkey = qcrypto_cipher_get_key_len(QCRYPTO_CIPHER_ALG_AES_128);
> -    size_t niv = qcrypto_cipher_get_iv_len(QCRYPTO_CIPHER_ALG_AES_128,
> -                                           QCRYPTO_CIPHER_MODE_CBC);
> +    size_t nkey;
> +    size_t niv;
> +
> +    if (!qcrypto_cipher_supports(alg, mode)) {
> +        return;
> +    }
> +
> +    nkey = qcrypto_cipher_get_key_len(alg);
> +    niv = qcrypto_cipher_get_iv_len(alg, mode);
> +    if (mode == QCRYPTO_CIPHER_MODE_XTS) {
> +        nkey *= 2;
> +    }
>
>      key = g_new0(uint8_t, nkey);
>      memset(key, g_test_rand_int(), nkey);
> @@ -38,14 +48,14 @@ static void test_cipher_speed(const void *opaque)
>      plaintext = g_new0(uint8_t, chunk_size);
>      memset(plaintext, g_test_rand_int(), chunk_size);
>
> -    cipher = qcrypto_cipher_new(QCRYPTO_CIPHER_ALG_AES_128,
> -                                QCRYPTO_CIPHER_MODE_CBC,
> +    cipher = qcrypto_cipher_new(alg, mode,
>                                  key, nkey, &err);
>      g_assert(cipher != NULL);
>
> -    g_assert(qcrypto_cipher_setiv(cipher,
> -                                  iv, niv,
> -                                  &err) == 0);
> +    if (mode != QCRYPTO_CIPHER_MODE_ECB)
> +        g_assert(qcrypto_cipher_setiv(cipher,
> +                                      iv, niv,
> +                                      &err) == 0);
>
>      g_test_timer_start();
>      do {
> @@ -55,13 +65,26 @@ static void test_cipher_speed(const void *opaque)
>                                          chunk_size,
>                                          &err) == 0);
>          total += chunk_size;
> -    } while (g_test_timer_elapsed() < 5.0);
> +    } while (g_test_timer_elapsed() < 1.0);
>
>      total /= MiB;
> -    g_print("cbc(aes128): ");
> -    g_print("Testing chunk_size %zu bytes ", chunk_size);
> -    g_print("done: %.2f MB in %.2f secs: ", total, g_test_timer_last());
> -    g_print("%.2f MB/sec\n", total / g_test_timer_last());
> +    g_print("Enc chunk %zu bytes ", chunk_size);
> +    g_print("%.2f MB/sec ", total / g_test_timer_last());
> +
> +    total = 0.0;
> +    g_test_timer_start();
> +    do {
> +        g_assert(qcrypto_cipher_decrypt(cipher,
> +                                        plaintext,
> +                                        ciphertext,
> +                                        chunk_size,
> +                                        &err) == 0);
> +        total += chunk_size;
> +    } while (g_test_timer_elapsed() < 1.0);
> +
> +    total /= MiB;
> +    g_print("Dec chunk %zu bytes ", chunk_size);
> +    g_print("%.2f MB/sec ", total / g_test_timer_last());
>
>      qcrypto_cipher_free(cipher);
>      g_free(plaintext);
> @@ -70,19 +93,99 @@ static void test_cipher_speed(const void *opaque)
>      g_free(key);
>  }
>
> -int main(int argc, char **argv)
> +
> +static void test_cipher_speed_ecb_aes_128(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_ECB,
> +                      QCRYPTO_CIPHER_ALG_AES_128);
> +}
> +
> +static void test_cipher_speed_ecb_aes_256(const void *opaque)
>  {
> -    size_t i;
> -    char name[64];
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_ECB,
> +                      QCRYPTO_CIPHER_ALG_AES_256);
> +}
> +
> +static void test_cipher_speed_cbc_aes_128(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_CBC,
> +                      QCRYPTO_CIPHER_ALG_AES_128);
> +}
>
> +static void test_cipher_speed_cbc_aes_256(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_CBC,
> +                      QCRYPTO_CIPHER_ALG_AES_256);
> +}
> +
> +static void test_cipher_speed_ctr_aes_128(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_CTR,
> +                      QCRYPTO_CIPHER_ALG_AES_128);
> +}
> +
> +static void test_cipher_speed_ctr_aes_256(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_CTR,
> +                      QCRYPTO_CIPHER_ALG_AES_256);
> +}
> +
> +static void test_cipher_speed_xts_aes_128(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_XTS,
> +                      QCRYPTO_CIPHER_ALG_AES_128);
> +}
> +
> +static void test_cipher_speed_xts_aes_256(const void *opaque)
> +{
> +    size_t chunk_size = (size_t)opaque;
> +    test_cipher_speed(chunk_size,
> +                      QCRYPTO_CIPHER_MODE_XTS,
> +                      QCRYPTO_CIPHER_ALG_AES_256);
> +}
> +
> +
> +int main(int argc, char **argv)
> +{
>      g_test_init(&argc, &argv, NULL);
>      g_assert(qcrypto_init(NULL) == 0);
>
> -    for (i = 512; i <= 64 * KiB; i *= 2) {
> -        memset(name, 0 , sizeof(name));
> -        snprintf(name, sizeof(name), "/crypto/cipher/speed-%zu", i);
> -        g_test_add_data_func(name, (void *)i, test_cipher_speed);
> -    }
> +#define ADD_TEST(mode, cipher, keysize, chunk)                          \
> +    g_test_add_data_func(                                               \
> +        "/crypto/cipher/" #mode "-" #cipher "-" #keysize "/chunk-" #chunk, \
> +        (void *)chunk,                                                  \
> +        test_cipher_speed_ ## mode ## _ ## cipher ## _ ## keysize)
> +
> +#define ADD_TESTS(chunk)                        \
> +    do {                                        \
> +        ADD_TEST(ecb, aes, 128, chunk);         \
> +        ADD_TEST(ecb, aes, 256, chunk);         \
> +        ADD_TEST(cbc, aes, 128, chunk);         \
> +        ADD_TEST(cbc, aes, 256, chunk);         \
> +        ADD_TEST(ctr, aes, 128, chunk);         \
> +        ADD_TEST(ctr, aes, 256, chunk);         \
> +        ADD_TEST(xts, aes, 128, chunk);         \
> +        ADD_TEST(xts, aes, 256, chunk); \
> +    } while (0)
> +
> +    ADD_TESTS(512);
> +    ADD_TESTS(4096);
> +    ADD_TESTS(16384);
> +    ADD_TESTS(65536);
>
>      return g_test_run();
>  }
> --
> 2.17.1
>
>


--
Marc-André Lureau

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode
  2018-10-09 13:59 ` [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Marc-André Lureau
@ 2018-10-09 14:13   ` Daniel P. Berrangé
  2018-10-09 14:27     ` Marc-André Lureau
  0 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 14:13 UTC (permalink / raw)
  To: Marc-André Lureau; +Cc: QEMU, Alberto Garcia

On Tue, Oct 09, 2018 at 05:59:46PM +0400, Marc-André Lureau wrote:
> Hi
> 
> On Tue, Oct 9, 2018 at 4:57 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > The XTS cipher mode is significantly slower than CBC mode. This series
> > approximately doubles the XTS performance which will improve the I/O
> > rate for LUKS disks.
> >
> > Daniel P. Berrangé (6):
> >   crypto: expand algorithm coverage for cipher benchmark
> >   crypto: remove code duplication in tweak encrypt/decrypt
> >   crypto: introduce a xts_uint128 data type
> >   crypto: convert xts_tweak_encdec to use xts_uint128 type
> >   crypto: convert xts_mult_x to use xts_uint128 type
> >   crypto: annotate xts_tweak_encdec as inlineable
> >
> >  crypto/xts.c                    | 147 ++++++++++++++-----------------
> >  tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----
> >  2 files changed, 191 insertions(+), 105 deletions(-)
> 
> By using a constant amount of data to process, it's easier to measure
> perfomance with perf stat:

The problem is that the different encryption modes have wildly
different performance. eg while XTS gets 400 MB/s, ECB gets
3000 MB/s. I want the test to run long enough to minimize the
noise, and picking a data size large enough for best ECB
perf while not being excessively large for XTS is hard. THus
I prefer to have a fixed execution time for each test.


Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x to use xts_uint128 type
  2018-10-09 13:55     ` Daniel P. Berrangé
@ 2018-10-09 14:25       ` Alberto Garcia
  0 siblings, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 14:25 UTC (permalink / raw)
  To: Daniel P. Berrangé; +Cc: qemu-devel

On Tue 09 Oct 2018 03:55:34 PM CEST, Daniel P. Berrangé wrote:
> On Tue, Oct 09, 2018 at 03:52:53PM +0200, Alberto Garcia wrote:
>> On Tue 09 Oct 2018 02:55:40 PM CEST, Daniel P. Berrangé wrote:
>> > -static void xts_mult_x(uint8_t *I)
>> > +static void xts_mult_x(xts_uint128 *I)
>> >  {
>> > -    int x;
>> > -    uint8_t t, tt;
>> > +    uint64_t tt;
>> >  
>> > -    for (x = t = 0; x < 16; x++) {
>> > -        tt = I[x] >> 7;
>> > -        I[x] = ((I[x] << 1) | t) & 0xFF;
>> > -        t = tt;
>> > -    }
>> > -    if (tt) {
>> > -        I[0] ^= 0x87;
>> > +    tt = I->a >> 63;
>> > +    I->a = I->a << 1;
>> > +
>> > +    if (I->b >> 63) {
>> > +        I->a ^= 0x87;
>> >      }
>> > +    I->b = (I->b << 1) | tt;
>> >  }
>> 
>> Does this work fine in big-endian CPUs?
>
> Hmm, that's a good question. I'd expect tests/test-crypto-xts to crash
> and burn if it doesn't, so guess I'll need to find somewhere to validate
> that.

I just tried in s390x:

/crypto/xts/t-1-key-32-ptx-32: **
ERROR:/home/berto/qemu/tests/test-crypto-xts.c:386:test_xts: assertion failed: (memcmp(out, data->CTX, data->PTLEN) == 0)
Aborted

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode
  2018-10-09 14:13   ` Daniel P. Berrangé
@ 2018-10-09 14:27     ` Marc-André Lureau
  0 siblings, 0 replies; 27+ messages in thread
From: Marc-André Lureau @ 2018-10-09 14:27 UTC (permalink / raw)
  To: Daniel P. Berrange; +Cc: QEMU, Alberto Garcia

Hi

On Tue, Oct 9, 2018 at 6:13 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Tue, Oct 09, 2018 at 05:59:46PM +0400, Marc-André Lureau wrote:
> > Hi
> >
> > On Tue, Oct 9, 2018 at 4:57 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > The XTS cipher mode is significantly slower than CBC mode. This series
> > > approximately doubles the XTS performance which will improve the I/O
> > > rate for LUKS disks.
> > >
> > > Daniel P. Berrangé (6):
> > >   crypto: expand algorithm coverage for cipher benchmark
> > >   crypto: remove code duplication in tweak encrypt/decrypt
> > >   crypto: introduce a xts_uint128 data type
> > >   crypto: convert xts_tweak_encdec to use xts_uint128 type
> > >   crypto: convert xts_mult_x to use xts_uint128 type
> > >   crypto: annotate xts_tweak_encdec as inlineable
> > >
> > >  crypto/xts.c                    | 147 ++++++++++++++-----------------
> > >  tests/benchmark-crypto-cipher.c | 149 +++++++++++++++++++++++++++-----
> > >  2 files changed, 191 insertions(+), 105 deletions(-)
> >
> > By using a constant amount of data to process, it's easier to measure
> > perfomance with perf stat:
>
> The problem is that the different encryption modes have wildly
> different performance. eg while XTS gets 400 MB/s, ECB gets
> 3000 MB/s. I want the test to run long enough to minimize the
> noise, and picking a data size large enough for best ECB
> perf while not being excessively large for XTS is hard. THus
> I prefer to have a fixed execution time for each test.

I understand, I was just giving you some nice numbers to back your patches ;)

Otoh, I think having a fixed-size work for benchmark is more reliable,
even if the test runs quickly. I wouldn't rely on the current
benchmark results, they are quite unpredictable on my system.

>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|



-- 
Marc-André Lureau

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type Daniel P. Berrangé
@ 2018-10-09 14:40   ` Alberto Garcia
  2018-10-09 14:50   ` Alberto Garcia
  1 sibling, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 14:40 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:
> The new type is designed to allow use of 64-bit arithmetic instead
> of operating 1-byte at a time. The following patches will use this to
> improve performance.
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

I suppose that the fixes for the endianness problem may end up requiring
you to change this, but the patch itself is fine as it is now.

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type Daniel P. Berrangé
  2018-10-09 14:40   ` Alberto Garcia
@ 2018-10-09 14:50   ` Alberto Garcia
  2018-10-09 14:58     ` Daniel P. Berrangé
  1 sibling, 1 reply; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 14:50 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:

> @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
>                   uint8_t *dst,
>                   const uint8_t *src)
>  {
> -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
> +    xts_uint128 PP, CC, T;
>      unsigned long i, m, mo, lim;

   [...]

>          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
>          for (i = 0; i < mo; i++) {
> -            CC[i] = src[XTS_BLOCK_SIZE + i];
> -            dst[XTS_BLOCK_SIZE + i] = PP[i];
> +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
> +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
>          }

On second thoughts, these casts are a bit cumbersome. I wonder if it
isn't better to keep the array a uint8_t[] and only treat it as
xts_uint128 in the places where you actually do 64-bit operations
(xts_uint128_xor, xts_mult_x).

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type
  2018-10-09 14:50   ` Alberto Garcia
@ 2018-10-09 14:58     ` Daniel P. Berrangé
  2018-10-09 15:14       ` Alberto Garcia
  0 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 14:58 UTC (permalink / raw)
  To: Alberto Garcia; +Cc: qemu-devel

On Tue, Oct 09, 2018 at 04:50:16PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:
> 
> > @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
> >                   uint8_t *dst,
> >                   const uint8_t *src)
> >  {
> > -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
> > +    xts_uint128 PP, CC, T;
> >      unsigned long i, m, mo, lim;
> 
>    [...]
> 
> >          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
> >          for (i = 0; i < mo; i++) {
> > -            CC[i] = src[XTS_BLOCK_SIZE + i];
> > -            dst[XTS_BLOCK_SIZE + i] = PP[i];
> > +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
> > +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
> >          }
> 
> On second thoughts, these casts are a bit cumbersome. I wonder if it
> isn't better to keep the array a uint8_t[] and only treat it as
> xts_uint128 in the places where you actually do 64-bit operations
> (xts_uint128_xor, xts_mult_x).

I had done that originally, but it just shifts ugly casts from one
place to another place in the code. I preferred the idea of storing
it all as a 128bit data type since that's matching the operational
block size.

A further alternative is for xts_uint128 to be a union providing
both, and then have an extra level of access for respective fields,
which I had also tried at one time but ultimately i decided I didn't
mind the casts.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type Daniel P. Berrangé
@ 2018-10-09 15:02   ` Alberto Garcia
  2018-10-09 15:06     ` Daniel P. Berrangé
  0 siblings, 1 reply; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 15:02 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> Using 64-bit arithmetic increases the performance for xts-aes-128
> when built with gcrypt:
>
>   Encrypt: 235 MB/s -> 320 MB/s
>   Decrypt: 245 MB/s -> 325 MB/s
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> ---
>  crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/crypto/xts.c b/crypto/xts.c
> index ded4365191..f109c8a3ee 100644
> --- a/crypto/xts.c
> +++ b/crypto/xts.c
> @@ -31,6 +31,12 @@ typedef struct {
>      uint64_t b;
>  } xts_uint128;
>  
> +#define xts_uint128_xor(D, S1, S2)              \
> +    do {                                        \
> +        (D)->a = (S1)->a ^ (S2)->a;             \
> +        (D)->b = (S1)->b ^ (S2)->b;             \
> +    } while (0)
> +
>  static void xts_mult_x(uint8_t *I)
>  {
>      int x;
> @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
>   */
>  static void xts_tweak_encdec(const void *ctx,
>                               xts_cipher_func *func,
> -                             const uint8_t *src,
> -                             uint8_t *dst,
> -                             uint8_t *iv)
> +                             const xts_uint128 *src,
> +                             xts_uint128 *dst,
> +                             xts_uint128 *iv)
>  {
> -    unsigned long x;
> -
>      /* tweak encrypt block i */
> -    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> -        dst[x] = src[x] ^ iv[x];
> -    }
> +    xts_uint128_xor(dst, src, iv);
>  
> -    func(ctx, XTS_BLOCK_SIZE, dst, dst);
> +    func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);

In the line of what I said earlier, perhaps it's clearer if you leave
everything as uint8_t * and simply make xts_uint128_xor() treat the
array as xts_uint128 internally.

>      for (i = 0; i < lim; i++) {
> -        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> +        xts_uint128 S, D;
> +
> +        memcpy(&S, src, XTS_BLOCK_SIZE);
> +        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> +        memcpy(dst, &D, XTS_BLOCK_SIZE);

Why do you need S and D?

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 15:02   ` Alberto Garcia
@ 2018-10-09 15:06     ` Daniel P. Berrangé
  2018-10-09 15:30       ` Alberto Garcia
  0 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 15:06 UTC (permalink / raw)
  To: Alberto Garcia; +Cc: qemu-devel

On Tue, Oct 09, 2018 at 05:02:39PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> > Using 64-bit arithmetic increases the performance for xts-aes-128
> > when built with gcrypt:
> >
> >   Encrypt: 235 MB/s -> 320 MB/s
> >   Decrypt: 245 MB/s -> 325 MB/s
> >
> > Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> > ---
> >  crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
> >  1 file changed, 33 insertions(+), 19 deletions(-)
> >
> > diff --git a/crypto/xts.c b/crypto/xts.c
> > index ded4365191..f109c8a3ee 100644
> > --- a/crypto/xts.c
> > +++ b/crypto/xts.c
> > @@ -31,6 +31,12 @@ typedef struct {
> >      uint64_t b;
> >  } xts_uint128;
> >  
> > +#define xts_uint128_xor(D, S1, S2)              \
> > +    do {                                        \
> > +        (D)->a = (S1)->a ^ (S2)->a;             \
> > +        (D)->b = (S1)->b ^ (S2)->b;             \
> > +    } while (0)
> > +
> >  static void xts_mult_x(uint8_t *I)
> >  {
> >      int x;
> > @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
> >   */
> >  static void xts_tweak_encdec(const void *ctx,
> >                               xts_cipher_func *func,
> > -                             const uint8_t *src,
> > -                             uint8_t *dst,
> > -                             uint8_t *iv)
> > +                             const xts_uint128 *src,
> > +                             xts_uint128 *dst,
> > +                             xts_uint128 *iv)
> >  {
> > -    unsigned long x;
> > -
> >      /* tweak encrypt block i */
> > -    for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> > -        dst[x] = src[x] ^ iv[x];
> > -    }
> > +    xts_uint128_xor(dst, src, iv);
> >  
> > -    func(ctx, XTS_BLOCK_SIZE, dst, dst);
> > +    func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
> 
> In the line of what I said earlier, perhaps it's clearer if you leave
> everything as uint8_t * and simply make xts_uint128_xor() treat the
> array as xts_uint128 internally.
> 
> >      for (i = 0; i < lim; i++) {
> > -        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> > +        xts_uint128 S, D;
> > +
> > +        memcpy(&S, src, XTS_BLOCK_SIZE);
> > +        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> > +        memcpy(dst, &D, XTS_BLOCK_SIZE);
> 
> Why do you need S and D?

I think src & dst pointers can't be guaranteed to be aligned sufficiently
for int64 operations, if we just cast from uint8t*. 

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type
  2018-10-09 14:58     ` Daniel P. Berrangé
@ 2018-10-09 15:14       ` Alberto Garcia
  0 siblings, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 15:14 UTC (permalink / raw)
  To: Daniel P. Berrangé; +Cc: qemu-devel

On Tue 09 Oct 2018 04:58:39 PM CEST, Daniel P. Berrangé wrote:
>> > @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
>> >                   uint8_t *dst,
>> >                   const uint8_t *src)
>> >  {
>> > -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
>> > +    xts_uint128 PP, CC, T;
>> >      unsigned long i, m, mo, lim;
>> 
>>    [...]
>> 
>> >          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
>> >          for (i = 0; i < mo; i++) {
>> > -            CC[i] = src[XTS_BLOCK_SIZE + i];
>> > -            dst[XTS_BLOCK_SIZE + i] = PP[i];
>> > +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
>> > +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
>> >          }
>> 
>> On second thoughts, these casts are a bit cumbersome. I wonder if it
>> isn't better to keep the array a uint8_t[] and only treat it as
>> xts_uint128 in the places where you actually do 64-bit operations
>> (xts_uint128_xor, xts_mult_x).
>
> I had done that originally, but it just shifts ugly casts from one
> place to another place in the code.

Does it really? There's a dozen casts to uint8_t * in different
places. If you use uint_8[] you would only need something like this:

static void xts_mult_x(uint8_t *I8)
{
    xts_uint128 *I = (xts_uint128 *) I8;
    /* ... the rest of the function remains the same ... */
}

And something similar in xts_uint128_xor(), which could be an inline
function instead of a macro.

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 15:06     ` Daniel P. Berrangé
@ 2018-10-09 15:30       ` Alberto Garcia
  2018-10-09 15:31         ` Daniel P. Berrangé
  0 siblings, 1 reply; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 15:30 UTC (permalink / raw)
  To: Daniel P. Berrangé; +Cc: qemu-devel

>> >      for (i = 0; i < lim; i++) {
>> > -        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> > +        xts_uint128 S, D;
>> > +
>> > +        memcpy(&S, src, XTS_BLOCK_SIZE);
>> > +        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> > +        memcpy(dst, &D, XTS_BLOCK_SIZE);
>> 
>> Why do you need S and D?
>
> I think src & dst pointers can't be guaranteed to be aligned
> sufficiently for int64 operations, if we just cast from uint8t*.

I see. I did a quick test without the memcpy() calls and it doesn't seem
to have a visible effect on performance, but if it turns out that it
does then maybe this is worth investigating further. I suspect all
buffers received by this code are allocated with qemu_try_blockalign()
anyway, so it should be safe.

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 15:30       ` Alberto Garcia
@ 2018-10-09 15:31         ` Daniel P. Berrangé
  2018-10-11 12:16           ` Alberto Garcia
  0 siblings, 1 reply; 27+ messages in thread
From: Daniel P. Berrangé @ 2018-10-09 15:31 UTC (permalink / raw)
  To: Alberto Garcia; +Cc: qemu-devel

On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
> >> >      for (i = 0; i < lim; i++) {
> >> > -        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> >> > +        xts_uint128 S, D;
> >> > +
> >> > +        memcpy(&S, src, XTS_BLOCK_SIZE);
> >> > +        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> >> > +        memcpy(dst, &D, XTS_BLOCK_SIZE);
> >> 
> >> Why do you need S and D?
> >
> > I think src & dst pointers can't be guaranteed to be aligned
> > sufficiently for int64 operations, if we just cast from uint8t*.
> 
> I see. I did a quick test without the memcpy() calls and it doesn't seem
> to have a visible effect on performance, but if it turns out that it
> does then maybe this is worth investigating further. I suspect all
> buffers received by this code are allocated with qemu_try_blockalign()
> anyway, so it should be safe.

The extra memcpy() calls certainly had a perf impact when I added
them, so if we can determine that we can safely do without, that
would be desirable.


Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable Daniel P. Berrangé
@ 2018-10-09 15:37   ` Alberto Garcia
  0 siblings, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-09 15:37 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:41 PM CEST, Daniel P. Berrangé wrote:
> Encouraging the compiler to inline xts_tweak_encdec increases the
> performance for xts-aes-128 when built with gcrypt:
>
>   Encrypt: 460 MB/s -> 485 MB/s
>   Decrypt: 485 MB/s -> 505 MB/s

:-)

> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark
  2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
  2018-10-09 14:04   ` Marc-André Lureau
@ 2018-10-10 11:45   ` Alberto Garcia
  1 sibling, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-10 11:45 UTC (permalink / raw)
  To: Daniel P. Berrangé, qemu-devel

On Tue 09 Oct 2018 02:55:36 PM CEST, Daniel P. Berrangé wrote:
> Add testing coverage for AES with XTS, ECB and CTR modes
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type
  2018-10-09 15:31         ` Daniel P. Berrangé
@ 2018-10-11 12:16           ` Alberto Garcia
  0 siblings, 0 replies; 27+ messages in thread
From: Alberto Garcia @ 2018-10-11 12:16 UTC (permalink / raw)
  To: Daniel P. Berrangé; +Cc: qemu-devel

On Tue 09 Oct 2018 05:31:32 PM CEST, Daniel P. Berrangé wrote:
> On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
>> >> >      for (i = 0; i < lim; i++) {
>> >> > -        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> >> > +        xts_uint128 S, D;
>> >> > +
>> >> > +        memcpy(&S, src, XTS_BLOCK_SIZE);
>> >> > +        xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> >> > +        memcpy(dst, &D, XTS_BLOCK_SIZE);
>> >> 
>> >> Why do you need S and D?
>> >
>> > I think src & dst pointers can't be guaranteed to be aligned
>> > sufficiently for int64 operations, if we just cast from uint8t*.
>> 
>> I see. I did a quick test without the memcpy() calls and it doesn't
>> seem to have a visible effect on performance, but if it turns out
>> that it does then maybe this is worth investigating further. I
>> suspect all buffers received by this code are allocated with
>> qemu_try_blockalign() anyway, so it should be safe.
>
> The extra memcpy() calls certainly had a perf impact when I added
> them, so if we can determine that we can safely do without, that would
> be desirable.

So I was having a look at this. From the block layer everything comes
properly aligned. Then there's VirtioCrypto, which seems to allow XTS
mode but I couldn't quite tell from virtio_crypto_sym_op_helper() if all
buffers are guaranteed to be aligned.

What you could do is a runtime check (with QEMU_PTR_IS_ALIGNED()) and
decide what implementation to use.

A couple of additional thoughts:

- x86_64 (and others) allow unaligned memory accesses, and that might be
  faster than copying the buffer using memcpy(). I haven't measured it
  however.

- qcrypto_block_{encrypt,decrypt}_helper() (used for encrypted block
  I/O) use the same buffer for input and output, so maybe it's worth
  exploring if this fact allows for additional optimization if you still
  need to use memcpy().

Berto

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2018-10-11 12:17 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-09 12:55 [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Daniel P. Berrangé
2018-10-09 12:55 ` [Qemu-devel] [PATCH 1/6] crypto: expand algorithm coverage for cipher benchmark Daniel P. Berrangé
2018-10-09 14:04   ` Marc-André Lureau
2018-10-10 11:45   ` Alberto Garcia
2018-10-09 12:55 ` [Qemu-devel] [PATCH 2/6] crypto: remove code duplication in tweak encrypt/decrypt Daniel P. Berrangé
2018-10-09 13:43   ` Alberto Garcia
2018-10-09 13:51   ` Marc-André Lureau
2018-10-09 12:55 ` [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type Daniel P. Berrangé
2018-10-09 14:40   ` Alberto Garcia
2018-10-09 14:50   ` Alberto Garcia
2018-10-09 14:58     ` Daniel P. Berrangé
2018-10-09 15:14       ` Alberto Garcia
2018-10-09 12:55 ` [Qemu-devel] [PATCH 4/6] crypto: convert xts_tweak_encdec to use xts_uint128 type Daniel P. Berrangé
2018-10-09 15:02   ` Alberto Garcia
2018-10-09 15:06     ` Daniel P. Berrangé
2018-10-09 15:30       ` Alberto Garcia
2018-10-09 15:31         ` Daniel P. Berrangé
2018-10-11 12:16           ` Alberto Garcia
2018-10-09 12:55 ` [Qemu-devel] [PATCH 5/6] crypto: convert xts_mult_x " Daniel P. Berrangé
2018-10-09 13:52   ` Alberto Garcia
2018-10-09 13:55     ` Daniel P. Berrangé
2018-10-09 14:25       ` Alberto Garcia
2018-10-09 12:55 ` [Qemu-devel] [PATCH 6/6] crypto: annotate xts_tweak_encdec as inlineable Daniel P. Berrangé
2018-10-09 15:37   ` Alberto Garcia
2018-10-09 13:59 ` [Qemu-devel] [PATCH 0/6] crypto: improve performance of XTS cipher mode Marc-André Lureau
2018-10-09 14:13   ` Daniel P. Berrangé
2018-10-09 14:27     ` Marc-André Lureau

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.