All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/2] This patch updates runtime check of AVX512
@ 2022-08-08  7:48 ling xu
  2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: ling xu @ 2022-08-08  7:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu

This patch updates runtime check of AVX512 and update avx512 support for
xbzrle_encode_buffer function to accelerate xbzrle encoding speed.

The runtime check is updated in meson.build and meson_options.txt.

The updated AVX512 algorithm is provided in ram.c, xbzrle.c and
xbzrle.h.

The test code is provided in test-xbzrle.c.

Previous discussion is refered below:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg903520.html

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer function
  Test code for AVX512 support for xbzrle_encode_buffer

 meson.build              |  16 ++
 meson_options.txt        |   2 +
 migration/ram.c          |  41 ++++++
 migration/xbzrle.c       | 181 +++++++++++++++++++++++
 migration/xbzrle.h       |   4 +
 tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
 6 files changed, 534 insertions(+), 17 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-08  7:48 [PATCH v3 0/2] This patch updates runtime check of AVX512 ling xu
@ 2022-08-08  7:48 ` ling xu
  2022-08-08 13:12   ` Juan Quintela
  2022-08-09 18:41   ` Richard Henderson
  2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
  2022-08-08 11:54 ` [PATCH v3 0/2] This patch updates runtime check of AVX512 Juan Quintela
  2 siblings, 2 replies; 14+ messages in thread
From: ling xu @ 2022-08-08  7:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

This commit update runtime check of AVX512, and implements avx512 of
xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
Compared with C version of xbzrle_encode_buffer function, avx512 version
can achieve almost 60%-70% performance improvement on unit test provided
by Qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 meson.build        |  16 ++++
 meson_options.txt  |   2 +
 migration/ram.c    |  41 ++++++++++
 migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
 migration/xbzrle.h |   4 +
 5 files changed, 244 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
     int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
+  .require(cc.links('''
+    #pragma GCC push_options
+    #pragma GCC target("avx512bw")
+    #include <cpuid.h>
+    #include <immintrin.h>
+    static int bar(void *a) {
+
+      __m512i x = *(__m512i *)a;
+      __m512i res= _mm512_abs_epi8(x);
+      return res[1];
+    }
+    int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
        description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
        description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+       description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
        description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..d9c1ac2f7a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool IS_CPU_SUPPORT_AVX512BW;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+    unsigned max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    IS_CPU_SUPPORT_AVX512BW = false;
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+         /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            __cpuid_count(7, 0, a, b, c, d);
+           /* 0xe6:
+            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+            *                    and ZMM16-ZMM31 state are enabled by OS)
+            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+            */
+            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+                IS_CPU_SUPPORT_AVX512BW = true;
+            }
+        }
+    }
+    return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
     /* XBZRLE encoding (if there is no overflow) */
+    #if defined(CONFIG_AVX512BW_OPT)
+    if (likely(IS_CPU_SUPPORT_AVX512BW)) {
+        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
+                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+                                               TARGET_PAGE_SIZE);
+    } else {
+        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
+                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+                                           TARGET_PAGE_SIZE);
+    }
+    #else
     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                        TARGET_PAGE_SIZE);
+    #endif
 
     /*
      * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..4db09fdbdb 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 
     return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+
+#include <immintrin.h>
+#include <math.h>
+#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
+int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                             uint8_t *dst, int dlen)
+{
+    uint32_t zrun_len = 0, nzrun_len = 0;
+    int d = 0, i = 0, num = 0;
+    uint8_t *nzrun_start = NULL;
+    int count512s = (slen >> 6);
+    int res = slen % 64;
+    bool never_same = true;
+    while (count512s--) {
+        if (d + 2 > dlen) {
+            return -1;
+        }
+        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
+                               0xffffffffffffffff, old_buf + i);
+        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
+                                                 0xffffffffffffffff, new_buf + i);
+        /* in mask bit 1 for same, 0 for diff */
+        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+
+        int bytesToCheck = 64;
+        bool is_same = (comp & 0x1);
+        while (bytesToCheck) {
+            if (is_same) {
+                if (nzrun_len) {
+                    d += uleb128_encode_small(dst + d, nzrun_len);
+                    if (d + nzrun_len > dlen) {
+                        return -1;
+                    }
+                    nzrun_start = new_buf + i - nzrun_len;
+                    memcpy(dst + d, nzrun_start, nzrun_len);
+                    d += nzrun_len;
+                    nzrun_len = 0;
+                }
+                if (comp == 0xffffffffffffffff) {
+                    i += 64;
+                    zrun_len += 64;
+                    break;
+                }
+                never_same = false;
+                num = __builtin_ctzl(~comp);
+                num = (num < bytesToCheck) ? num : bytesToCheck;
+                zrun_len += num;
+                bytesToCheck -= num;
+                comp >>= num;
+                i += num;
+                if (bytesToCheck) {
+                    /* still has different data after same data */
+                    d += uleb128_encode_small(dst + d, zrun_len);
+                    zrun_len = 0;
+                } else {
+                    break;
+                }
+            }
+            if (never_same || zrun_len) {
+                /*
+                 * never_same only acts if
+                 * data begins with diff in first count512s
+                 */
+                d += uleb128_encode_small(dst + d, zrun_len);
+                zrun_len = 0;
+                never_same = false;
+            }
+            /* has diff */
+            if ((bytesToCheck == 64) && (comp == 0x0)) {
+                i += 64;
+                nzrun_len += 64;
+                break;
+            }
+            num = __builtin_ctzl(comp);
+            num = (num < bytesToCheck) ? num : bytesToCheck;
+            nzrun_len += num;
+            bytesToCheck -= num;
+            comp >>= num;
+            i += num;
+            if (bytesToCheck) {
+                /* mask like 111000 */
+                d += uleb128_encode_small(dst + d, nzrun_len);
+                /* overflow */
+                if (d + nzrun_len > dlen) {
+                    return -1;
+                }
+                nzrun_start = new_buf + i - nzrun_len;
+                memcpy(dst + d, nzrun_start, nzrun_len);
+                d += nzrun_len;
+                nzrun_len = 0;
+                is_same = true;
+            }
+        }
+    }
+    if (res) {
+        /* the number of data is less than 64 */
+        unsigned long long mask = pow(2, res);
+        mask -= 1;
+        __m512i r = SET_ZERO512(r);
+        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
+        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
+        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+
+        int bytesToCheck = res;
+        bool is_same = (comp & 0x1);
+        while (bytesToCheck) {
+            if (is_same) {
+                if (nzrun_len) {
+                    d += uleb128_encode_small(dst + d, nzrun_len);
+                    if (d + nzrun_len > dlen) {
+                        return -1;
+                    }
+                    nzrun_start = new_buf + i - nzrun_len;
+                    memcpy(dst + d, nzrun_start, nzrun_len);
+                    d += nzrun_len;
+                    nzrun_len = 0;
+                }
+                never_same = false;
+                num = __builtin_ctzl(~comp);
+                num = (num < bytesToCheck) ? num : bytesToCheck;
+                zrun_len += num;
+                bytesToCheck -= num;
+                comp >>= num;
+                i += num;
+                if (bytesToCheck) {
+                    /* diff after same */
+                    d += uleb128_encode_small(dst + d, zrun_len);
+                    zrun_len = 0;
+                } else {
+                    break;
+                }
+            }
+
+            if (never_same || zrun_len) {
+                d += uleb128_encode_small(dst + d, zrun_len);
+                zrun_len = 0;
+                never_same = false;
+            }
+            /* has diff */
+            num = __builtin_ctzl(comp);
+            num = (num < bytesToCheck) ? num : bytesToCheck;
+            nzrun_len += num;
+            bytesToCheck -= num;
+            comp >>= num;
+            i += num;
+            if (bytesToCheck) {
+                d += uleb128_encode_small(dst + d, nzrun_len);
+                /* overflow */
+                if (d + nzrun_len > dlen) {
+                    return -1;
+                }
+                nzrun_start = new_buf + i - nzrun_len;
+                memcpy(dst + d, nzrun_start, nzrun_len);
+                d += nzrun_len;
+                nzrun_len = 0;
+                is_same = true;
+            }
+        }
+    }
+
+    if (zrun_len) {
+        return (zrun_len == slen) ? 0 : d;
+    }
+    if (nzrun_len != 0) {
+        d += uleb128_encode_small(dst + d, nzrun_len);
+        /* overflow */
+        if (d + nzrun_len > dlen) {
+            return -1;
+        }
+        nzrun_start = new_buf + i - nzrun_len;
+        memcpy(dst + d, nzrun_start, nzrun_len);
+        d += nzrun_len;
+    }
+    return d;
+}
+#pragma GCC pop_options
+#endif
\ No newline at end of file
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
index a0db507b9c..6247de5f00 100644
--- a/migration/xbzrle.h
+++ b/migration/xbzrle.h
@@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                          uint8_t *dst, int dlen);
 
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+#if defined(CONFIG_AVX512BW_OPT)
+int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                             uint8_t *dst, int dlen);
+#endif
 #endif
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
  2022-08-08  7:48 [PATCH v3 0/2] This patch updates runtime check of AVX512 ling xu
  2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
@ 2022-08-08  7:48 ` ling xu
  2022-08-08  8:08   ` Thomas Huth
  2022-08-09 18:30   ` Richard Henderson
  2022-08-08 11:54 ` [PATCH v3 0/2] This patch updates runtime check of AVX512 Juan Quintela
  2 siblings, 2 replies; 14+ messages in thread
From: ling xu @ 2022-08-08  7:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 290 insertions(+), 17 deletions(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..653016826f 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -38,111 +38,280 @@ static void test_uleb(void)
     g_assert(val == 0);
 }
 
-static void test_encode_decode_zero(void)
+static float *test_encode_decode_zero(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
     int i = 0;
-    int dlen = 0;
+    int dlen = 0, dlen512 = 0;
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         buffer[1000 + i] = i;
+        buffer512[1000 + i] = i;
     }
 
     buffer[1000 + diff_len + 3] = 103;
     buffer[1000 + diff_len + 5] = 105;
 
+    buffer512[1000 + diff_len + 3] = 103;
+    buffer512[1000 + diff_len + 5] = 105;
+
     /* encode zero page */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    static float result_zero[2];
+    result_zero[0] = time_val;
+    result_zero[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
+    g_free(buffer512);
+    g_free(compressed512);
+
+    return result_zero;
+}
+
+static void test_encode_decode_zero_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_zero();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Zero test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void test_encode_decode_unchanged(void)
+static float *test_encode_decode_unchanged(void)
 {
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
     int i = 0;
-    int dlen = 0;
+    int dlen = 0, dlen512 = 0;
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         test[1000 + i] = i + 4;
+        test512[1000 + i] = i + 4;
     }
 
     test[1000 + diff_len + 3] = 107;
     test[1000 + diff_len + 5] = 109;
 
+    test512[1000 + diff_len + 3] = 107;
+    test512[1000 + diff_len + 5] = 109;
+
     /* test unchanged buffer */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    static float result_unchanged[2];
+    result_unchanged[0] = time_val;
+    result_unchanged[1] = time_val512;
+
     g_free(test);
     g_free(compressed);
+    g_free(test512);
+    g_free(compressed512);
+
+    return result_unchanged;
 }
 
-static void test_encode_decode_1_byte(void)
+static void test_encode_decode_unchanged_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_unchanged();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Unchanged test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_1_byte(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
-    int dlen = 0, rc = 0;
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0;
     uint8_t buf[2];
+    uint8_t buf512[2];
 
     test[XBZRLE_PAGE_SIZE - 1] = 1;
+    test512[XBZRLE_PAGE_SIZE - 1] = 1;
 
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
 
     rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
     g_assert(rc == XBZRLE_PAGE_SIZE);
     g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2));
+
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512,
+                                 XBZRLE_PAGE_SIZE);
+    g_assert(rc512 == XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    static float result_1_byte[2];
+    result_1_byte[0] = time_val;
+    result_1_byte[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_1_byte;
 }
 
-static void test_encode_decode_overflow(void)
+static void test_encode_decode_1_byte_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_1_byte();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("1 byte test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_overflow(void)
 {
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
-    int i = 0, rc = 0;
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
 
     for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) {
         test[i * 2] = 1;
+        test512[i * 2] = 1;
     }
 
     /* encode overflow */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                               XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(rc == -1);
 
+    t_start512 = clock();
+    rc512 = xbzrle_encode_buffer_512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                     compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(rc512 == -1);
+
+    static float result_overflow[2];
+    result_overflow[0] = time_val;
+    result_overflow[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_overflow;
+}
+
+static void test_encode_decode_overflow_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_overflow();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Overflow test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void encode_decode_range(void)
+static float *encode_decode_range(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
-    int i = 0, rc = 0;
-    int dlen = 0;
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
 
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         buffer[1000 + i] = i;
         test[1000 + i] = i + 4;
+        buffer512[1000 + i] = i;
+        test512[1000 + i] = i + 4;
     }
 
     buffer[1000 + diff_len + 3] = 103;
@@ -151,26 +320,129 @@ static void encode_decode_range(void)
     buffer[1000 + diff_len + 5] = 105;
     test[1000 + diff_len + 5] = 109;
 
+    buffer512[1000 + diff_len + 3] = 103;
+    test512[1000 + diff_len + 3] = 107;
+
+    buffer512[1000 + diff_len + 5] = 105;
+    test512[1000 + diff_len + 5] = 109;
+
     /* test encode/decode */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
-
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
     g_assert(rc < XBZRLE_PAGE_SIZE);
     g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    static float result_range[2];
+    result_range[0] = time_val;
+    result_range[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_range;
 }
 
 static void test_encode_decode(void)
 {
     int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = encode_decode_range();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Encode decode test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
 
+static float *encode_decode_random(void)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
+
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    /* store the index of diff */
+    int dirty_index[diff_len];
+    for (int j = 0; j < diff_len; j++) {
+        dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    }
+    for (i = diff_len - 1; i >= 0; i--) {
+        buffer[dirty_index[i]] = i;
+        test[dirty_index[i]] = i + 4;
+        buffer512[dirty_index[i]] = i;
+        test512[dirty_index[i]] = i + 4;
+    }
+
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
+                                XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
+    g_assert(rc < XBZRLE_PAGE_SIZE);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+
+    static float result_random[2];
+    result_random[0] = time_val;
+    result_random[1] = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_random;
+}
+
+static void test_encode_decode_random(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
     for (i = 0; i < 10000; i++) {
-        encode_decode_range();
+        res = encode_decode_random();
+        time_raw += res[0];
+        time_512 += res[1];
     }
+    printf("Random test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
 int main(int argc, char **argv)
@@ -178,13 +450,14 @@ int main(int argc, char **argv)
     g_test_init(&argc, &argv, NULL);
     g_test_rand_int();
     g_test_add_func("/xbzrle/uleb", test_uleb);
-    g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero);
+    g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_range);
     g_test_add_func("/xbzrle/encode_decode_unchanged",
-                    test_encode_decode_unchanged);
-    g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte);
+                    test_encode_decode_unchanged_range);
+    g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_range);
     g_test_add_func("/xbzrle/encode_decode_overflow",
-                    test_encode_decode_overflow);
+                    test_encode_decode_overflow_range);
     g_test_add_func("/xbzrle/encode_decode", test_encode_decode);
+    g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random);
 
     return g_test_run();
 }
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
  2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
@ 2022-08-08  8:08   ` Thomas Huth
  2022-08-08  8:30     ` Xu, Ling1
  2022-08-09 18:30   ` Richard Henderson
  1 sibling, 1 reply; 14+ messages in thread
From: Thomas Huth @ 2022-08-08  8:08 UTC (permalink / raw)
  To: ling xu, qemu-devel; +Cc: quintela, dgilbert, Zhou Zhao, Jun Jin

On 08/08/2022 09.48, ling xu wrote:
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>   tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
>   1 file changed, 290 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
> index ef951b6e54..653016826f 100644
> --- a/tests/unit/test-xbzrle.c
> +++ b/tests/unit/test-xbzrle.c
> @@ -38,111 +38,280 @@ static void test_uleb(void)
>       g_assert(val == 0);
>   }
>   
> -static void test_encode_decode_zero(void)
> +static float *test_encode_decode_zero(void)
>   {
>       uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
>       uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
>       int i = 0;
> -    int dlen = 0;
> +    int dlen = 0, dlen512 = 0;
>       int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
>   
>       for (i = diff_len; i > 0; i--) {
>           buffer[1000 + i] = i;
> +        buffer512[1000 + i] = i;
>       }
>   
>       buffer[1000 + diff_len + 3] = 103;
>       buffer[1000 + diff_len + 5] = 105;
>   
> +    buffer512[1000 + diff_len + 3] = 103;
> +    buffer512[1000 + diff_len + 5] = 105;
> +
>       /* encode zero page */
> +    time_t t_start, t_end, t_start512, t_end512;
> +    t_start = clock();
>       dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
>                          XBZRLE_PAGE_SIZE);
> +    t_end = clock();
> +    float time_val = difftime(t_end, t_start);
>       g_assert(dlen == 0);
>   
> +    t_start512 = clock();
> +    dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
> +                                       compressed512, XBZRLE_PAGE_SIZE);

Does this also still work on systems without AVX? If I've got patch 1/2 
right, this function is only defined if CONFIG_AVX512BW_OPT has been set, so 
using it unconditionally here seems to be wrong?

  Thomas



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
  2022-08-08  8:08   ` Thomas Huth
@ 2022-08-08  8:30     ` Xu, Ling1
  0 siblings, 0 replies; 14+ messages in thread
From: Xu, Ling1 @ 2022-08-08  8:30 UTC (permalink / raw)
  To: Thomas Huth, qemu-devel; +Cc: quintela, dgilbert, Zhao, Zhou, Jin, Jun I

Hi, Thomas,
      Thanks for your reply. This test code can only work on system supporting avx512. It's reasonably to add condition check in test code to, agree to your suggestion. I'll add condition check in test code later. 

Best Regards
Ling

-----Original Message-----
From: Thomas Huth <thuth@redhat.com> 
Sent: Monday, August 8, 2022 4:09 PM
To: Xu, Ling1 <ling1.xu@intel.com>; qemu-devel@nongnu.org
Cc: quintela@redhat.com; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer

On 08/08/2022 09.48, ling xu wrote:
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>   tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
>   1 file changed, 290 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index 
> ef951b6e54..653016826f 100644
> --- a/tests/unit/test-xbzrle.c
> +++ b/tests/unit/test-xbzrle.c
> @@ -38,111 +38,280 @@ static void test_uleb(void)
>       g_assert(val == 0);
>   }
>   
> -static void test_encode_decode_zero(void)
> +static float *test_encode_decode_zero(void)
>   {
>       uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
>       uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
>       int i = 0;
> -    int dlen = 0;
> +    int dlen = 0, dlen512 = 0;
>       int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 
> 1006);
>   
>       for (i = diff_len; i > 0; i--) {
>           buffer[1000 + i] = i;
> +        buffer512[1000 + i] = i;
>       }
>   
>       buffer[1000 + diff_len + 3] = 103;
>       buffer[1000 + diff_len + 5] = 105;
>   
> +    buffer512[1000 + diff_len + 3] = 103;
> +    buffer512[1000 + diff_len + 5] = 105;
> +
>       /* encode zero page */
> +    time_t t_start, t_end, t_start512, t_end512;
> +    t_start = clock();
>       dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
>                          XBZRLE_PAGE_SIZE);
> +    t_end = clock();
> +    float time_val = difftime(t_end, t_start);
>       g_assert(dlen == 0);
>   
> +    t_start512 = clock();
> +    dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
> +                                       compressed512, 
> + XBZRLE_PAGE_SIZE);

Does this also still work on systems without AVX? If I've got patch 1/2 right, this function is only defined if CONFIG_AVX512BW_OPT has been set, so using it unconditionally here seems to be wrong?

  Thomas


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 0/2] This patch updates runtime check of AVX512
  2022-08-08  7:48 [PATCH v3 0/2] This patch updates runtime check of AVX512 ling xu
  2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
  2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
@ 2022-08-08 11:54 ` Juan Quintela
  2022-08-09  1:19   ` Xu, Ling1
  2 siblings, 1 reply; 14+ messages in thread
From: Juan Quintela @ 2022-08-08 11:54 UTC (permalink / raw)
  To: ling xu; +Cc: qemu-devel, dgilbert

ling xu <ling1.xu@intel.com> wrote:
> This patch updates runtime check of AVX512 and update avx512 support for
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
>
> The runtime check is updated in meson.build and meson_options.txt.
>
> The updated AVX512 algorithm is provided in ram.c, xbzrle.c and
> xbzrle.h.
>
> The test code is provided in test-xbzrle.c.
>
> Previous discussion is refered below:
> https://www.mail-archive.com/qemu-devel@nongnu.org/msg903520.html
>
> ling xu (2):
>   Update AVX512 support for xbzrle_encode_buffer function
>   Test code for AVX512 support for xbzrle_encode_buffer

I think this v3 and previous v3 are identical except for mthe link to
the previous discussion.

Later, Juan.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
@ 2022-08-08 13:12   ` Juan Quintela
  2022-08-09  7:51     ` Xu, Ling1
  2022-08-09 18:41   ` Richard Henderson
  1 sibling, 1 reply; 14+ messages in thread
From: Juan Quintela @ 2022-08-08 13:12 UTC (permalink / raw)
  To: ling xu; +Cc: qemu-devel, dgilbert, Zhou Zhao, Jun Jin

ling xu <ling1.xu@intel.com> wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  41 ++++++++++
>  migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>         description: 'AVX2 optimizations')
>  option('avx512f', type: 'feature', value: 'disabled',
>         description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
>         description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am
not sure the percentage of machines that support avx

> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }

the else part is the same than the #else part
> +    #else
>      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                         TARGET_PAGE_SIZE);
> +    #endif

So, why don't just create a new function pointer:

int (*xbzrle_encode_buffer_func)(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                 uint8_t *dst, int dlen) = xbzrle_encode_buffer;


And aad into init_cpu_flag() something in the line of:

	xbzrle_encode_buffer_func = xbrrle_encode_buffer_512;

?


>      /*
>       * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{

This is just personal taste, but I would rename this to:

xbzrle_encode_buffer_avx512?

> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;

res variable here means residual, normally we use "res" with meaning of
"result" in qemu.

> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Not your fault.

21st century.  Someone still use long long in a new API, sniff.

> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +
> +    if (zrun_len) {
> +        return (zrun_len == slen) ? 0 : d;
> +    }
> +    if (nzrun_len != 0) {
> +        d += uleb128_encode_small(dst + d, nzrun_len);
> +        /* overflow */
> +        if (d + nzrun_len > dlen) {
> +            return -1;
> +        }
> +        nzrun_start = new_buf + i - nzrun_len;
> +        memcpy(dst + d, nzrun_start, nzrun_len);
> +        d += nzrun_len;
> +    }
> +    return d;
> +}
> +#pragma GCC pop_options
> +#endif
> \ No newline at end of file
> diff --git a/migration/xbzrle.h b/migration/xbzrle.h
> index a0db507b9c..6247de5f00 100644
> --- a/migration/xbzrle.h
> +++ b/migration/xbzrle.h
> @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
>                           uint8_t *dst, int dlen);
>  
>  int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
> +#if defined(CONFIG_AVX512BW_OPT)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen);
> +#endif
>  #endif



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v3 0/2] This patch updates runtime check of AVX512
  2022-08-08 11:54 ` [PATCH v3 0/2] This patch updates runtime check of AVX512 Juan Quintela
@ 2022-08-09  1:19   ` Xu, Ling1
  0 siblings, 0 replies; 14+ messages in thread
From: Xu, Ling1 @ 2022-08-09  1:19 UTC (permalink / raw)
  To: quintela; +Cc: qemu-devel, dgilbert

Hi, Juan, 
     You are right, this v3 and previous v3 are identical except the link to previous discussion. The previous [patch v3 0/2] was sent failed as shown in my mail, so I resend this patch. Sorry for the ambiguity of resending same patch, and thanks for your time ~

Best Regards
Ling

-----Original Message-----
From: Juan Quintela <quintela@redhat.com> 
Sent: Monday, August 8, 2022 7:54 PM
To: Xu, Ling1 <ling1.xu@intel.com>
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com
Subject: Re: [PATCH v3 0/2] This patch updates runtime check of AVX512

ling xu <ling1.xu@intel.com> wrote:
> This patch updates runtime check of AVX512 and update avx512 support 
> for xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
>
> The runtime check is updated in meson.build and meson_options.txt.
>
> The updated AVX512 algorithm is provided in ram.c, xbzrle.c and 
> xbzrle.h.
>
> The test code is provided in test-xbzrle.c.
>
> Previous discussion is refered below:
> https://www.mail-archive.com/qemu-devel@nongnu.org/msg903520.html
>
> ling xu (2):
>   Update AVX512 support for xbzrle_encode_buffer function
>   Test code for AVX512 support for xbzrle_encode_buffer

I think this v3 and previous v3 are identical except for mthe link to the previous discussion.

Later, Juan.



^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-08 13:12   ` Juan Quintela
@ 2022-08-09  7:51     ` Xu, Ling1
  2022-08-09 18:25       ` Richard Henderson
  0 siblings, 1 reply; 14+ messages in thread
From: Xu, Ling1 @ 2022-08-09  7:51 UTC (permalink / raw)
  To: quintela; +Cc: qemu-devel, dgilbert, Zhao, Zhou, Jin, Jun I

Hi, Juan, 
      Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable; 2) use a function pointer to simplify code in ram.c; 3) change function name "xbzrle_encode_buffer_512" to "xbzrle_encode_buffer_avx512", change variable "res" to "countResidual" for better understanding, and replace "unsigned long long" with "uint64_t". 
       We will submit patch v4 to fix all issues mentioned in comments. 

Best Regard,
Ling

-----Original Message-----
From: Juan Quintela <quintela@redhat.com> 
Sent: Monday, August 8, 2022 9:12 PM
To: Xu, Ling1 <ling1.xu@intel.com>
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

ling xu <ling1.xu@intel.com> wrote:
> This commit update runtime check of AVX512, and implements avx512 of 
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 
> version can achieve almost 60%-70% performance improvement on unit 
> test provided by Qemu. In addition, we provide one more unit test 
> called "test_encode_decode_random", in which dirty data are randomly 
> located in 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  41 ++++++++++
>  migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 
> 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, 
> +cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }  '''), 
> + error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt index 
> e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>         description: 'AVX2 optimizations')  option('avx512f', type: 
> 'feature', value: 'disabled',
>         description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
>         description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c index 
> dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void) {
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page @@ -802,9 +831,21 
> @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am not sure the percentage of machines that support avx

> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }

the else part is the same than the #else part
> +    #else
>      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                         TARGET_PAGE_SIZE);
> +    #endif

So, why don't just create a new function pointer:

int (*xbzrle_encode_buffer_func)(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                 uint8_t *dst, int dlen) = xbzrle_encode_buffer;


And aad into init_cpu_flag() something in the line of:

	xbzrle_encode_buffer_func = xbrrle_encode_buffer_512;

?


>      /*
>       * Update the cache contents, so that it corresponds to the data 
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 
> 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, 
> uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0) int 
> +xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen) {

This is just personal taste, but I would rename this to:

xbzrle_encode_buffer_avx512?

> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;

res variable here means residual, normally we use "res" with meaning of "result" in qemu.

> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Not your fault.

21st century.  Someone still use long long in a new API, sniff.

> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +
> +    if (zrun_len) {
> +        return (zrun_len == slen) ? 0 : d;
> +    }
> +    if (nzrun_len != 0) {
> +        d += uleb128_encode_small(dst + d, nzrun_len);
> +        /* overflow */
> +        if (d + nzrun_len > dlen) {
> +            return -1;
> +        }
> +        nzrun_start = new_buf + i - nzrun_len;
> +        memcpy(dst + d, nzrun_start, nzrun_len);
> +        d += nzrun_len;
> +    }
> +    return d;
> +}
> +#pragma GCC pop_options
> +#endif
> \ No newline at end of file
> diff --git a/migration/xbzrle.h b/migration/xbzrle.h index 
> a0db507b9c..6247de5f00 100644
> --- a/migration/xbzrle.h
> +++ b/migration/xbzrle.h
> @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
>                           uint8_t *dst, int dlen);
>  
>  int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int 
> dlen);
> +#if defined(CONFIG_AVX512BW_OPT)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen); #endif
>  #endif



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-09  7:51     ` Xu, Ling1
@ 2022-08-09 18:25       ` Richard Henderson
  2022-08-11  7:23         ` Xu, Ling1
  0 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2022-08-09 18:25 UTC (permalink / raw)
  To: Xu, Ling1, quintela; +Cc: qemu-devel, dgilbert, Zhao, Zhou, Jin, Jun I

On 8/9/22 00:51, Xu, Ling1 wrote:
> Hi, Juan,
>        Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable;

You can remove this variable entirely...

> 2) use a function pointer to simplify code in ram.c;

... because it's redundant with the function pointer.


r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
  2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
  2022-08-08  8:08   ` Thomas Huth
@ 2022-08-09 18:30   ` Richard Henderson
  1 sibling, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2022-08-09 18:30 UTC (permalink / raw)
  To: ling xu, qemu-devel; +Cc: quintela, dgilbert, Zhou Zhao, Jun Jin

On 8/8/22 00:48, ling xu wrote:
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>   tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
>   1 file changed, 290 insertions(+), 17 deletions(-)
> 
> diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
> index ef951b6e54..653016826f 100644
> --- a/tests/unit/test-xbzrle.c
> +++ b/tests/unit/test-xbzrle.c
> @@ -38,111 +38,280 @@ static void test_uleb(void)
>       g_assert(val == 0);
>   }
>   
> -static void test_encode_decode_zero(void)
> +static float *test_encode_decode_zero(void)
>   {
>       uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
>       uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
> +    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
>       int i = 0;
> -    int dlen = 0;
> +    int dlen = 0, dlen512 = 0;
>       int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
>   
>       for (i = diff_len; i > 0; i--) {
>           buffer[1000 + i] = i;
> +        buffer512[1000 + i] = i;
>       }
>   
>       buffer[1000 + diff_len + 3] = 103;
>       buffer[1000 + diff_len + 5] = 105;
>   
> +    buffer512[1000 + diff_len + 3] = 103;
> +    buffer512[1000 + diff_len + 5] = 105;
> +
>       /* encode zero page */
> +    time_t t_start, t_end, t_start512, t_end512;
> +    t_start = clock();
>       dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
>                          XBZRLE_PAGE_SIZE);
> +    t_end = clock();
> +    float time_val = difftime(t_end, t_start);
>       g_assert(dlen == 0);
>   
> +    t_start512 = clock();
> +    dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
> +                                       compressed512, XBZRLE_PAGE_SIZE);
> +    t_end512 = clock();
> +    float time_val512 = difftime(t_end512, t_start512);
> +    g_assert(dlen512 == 0);
> +
> +    static float result_zero[2];
> +    result_zero[0] = time_val;
> +    result_zero[1] = time_val512;
> +
>       g_free(buffer);
>       g_free(compressed);
> +    g_free(buffer512);
> +    g_free(compressed512);
> +
> +    return result_zero;
> +}

Why are you returning a pointer to static storage?
I'll note that this isn't so much "testing" as "benchmarking".

Does the speedup from using 512-bit vectors make up for the clock slowdown that is 
enforced on the cpu cluster?  As far as I know, it is still quite rare for avx512 to 
actually pay off.

I suggest you model testing on test_buffer_is_zero_next_accel().



r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
  2022-08-08 13:12   ` Juan Quintela
@ 2022-08-09 18:41   ` Richard Henderson
  1 sibling, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2022-08-09 18:41 UTC (permalink / raw)
  To: ling xu, qemu-devel; +Cc: quintela, dgilbert, Zhou Zhao, Jun Jin

On 8/8/22 00:48, ling xu wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
> 
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>   meson.build        |  16 ++++
>   meson_options.txt  |   2 +
>   migration/ram.c    |  41 ++++++++++
>   migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>   migration/xbzrle.h |   4 +
>   5 files changed, 244 insertions(+)
> 
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>       int main(int argc, char *argv[]) { return bar(argv[0]); }
>     '''), error_message: 'AVX512F not available').allowed())
>   
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>   have_pvrdma = get_option('pvrdma') \
>     .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>     .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>          description: 'AVX2 optimizations')
>   option('avx512f', type: 'feature', value: 'disabled',
>          description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>   option('keyring', type: 'feature', value: 'auto',
>          description: 'Linux keyring support')
>   
> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>   /* 0x80 is reserved in migration.h start with 0x100 next */
>   #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>   
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;
> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>   XBZRLECacheStats xbzrle_counters;
>   
>   /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>       memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>   
>       /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {
> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }
> +    #else
>       encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                          TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                          TARGET_PAGE_SIZE);
> +    #endif
>   
>       /*
>        * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>   
>       return d;
>   }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{
> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;
> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Um, what?  This is a stupid version of "1ull << res".


> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {

Why have you unrolled this from the main loop?  That's the major advantage of using 
predicate registers, being able to fold the head (and/or tail) into the same loop.

> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);

Type error -- ctzl used with long long (which should be uint64_t).
You should be using ctz64().

> +                num = (num < bytesToCheck) ? num : bytesToCheck;

Why this test?  Don't you already know that ~comp != 0?

> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }

More generally, what benefit are you *really* getting out of avx512?  You're doing 
predicated loads and compares, but they're strictly length-based.  Then you're using the 
result of the comparison in serial.  I really can't imagine this being efficient at all.


r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
  2022-08-09 18:25       ` Richard Henderson
@ 2022-08-11  7:23         ` Xu, Ling1
  0 siblings, 0 replies; 14+ messages in thread
From: Xu, Ling1 @ 2022-08-11  7:23 UTC (permalink / raw)
  To: Richard Henderson, quintela; +Cc: qemu-devel, dgilbert, Zhao, Zhou, Jin, Jun I

Hi, Richard,
      Thanks for your nice comments! Your suggestions are very helpful. We have revised code in ram.c according to your comments. As for "unroll residual from main loop" problem in algorithm, we will fix this later. Thanks for your time and patience~

Best Regards,
Ling

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Wednesday, August 10, 2022 2:25 AM
To: Xu, Ling1 <ling1.xu@intel.com>; quintela@redhat.com
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

On 8/9/22 00:51, Xu, Ling1 wrote:
> Hi, Juan,
>        Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable;

You can remove this variable entirely...

> 2) use a function pointer to simplify code in ram.c;

... because it's redundant with the function pointer.


r~

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
  2022-08-08  7:34 ling xu
@ 2022-08-08  7:34 ` ling xu
  0 siblings, 0 replies; 14+ messages in thread
From: ling xu @ 2022-08-08  7:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: quintela, dgilbert, ling xu, Zhou Zhao, Jun Jin

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 tests/unit/test-xbzrle.c | 307 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 290 insertions(+), 17 deletions(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..653016826f 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -38,111 +38,280 @@ static void test_uleb(void)
     g_assert(val == 0);
 }
 
-static void test_encode_decode_zero(void)
+static float *test_encode_decode_zero(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
     int i = 0;
-    int dlen = 0;
+    int dlen = 0, dlen512 = 0;
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         buffer[1000 + i] = i;
+        buffer512[1000 + i] = i;
     }
 
     buffer[1000 + diff_len + 3] = 103;
     buffer[1000 + diff_len + 5] = 105;
 
+    buffer512[1000 + diff_len + 3] = 103;
+    buffer512[1000 + diff_len + 5] = 105;
+
     /* encode zero page */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    static float result_zero[2];
+    result_zero[0] = time_val;
+    result_zero[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
+    g_free(buffer512);
+    g_free(compressed512);
+
+    return result_zero;
+}
+
+static void test_encode_decode_zero_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_zero();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Zero test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void test_encode_decode_unchanged(void)
+static float *test_encode_decode_unchanged(void)
 {
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
     int i = 0;
-    int dlen = 0;
+    int dlen = 0, dlen512 = 0;
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         test[1000 + i] = i + 4;
+        test512[1000 + i] = i + 4;
     }
 
     test[1000 + diff_len + 3] = 107;
     test[1000 + diff_len + 5] = 109;
 
+    test512[1000 + diff_len + 3] = 107;
+    test512[1000 + diff_len + 5] = 109;
+
     /* test unchanged buffer */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == 0);
+
+    static float result_unchanged[2];
+    result_unchanged[0] = time_val;
+    result_unchanged[1] = time_val512;
+
     g_free(test);
     g_free(compressed);
+    g_free(test512);
+    g_free(compressed512);
+
+    return result_unchanged;
 }
 
-static void test_encode_decode_1_byte(void)
+static void test_encode_decode_unchanged_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_unchanged();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Unchanged test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_1_byte(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
-    int dlen = 0, rc = 0;
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    int dlen = 0, rc = 0, dlen512 = 0, rc512 = 0;
     uint8_t buf[2];
+    uint8_t buf512[2];
 
     test[XBZRLE_PAGE_SIZE - 1] = 1;
+    test512[XBZRLE_PAGE_SIZE - 1] = 1;
 
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                        XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(dlen == (uleb128_encode_small(&buf[0], 4095) + 2));
 
     rc = xbzrle_decode_buffer(compressed, dlen, buffer, XBZRLE_PAGE_SIZE);
     g_assert(rc == XBZRLE_PAGE_SIZE);
     g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(dlen512 == (uleb128_encode_small(&buf512[0], 4095) + 2));
+
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, buffer512,
+                                 XBZRLE_PAGE_SIZE);
+    g_assert(rc512 == XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    static float result_1_byte[2];
+    result_1_byte[0] = time_val;
+    result_1_byte[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_1_byte;
 }
 
-static void test_encode_decode_overflow(void)
+static void test_encode_decode_1_byte_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_1_byte();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("1 byte test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_overflow(void)
 {
     uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
-    int i = 0, rc = 0;
+    uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
 
     for (i = 0; i < XBZRLE_PAGE_SIZE / 2 - 1; i++) {
         test[i * 2] = 1;
+        test512[i * 2] = 1;
     }
 
     /* encode overflow */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     rc = xbzrle_encode_buffer(buffer, test, XBZRLE_PAGE_SIZE, compressed,
                               XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     g_assert(rc == -1);
 
+    t_start512 = clock();
+    rc512 = xbzrle_encode_buffer_512(buffer512, test512, XBZRLE_PAGE_SIZE,
+                                     compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    g_assert(rc512 == -1);
+
+    static float result_overflow[2];
+    result_overflow[0] = time_val;
+    result_overflow[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_overflow;
+}
+
+static void test_encode_decode_overflow_range(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = test_encode_decode_overflow();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Overflow test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void encode_decode_range(void)
+static float *encode_decode_range(void)
 {
     uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
     uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
     uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
-    int i = 0, rc = 0;
-    int dlen = 0;
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
 
     int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
     for (i = diff_len; i > 0; i--) {
         buffer[1000 + i] = i;
         test[1000 + i] = i + 4;
+        buffer512[1000 + i] = i;
+        test512[1000 + i] = i + 4;
     }
 
     buffer[1000 + diff_len + 3] = 103;
@@ -151,26 +320,129 @@ static void encode_decode_range(void)
     buffer[1000 + diff_len + 5] = 105;
     test[1000 + diff_len + 5] = 109;
 
+    buffer512[1000 + diff_len + 3] = 103;
+    test512[1000 + diff_len + 3] = 107;
+
+    buffer512[1000 + diff_len + 5] = 105;
+    test512[1000 + diff_len + 5] = 109;
+
     /* test encode/decode */
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
     dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
                                 XBZRLE_PAGE_SIZE);
-
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
     rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
     g_assert(rc < XBZRLE_PAGE_SIZE);
     g_assert(memcmp(test, buffer, XBZRLE_PAGE_SIZE) == 0);
 
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+    g_assert(memcmp(test512, buffer512, XBZRLE_PAGE_SIZE) == 0);
+
+    static float result_range[2];
+    result_range[0] = time_val;
+    result_range[1] = time_val512;
+
     g_free(buffer);
     g_free(compressed);
     g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_range;
 }
 
 static void test_encode_decode(void)
 {
     int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
+    for (i = 0; i < 10000; i++) {
+        res = encode_decode_range();
+        time_raw += res[0];
+        time_512 += res[1];
+    }
+    printf("Encode decode test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
 
+static float *encode_decode_random(void)
+{
+    uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    uint8_t *compressed512 = g_malloc(XBZRLE_PAGE_SIZE);
+    uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+    int i = 0, rc = 0, rc512 = 0;
+    int dlen = 0, dlen512 = 0;
+
+    int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    /* store the index of diff */
+    int dirty_index[diff_len];
+    for (int j = 0; j < diff_len; j++) {
+        dirty_index[j] = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1);
+    }
+    for (i = diff_len - 1; i >= 0; i--) {
+        buffer[dirty_index[i]] = i;
+        test[dirty_index[i]] = i + 4;
+        buffer512[dirty_index[i]] = i;
+        test512[dirty_index[i]] = i + 4;
+    }
+
+    time_t t_start, t_end, t_start512, t_end512;
+    t_start = clock();
+    dlen = xbzrle_encode_buffer(test, buffer, XBZRLE_PAGE_SIZE, compressed,
+                                XBZRLE_PAGE_SIZE);
+    t_end = clock();
+    float time_val = difftime(t_end, t_start);
+    rc = xbzrle_decode_buffer(compressed, dlen, test, XBZRLE_PAGE_SIZE);
+    g_assert(rc < XBZRLE_PAGE_SIZE);
+
+    t_start512 = clock();
+    dlen512 = xbzrle_encode_buffer_512(test512, buffer512, XBZRLE_PAGE_SIZE,
+                                       compressed512, XBZRLE_PAGE_SIZE);
+    t_end512 = clock();
+    float time_val512 = difftime(t_end512, t_start512);
+    rc512 = xbzrle_decode_buffer(compressed512, dlen512, test512, XBZRLE_PAGE_SIZE);
+    g_assert(rc512 < XBZRLE_PAGE_SIZE);
+
+    static float result_random[2];
+    result_random[0] = time_val;
+    result_random[1] = time_val512;
+
+    g_free(buffer);
+    g_free(compressed);
+    g_free(test);
+    g_free(buffer512);
+    g_free(compressed512);
+    g_free(test512);
+
+    return result_random;
+}
+
+static void test_encode_decode_random(void)
+{
+    int i;
+    float time_raw = 0.0, time_512 = 0.0;
+    float *res;
     for (i = 0; i < 10000; i++) {
-        encode_decode_range();
+        res = encode_decode_random();
+        time_raw += res[0];
+        time_512 += res[1];
     }
+    printf("Random test:\n");
+    printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+    printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
 int main(int argc, char **argv)
@@ -178,13 +450,14 @@ int main(int argc, char **argv)
     g_test_init(&argc, &argv, NULL);
     g_test_rand_int();
     g_test_add_func("/xbzrle/uleb", test_uleb);
-    g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero);
+    g_test_add_func("/xbzrle/encode_decode_zero", test_encode_decode_zero_range);
     g_test_add_func("/xbzrle/encode_decode_unchanged",
-                    test_encode_decode_unchanged);
-    g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte);
+                    test_encode_decode_unchanged_range);
+    g_test_add_func("/xbzrle/encode_decode_1_byte", test_encode_decode_1_byte_range);
     g_test_add_func("/xbzrle/encode_decode_overflow",
-                    test_encode_decode_overflow);
+                    test_encode_decode_overflow_range);
     g_test_add_func("/xbzrle/encode_decode", test_encode_decode);
+    g_test_add_func("/xbzrle/encode_decode_random", test_encode_decode_random);
 
     return g_test_run();
 }
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-08-11  7:28 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-08  7:48 [PATCH v3 0/2] This patch updates runtime check of AVX512 ling xu
2022-08-08  7:48 ` [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu
2022-08-08 13:12   ` Juan Quintela
2022-08-09  7:51     ` Xu, Ling1
2022-08-09 18:25       ` Richard Henderson
2022-08-11  7:23         ` Xu, Ling1
2022-08-09 18:41   ` Richard Henderson
2022-08-08  7:48 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu
2022-08-08  8:08   ` Thomas Huth
2022-08-08  8:30     ` Xu, Ling1
2022-08-09 18:30   ` Richard Henderson
2022-08-08 11:54 ` [PATCH v3 0/2] This patch updates runtime check of AVX512 Juan Quintela
2022-08-09  1:19   ` Xu, Ling1
  -- strict thread matches above, loose matches on Subject: below --
2022-08-08  7:34 ling xu
2022-08-08  7:34 ` [PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer ling xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.