From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:41103) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bjqIw-0003VR-Ks for qemu-devel@nongnu.org; Tue, 13 Sep 2016 12:11:30 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1bjqIr-0002Ir-GA for qemu-devel@nongnu.org; Tue, 13 Sep 2016 12:11:25 -0400 Received: from mail-wm0-f68.google.com ([74.125.82.68]:36154) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bjqIr-0002Gx-7K for qemu-devel@nongnu.org; Tue, 13 Sep 2016 12:11:21 -0400 Received: by mail-wm0-f68.google.com with SMTP id b184so3071796wma.3 for ; Tue, 13 Sep 2016 09:11:21 -0700 (PDT) Sender: Paolo Bonzini From: Paolo Bonzini Date: Tue, 13 Sep 2016 18:10:05 +0200 Message-Id: <1473783005-113609-11-git-send-email-pbonzini@redhat.com> In-Reply-To: <1473783005-113609-1-git-send-email-pbonzini@redhat.com> References: <1473783005-113609-1-git-send-email-pbonzini@redhat.com> Subject: [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: rth@twiddle.net From: Richard Henderson Handle alignment of buffers, so that the vector paths can be used more often. Add versions for AVX1 and SSE4.1, both of which have incremental improvements over SSE2. Signed-off-by: Richard Henderson Signed-off-by: Paolo Bonzini --- util/bufferiszero.c | 139 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 43 deletions(-) diff --git a/util/bufferiszero.c b/util/bufferiszero.c index 4d8a8c8..c23589c 100644 --- a/util/bufferiszero.c +++ b/util/bufferiszero.c @@ -26,38 +26,6 @@ #include "qemu/cutils.h" #include "qemu/bswap.h" - -/* vector definitions */ - -extern void link_error(void); - -#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO) \ -static bool NAME(const void *buf, size_t len) \ -{ \ - const void *end = buf + len; \ - do { \ - const VECTYPE *p = buf; \ - VECTYPE t; \ - __builtin_prefetch(buf + SIZE); \ - barrier(); \ - if (SIZE == sizeof(VECTYPE) * 4) { \ - t = (p[0] | p[1]) | (p[2] | p[3]); \ - } else if (SIZE == sizeof(VECTYPE) * 8) { \ - t = p[0] | p[1]; \ - t |= p[2] | p[3]; \ - t |= p[4] | p[5]; \ - t |= p[6] | p[7]; \ - } else { \ - link_error(); \ - } \ - if (unlikely(NONZERO(t))) { \ - return false; \ - } \ - buf += SIZE; \ - } while (buf < end); \ - return true; \ -} - static bool buffer_zero_int(const void *buf, size_t len) { @@ -102,24 +70,110 @@ buffer_zero_int(const void *buf, size_t len) #pragma GCC push_options #pragma GCC target("sse2") #include -#define SSE2_NONZERO(X) \ - (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF) -ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO) + +static bool +buffer_zero_sse2(const void *buf, size_t len) +{ + __m128i t = _mm_loadu_si128(buf); + __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); + __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); + __m128i zero = _mm_setzero_si128(); + + /* Loop over 16-byte aligned blocks of 64. */ + while (likely(p <= e)) { + __builtin_prefetch(p); + t = _mm_cmpeq_epi8(t, zero); + if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } + + /* Finish the aligned tail. */ + t |= e[-3]; + t |= e[-2]; + t |= e[-1]; + + /* Finish the unaligned tail. */ + t |= _mm_loadu_si128(buf + len - 16); + + return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; +} #pragma GCC pop_options #ifdef CONFIG_AVX2_OPT #pragma GCC push_options #pragma GCC target("sse4") #include -#define SSE4_NONZERO(X) !_mm_testz_si128((X), (X)) -ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO) + +static bool +buffer_zero_sse4(const void *buf, size_t len) +{ + __m128i t = _mm_loadu_si128(buf); + __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); + __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); + + /* Loop over 16-byte aligned blocks of 64. */ + while (likely(p <= e)) { + __builtin_prefetch(p); + if (unlikely(!_mm_testz_si128(t, t))) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } + + /* Finish the aligned tail. */ + t |= e[-3]; + t |= e[-2]; + t |= e[-1]; + + /* Finish the unaligned tail. */ + t |= _mm_loadu_si128(buf + len - 16); + + return _mm_testz_si128(t, t); +} #pragma GCC pop_options #pragma GCC push_options #pragma GCC target("avx2") #include -#define AVX2_NONZERO(X) !_mm256_testz_si256((X), (X)) -ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO) + +static bool +buffer_zero_avx2(const void *buf, size_t len) +{ + /* Begin with an unaligned head of 32 bytes. */ + __m256i t = _mm256_loadu_si256(buf); + __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32); + __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32); + + if (likely(p <= e)) { + /* Loop over 32-byte aligned blocks of 128. */ + do { + __builtin_prefetch(p); + if (unlikely(!_mm256_testz_si256(t, t))) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } while (p <= e); + } else { + t |= _mm256_loadu_si256(buf + 32); + if (len <= 128) { + goto last2; + } + } + + /* Finish the last block of 128 unaligned. */ + t |= _mm256_loadu_si256(buf + len - 4 * 32); + t |= _mm256_loadu_si256(buf + len - 3 * 32); + last2: + t |= _mm256_loadu_si256(buf + len - 2 * 32); + t |= _mm256_loadu_si256(buf + len - 1 * 32); + + return _mm256_testz_si256(t, t); +} #pragma GCC pop_options #endif @@ -177,16 +231,15 @@ bool test_buffer_is_zero_next_accel(void) static bool select_accel_fn(const void *buf, size_t len) { - uintptr_t ibuf = (uintptr_t)buf; #ifdef CONFIG_AVX2_OPT - if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) { + if (len >= 128 && (cpuid_cache & CACHE_AVX2)) { return buffer_zero_avx2(buf, len); } - if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) { + if (len >= 64 && (cpuid_cache & CACHE_SSE4)) { return buffer_zero_sse4(buf, len); } #endif - if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) { + if (len >= 64 && (cpuid_cache & CACHE_SSE2)) { return buffer_zero_sse2(buf, len); } return buffer_zero_int(buf, len); -- 1.8.3.1