All of lore.kernel.org
 help / color / mirror / Atom feed
From: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
To: dev@dpdk.org
Cc: yipeng1.wang@intel.com, sameh.gobriel@intel.com,
	bruce.richardson@intel.com, konstantin.ananyev@intel.com,
	stephen@networkplumber.org, thomas@monjalon.net
Subject: [dpdk-dev] [PATCH v6 1/4] hash: add new toeplitz hash implementation
Date: Tue, 26 Oct 2021 21:32:12 +0100	[thread overview]
Message-ID: <1635280335-164030-2-git-send-email-vladimir.medvedkin@intel.com> (raw)
In-Reply-To: <1635280335-164030-1-git-send-email-vladimir.medvedkin@intel.com>
In-Reply-To: <1630944239-363648-1-git-send-email-vladimir.medvedkin@intel.com>

This patch adds a new Toeplitz hash implementation using
Galois Fields New Instructions (GFNI).

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 app/test/test_thash.c                       | 172 ++++++++++++++++++++++++++
 doc/api/doxy-api-index.md                   |   1 +
 doc/guides/prog_guide/toeplitz_hash_lib.rst |  28 ++++-
 doc/guides/rel_notes/release_21_11.rst      |   4 +
 lib/hash/meson.build                        |   6 +-
 lib/hash/rte_thash.c                        |  29 +++++
 lib/hash/rte_thash.h                        |  35 ++++++
 lib/hash/rte_thash_gfni.h                   |  54 +++++++++
 lib/hash/rte_thash_x86_gfni.h               | 182 ++++++++++++++++++++++++++++
 lib/hash/version.map                        |   5 +
 10 files changed, 511 insertions(+), 5 deletions(-)
 create mode 100644 lib/hash/rte_thash_gfni.h
 create mode 100644 lib/hash/rte_thash_x86_gfni.h

diff --git a/app/test/test_thash.c b/app/test/test_thash.c
index d8981fb..22d784e 100644
--- a/app/test/test_thash.c
+++ b/app/test/test_thash.c
@@ -6,6 +6,7 @@
 #include <rte_eal.h>
 #include <rte_ip.h>
 #include <rte_random.h>
+#include <rte_malloc.h>
 
 #include "test.h"
 
@@ -78,6 +79,34 @@ uint8_t default_rss_key[] = {
 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
 };
 
+static const uint8_t big_rss_key[] = {
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
+	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
+	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
+	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
+	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
+};
+
 static int
 test_toeplitz_hash_calc(void)
 {
@@ -145,6 +174,146 @@ test_toeplitz_hash_calc(void)
 }
 
 static int
+test_toeplitz_hash_gfni(void)
+{
+	uint32_t i, j;
+	union rte_thash_tuple tuple;
+	uint32_t rss_l3, rss_l3l4;
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < RTE_DIM(v4_tbl); i++) {
+		tuple.v4.src_addr = rte_cpu_to_be_32(v4_tbl[i].src_ip);
+		tuple.v4.dst_addr = rte_cpu_to_be_32(v4_tbl[i].dst_ip);
+		tuple.v4.sport = rte_cpu_to_be_16(v4_tbl[i].dst_port);
+		tuple.v4.dport = rte_cpu_to_be_16(v4_tbl[i].src_port);
+
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V4_L4_LEN * 4);
+		if ((rss_l3 != v4_tbl[i].hash_l3) ||
+				(rss_l3l4 != v4_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	for (i = 0; i < RTE_DIM(v6_tbl); i++) {
+		for (j = 0; j < RTE_DIM(tuple.v6.src_addr); j++)
+			tuple.v6.src_addr[j] = v6_tbl[i].src_ip[j];
+		for (j = 0; j < RTE_DIM(tuple.v6.dst_addr); j++)
+			tuple.v6.dst_addr[j] = v6_tbl[i].dst_ip[j];
+		tuple.v6.sport = rte_cpu_to_be_16(v6_tbl[i].dst_port);
+		tuple.v6.dport = rte_cpu_to_be_16(v6_tbl[i].src_port);
+		rss_l3 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L3_LEN * 4);
+		rss_l3l4 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)&tuple,
+				RTE_THASH_V6_L4_LEN * 4);
+		if ((rss_l3 != v6_tbl[i].hash_l3) ||
+				(rss_l3l4 != v6_tbl[i].hash_l3l4))
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+#define DATA_SZ		4
+#define ITER		1000
+
+enum {
+	SCALAR_DATA_BUF_1_HASH_IDX = 0,
+	SCALAR_DATA_BUF_2_HASH_IDX,
+	GFNI_DATA_BUF_1_HASH_IDX,
+	GFNI_DATA_BUF_2_HASH_IDX,
+	HASH_IDXES
+};
+
+static int
+test_toeplitz_hash_rand_data(void)
+{
+	uint32_t data[2][DATA_SZ];
+	uint32_t scalar_data[2][DATA_SZ];
+	uint32_t hash[HASH_IDXES] = { 0 };
+	uint64_t rss_key_matrixes[RTE_DIM(default_rss_key)];
+	int i, j;
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	rte_thash_complete_matrix(rss_key_matrixes, default_rss_key,
+		RTE_DIM(default_rss_key));
+
+	for (i = 0; i < ITER; i++) {
+		for (j = 0; j < DATA_SZ; j++) {
+			data[0][j] = rte_rand();
+			data[1][j] = rte_rand();
+			scalar_data[0][j] = rte_cpu_to_be_32(data[0][j]);
+			scalar_data[1][j] = rte_cpu_to_be_32(data[1][j]);
+		}
+
+		hash[SCALAR_DATA_BUF_1_HASH_IDX] = rte_softrss(scalar_data[0],
+			DATA_SZ, default_rss_key);
+		hash[SCALAR_DATA_BUF_2_HASH_IDX] = rte_softrss(scalar_data[1],
+			DATA_SZ, default_rss_key);
+		hash[GFNI_DATA_BUF_1_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[0],
+			DATA_SZ * sizeof(uint32_t));
+		hash[GFNI_DATA_BUF_2_HASH_IDX] = rte_thash_gfni(
+			rss_key_matrixes, (uint8_t *)data[1],
+			DATA_SZ * sizeof(uint32_t));
+
+		if ((hash[SCALAR_DATA_BUF_1_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_1_HASH_IDX]) ||
+				(hash[SCALAR_DATA_BUF_2_HASH_IDX] !=
+				hash[GFNI_DATA_BUF_2_HASH_IDX]))
+
+			return -TEST_FAILED;
+	}
+
+	return TEST_SUCCESS;
+}
+
+enum {
+	RSS_V4_IDX,
+	RSS_V6_IDX
+};
+
+static int
+test_big_tuple_gfni(void)
+{
+	uint32_t arr[16];
+	uint32_t arr_softrss[16];
+	uint32_t hash_1, hash_2;
+	uint64_t rss_key_matrixes[RTE_DIM(big_rss_key)];
+	unsigned int i, size = RTE_DIM(arr) * sizeof(uint32_t);
+
+	if (!rte_thash_gfni_supported())
+		return TEST_SKIPPED;
+
+	/* Convert RSS key into matrixes */
+	rte_thash_complete_matrix(rss_key_matrixes, big_rss_key,
+		RTE_DIM(big_rss_key));
+
+	for (i = 0; i < RTE_DIM(arr); i++) {
+		arr[i] = rte_rand();
+		arr_softrss[i] = rte_be_to_cpu_32(arr[i]);
+	}
+
+	hash_1 = rte_softrss(arr_softrss, RTE_DIM(arr), big_rss_key);
+	hash_2 = rte_thash_gfni(rss_key_matrixes, (uint8_t *)arr, size);
+
+	if (hash_1 != hash_2)
+		return -TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static int
 test_create_invalid(void)
 {
 	struct rte_thash_ctx *ctx;
@@ -577,6 +746,9 @@ static struct unit_test_suite thash_tests = {
 	.teardown = NULL,
 	.unit_test_cases = {
 	TEST_CASE(test_toeplitz_hash_calc),
+	TEST_CASE(test_toeplitz_hash_gfni),
+	TEST_CASE(test_toeplitz_hash_rand_data),
+	TEST_CASE(test_big_tuple_gfni),
 	TEST_CASE(test_create_invalid),
 	TEST_CASE(test_multiple_create),
 	TEST_CASE(test_free_null),
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 49892a3..4245b96 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -142,6 +142,7 @@ The public API headers are grouped by topics:
   [hash]               (@ref rte_hash.h),
   [jhash]              (@ref rte_jhash.h),
   [thash]              (@ref rte_thash.h),
+  [thash_gfni]         (@ref rte_thash_gfni.h),
   [FBK hash]           (@ref rte_fbk_hash.h),
   [CRC hash]           (@ref rte_hash_crc.h)
 
diff --git a/doc/guides/prog_guide/toeplitz_hash_lib.rst b/doc/guides/prog_guide/toeplitz_hash_lib.rst
index f916857..acdd8c3 100644
--- a/doc/guides/prog_guide/toeplitz_hash_lib.rst
+++ b/doc/guides/prog_guide/toeplitz_hash_lib.rst
@@ -19,24 +19,44 @@ to calculate the RSS hash sum to spread the traffic among the queues.
 Toeplitz hash function API
 --------------------------
 
-There are two functions that provide calculation of the Toeplitz hash sum:
+There are three functions that provide calculation of the Toeplitz hash sum:
 
 * ``rte_softrss()``
 * ``rte_softrss_be()``
+* ``rte_thash_gfni()``
 
-Both of these functions take the parameters:
+The first two functions are scalar implementations and take the parameters:
 
 * A pointer to the tuple, containing fields extracted from the packet.
 * A length of this tuple counted in double words.
 * A pointer to the RSS hash key corresponding to the one installed on the NIC.
 
-Both functions expect the tuple to be in "host" byte order
-and a multiple of 4 bytes in length.
+Both of the above mentioned _softrss_ functions expect the tuple to be in
+"host" byte order and a multiple of 4 bytes in length.
 The ``rte_softrss()`` function expects the ``rss_key``
 to be exactly the same as the one installed on the NIC.
 The ``rte_softrss_be`` function is a faster implementation,
 but it expects ``rss_key`` to be converted to the host byte order.
 
+The last function is a vectorized implementation using
+Galois Fields New Instructions. It can be used if ``rte_thash_gfni_supported()`` returns true.
+It expects the tuple to be in network byte order.
+
+``rte_thash_gfni()`` calculates the hash value for a single tuple.
+
+``rte_thash_gfni()`` takes the parameters:
+
+* A pointer to the matrices derived from the RSS hash key using ``rte_thash_complete_matrix()``.
+* A pointer to the tuple.
+* A length of the tuple in bytes.
+
+``rte_thash_complete_matrix()`` is a function that calculates matrices required by
+GFNI implementations from the RSS hash key. It takes the parameters:
+
+* A pointer to the memory where the matrices will be written.
+* A pointer to the RSS hash key.
+* Length of the RSS hash key in bytes.
+
 
 Predictable RSS
 ---------------
diff --git a/doc/guides/rel_notes/release_21_11.rst b/doc/guides/rel_notes/release_21_11.rst
index 1ccac87..4daeb4a 100644
--- a/doc/guides/rel_notes/release_21_11.rst
+++ b/doc/guides/rel_notes/release_21_11.rst
@@ -305,6 +305,10 @@ New Features
     * Pcapng format with timestamps and meta-data.
     * Fixes packet capture with stripped VLAN tags.
 
+* **Added optimized Toeplitz hash implementation.**
+
+  Added optimized Toeplitz hash implementation using Galois Fields New Instructions.
+
 
 Removed Items
 -------------
diff --git a/lib/hash/meson.build b/lib/hash/meson.build
index 9bc5ef9..12b1afc 100644
--- a/lib/hash/meson.build
+++ b/lib/hash/meson.build
@@ -7,8 +7,12 @@ headers = files(
         'rte_hash.h',
         'rte_jhash.h',
         'rte_thash.h',
+        'rte_thash_gfni.h',
+)
+indirect_headers += files(
+        'rte_crc_arm64.h',
+        'rte_thash_x86_gfni.h',
 )
-indirect_headers += files('rte_crc_arm64.h')
 
 sources = files('rte_cuckoo_hash.c', 'rte_fbk_hash.c', 'rte_thash.c')
 deps += ['net']
diff --git a/lib/hash/rte_thash.c b/lib/hash/rte_thash.c
index 696a112..e605a6f 100644
--- a/lib/hash/rte_thash.c
+++ b/lib/hash/rte_thash.c
@@ -90,6 +90,35 @@ struct rte_thash_ctx {
 	uint8_t		hash_key[0];
 };
 
+int
+rte_thash_gfni_supported(void)
+{
+#ifdef RTE_THASH_GFNI_DEFINED
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_GFNI) &&
+			(rte_vect_get_max_simd_bitwidth() >=
+			RTE_VECT_SIMD_512))
+		return 1;
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+	return 0;
+}
+
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key, int size)
+{
+	int i, j;
+	uint8_t *m = (uint8_t *)matrixes;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < 8; j++) {
+			/* Wrap to rss_key[0] after the last byte: never read rss_key[size]. */
+			uint8_t left_part = rss_key[i] << j;
+			uint8_t right_part = rss_key[(i + 1) % size] >> (8 - j);
+			m[i * 8 + j] = left_part | right_part;
+		}
+	}
+}
+
 static inline uint32_t
 get_bit_lfsr(struct thash_lfsr *lfsr)
 {
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index a26fe56..40146cf 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -24,6 +24,7 @@ extern "C" {
 #include <rte_config.h>
 #include <rte_ip.h>
 #include <rte_common.h>
+#include <rte_thash_gfni.h>
 
 #if defined(RTE_ARCH_X86) || defined(__ARM_NEON)
 #include <rte_vect.h>
@@ -219,6 +220,40 @@ rte_softrss_be(uint32_t *input_tuple, uint32_t input_len,
 	return ret;
 }
 
+/**
+ * Indicates if GFNI implementations of the Toeplitz hash are supported.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @return
+ *  1 if GFNI is supported
+ *  0 otherwise
+ */
+__rte_experimental
+int
+rte_thash_gfni_supported(void);
+
+/**
+ * Converts Toeplitz hash key (RSS key) into matrixes required
+ * for GFNI implementation
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param matrixes
+ *  pointer to the memory where matrices will be written.
+ *  Note: the size of this memory must be equal to size * 8
+ * @param rss_key
+ *  pointer to the Toeplitz hash key
+ * @param size
+ *  Size of the rss_key in bytes.
+ */
+__rte_experimental
+void
+rte_thash_complete_matrix(uint64_t *matrixes, const uint8_t *rss_key,
+	int size);
+
 /** @internal Logarithm of minimum size of the RSS ReTa */
 #define	RTE_THASH_RETA_SZ_MIN	2U
 /** @internal Logarithm of maximum size of the RSS ReTa */
diff --git a/lib/hash/rte_thash_gfni.h b/lib/hash/rte_thash_gfni.h
new file mode 100644
index 0000000..bbacd41
--- /dev/null
+++ b/lib/hash/rte_thash_gfni.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_GFNI_H_
+#define _RTE_THASH_GFNI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_log.h>
+
+#ifdef RTE_ARCH_X86
+
+#include <rte_thash_x86_gfni.h>
+
+#endif
+
+#ifndef RTE_THASH_GFNI_DEFINED
+
+/**
+ * Calculate Toeplitz hash.
+ * Dummy implementation.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param mtrx
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix(). Unused here.
+ * @param key
+ *  Pointer to the data to be hashed. Unused here.
+ * @param len
+ *  Length of the data to be hashed. Unused here.
+ * @return
+ *  Always 0; an error is logged because no implementation is available.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *mtrx __rte_unused,
+	const uint8_t *key __rte_unused, int len __rte_unused)
+{
+	RTE_LOG(ERR, HASH, "%s is undefined under given arch\n", __func__);
+	return 0;
+}
+
+#endif /* RTE_THASH_GFNI_DEFINED */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_GFNI_H_ */
diff --git a/lib/hash/rte_thash_x86_gfni.h b/lib/hash/rte_thash_x86_gfni.h
new file mode 100644
index 0000000..1cb7353
--- /dev/null
+++ b/lib/hash/rte_thash_x86_gfni.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef _RTE_THASH_X86_GFNI_H_
+#define _RTE_THASH_X86_GFNI_H_
+
+/**
+ * @file
+ *
+ * Optimized Toeplitz hash functions implementation
+ * using Galois Fields New Instructions.
+ */
+
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __GFNI__
+#define RTE_THASH_GFNI_DEFINED
+
+#define RTE_THASH_FIRST_ITER_MSK	0x0f0f0f0f0f0e0c08
+#define RTE_THASH_PERM_MSK		0x0f0f0f0f0f0f0f0f
+#define RTE_THASH_FIRST_ITER_MSK_2	0xf0f0f0f0f0e0c080
+#define RTE_THASH_PERM_MSK_2		0xf0f0f0f0f0f0f0f0
+#define RTE_THASH_REWIND_MSK		0x0000000000113377
+
+__rte_internal
+static inline void
+__rte_thash_xor_reduce(__m512i xor_acc, uint32_t *val_1, uint32_t *val_2)
+{
+	__m256i tmp_256_1, tmp_256_2;
+	__m128i tmp128_1, tmp128_2;
+	uint64_t tmp_1, tmp_2;
+
+	tmp_256_1 = _mm512_castsi512_si256(xor_acc);
+	tmp_256_2 = _mm512_extracti32x8_epi32(xor_acc, 1);
+	tmp_256_1 = _mm256_xor_si256(tmp_256_1, tmp_256_2);
+
+	tmp128_1 = _mm256_castsi256_si128(tmp_256_1);
+	tmp128_2 = _mm256_extracti32x4_epi32(tmp_256_1, 1);
+	tmp128_1 = _mm_xor_si128(tmp128_1, tmp128_2);
+
+	tmp_1 = _mm_extract_epi64(tmp128_1, 0);
+	tmp_2 = _mm_extract_epi64(tmp128_1, 1);
+	tmp_1 ^= tmp_2;
+
+	*val_1 = (uint32_t)tmp_1;
+	*val_2 = (uint32_t)(tmp_1 >> 32);
+}
+
+__rte_internal
+static inline __m512i
+__rte_thash_gfni(const uint64_t *mtrx, const uint8_t *tuple,
+	const uint8_t *secondary_tuple, int len)
+{
+	__m512i permute_idx = _mm512_set_epi8(7, 6, 5, 4, 7, 6, 5, 4,
+						6, 5, 4, 3, 6, 5, 4, 3,
+						5, 4, 3, 2, 5, 4, 3, 2,
+						4, 3, 2, 1, 4, 3, 2, 1,
+						3, 2, 1, 0, 3, 2, 1, 0,
+						2, 1, 0, -1, 2, 1, 0, -1,
+						1, 0, -1, -2, 1, 0, -1, -2,
+						0, -1, -2, -3, 0, -1, -2, -3);
+
+	const __m512i rewind_idx = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 0, 0, 0, 0, 0,
+						0, 0, 0, 59, 0, 0, 0, 59,
+						0, 0, 59, 58, 0, 0, 59, 58,
+						0, 59, 58, 57, 0, 59, 58, 57);
+	const __mmask64 rewind_mask = RTE_THASH_REWIND_MSK;
+	const __m512i shift_8 = _mm512_set1_epi8(8);
+	__m512i xor_acc = _mm512_setzero_si512();
+	__m512i perm_bytes = _mm512_setzero_si512();
+	__m512i vals, matrixes, tuple_bytes, tuple_bytes_2;
+	__mmask64 load_mask, permute_mask, permute_mask_2;
+	int chunk_len = 0, i = 0;
+	uint8_t mtrx_msk;
+	const int prepend = 3;
+
+	for (; len > 0; len -= 64, tuple += 64) {
+		if (i == 8)
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+
+		permute_mask = RTE_THASH_FIRST_ITER_MSK;
+		load_mask = (len >= 64) ? UINT64_MAX : ((1ULL << len) - 1);
+		tuple_bytes = _mm512_maskz_loadu_epi8(load_mask, tuple);
+		if (secondary_tuple) {
+			permute_mask_2 = RTE_THASH_FIRST_ITER_MSK_2;
+			tuple_bytes_2 = _mm512_maskz_loadu_epi8(load_mask,
+				secondary_tuple);
+		}
+
+		chunk_len = __builtin_popcountll(load_mask);
+		for (i = 0; i < ((chunk_len + prepend) / 8); i++, mtrx += 8) {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(perm_bytes,
+					permute_mask_2, permute_idx,
+					tuple_bytes_2);
+
+			matrixes = _mm512_maskz_loadu_epi64(UINT8_MAX, mtrx);
+			vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes,
+				matrixes, 0);
+
+			xor_acc = _mm512_xor_si512(xor_acc, vals);
+			permute_idx = _mm512_add_epi8(permute_idx, shift_8);
+			permute_mask = RTE_THASH_PERM_MSK;
+			if (secondary_tuple)
+				permute_mask_2 = RTE_THASH_PERM_MSK_2;
+		}
+	}
+
+	int rest_len = (chunk_len + prepend) % 8;
+	if (rest_len != 0) {
+		mtrx_msk = (1 << (rest_len % 8)) - 1;
+		matrixes = _mm512_maskz_loadu_epi64(mtrx_msk, mtrx);
+		if (i == 8) {
+			perm_bytes = _mm512_maskz_permutexvar_epi8(rewind_mask,
+				rewind_idx, perm_bytes);
+		} else {
+			perm_bytes = _mm512_mask_permutexvar_epi8(perm_bytes,
+				permute_mask, permute_idx, tuple_bytes);
+
+			if (secondary_tuple)
+				perm_bytes =
+					_mm512_mask_permutexvar_epi8(
+					perm_bytes, permute_mask_2,
+					permute_idx, tuple_bytes_2);
+		}
+
+		vals = _mm512_gf2p8affine_epi64_epi8(perm_bytes, matrixes, 0);
+		xor_acc = _mm512_xor_si512(xor_acc, vals);
+	}
+
+	return xor_acc;
+}
+
+/**
+ * Calculate Toeplitz hash.
+ *
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @param m
+ *  Pointer to the matrices generated from the corresponding
+ *  RSS hash key using rte_thash_complete_matrix().
+ * @param tuple
+ *  Pointer to the data to be hashed. Data must be in network byte order.
+ * @param len
+ *  Length of the data to be hashed, in bytes.
+ * @return
+ *  Calculated Toeplitz hash value.
+ */
+__rte_experimental
+static inline uint32_t
+rte_thash_gfni(const uint64_t *m, const uint8_t *tuple, int len)
+{
+	uint32_t val, val_zero;
+
+	__m512i xor_acc = __rte_thash_gfni(m, tuple, NULL, len);
+	__rte_thash_xor_reduce(xor_acc, &val, &val_zero);
+
+	return val;
+}
+
+#endif /* __GFNI__ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_THASH_X86_GFNI_H_ */
diff --git a/lib/hash/version.map b/lib/hash/version.map
index 8185470..153ab87 100644
--- a/lib/hash/version.map
+++ b/lib/hash/version.map
@@ -37,6 +37,7 @@ DPDK_22 {
 EXPERIMENTAL {
 	global:
 
+	# added in 21.05
 	rte_thash_add_helper;
 	rte_thash_adjust_tuple;
 	rte_thash_find_existing;
@@ -45,4 +46,8 @@ EXPERIMENTAL {
 	rte_thash_get_helper;
 	rte_thash_get_key;
 	rte_thash_init_ctx;
+
+	# added in 21.11
+	rte_thash_complete_matrix;
+	rte_thash_gfni_supported;
 };
-- 
2.7.4


  parent reply	other threads:[~2021-10-26 20:32 UTC|newest]

Thread overview: 72+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-06 16:03 [dpdk-dev] [PATCH 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-09-06 16:03 ` [dpdk-dev] [PATCH 1/5] hash: add new toeplitz " Vladimir Medvedkin
2021-10-07 18:23   ` Ananyev, Konstantin
2021-10-08 11:19     ` Ananyev, Konstantin
2021-10-15  9:11     ` Medvedkin, Vladimir
2021-10-15 10:55       ` Ananyev, Konstantin
2021-10-15 13:09         ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-08 11:31   ` Ananyev, Konstantin
2021-10-15  9:13     ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-09-06 16:03 ` [dpdk-dev] [PATCH 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-09-07  0:35   ` Stephen Hemminger
2021-09-08 13:59     ` Medvedkin, Vladimir
2021-09-06 16:03 ` [dpdk-dev] [PATCH 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 0/5] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 " Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 " Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 " Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 1/5] hash: add new toeplitz " Vladimir Medvedkin
2021-10-25 17:05         ` Thomas Monjalon
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-25 17:04         ` Thomas Monjalon
2021-10-26 20:30           ` Medvedkin, Vladimir
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-21 18:54       ` [dpdk-dev] [PATCH v5 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-25 17:02         ` Thomas Monjalon
2021-10-26 20:29           ` Medvedkin, Vladimir
2021-10-27  8:29             ` Thomas Monjalon
2021-10-27 15:48               ` Medvedkin, Vladimir
2021-10-25 17:27         ` Stephen Hemminger
2021-10-26 20:31           ` Medvedkin, Vladimir
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-21 17:18     ` [dpdk-dev] [PATCH v4 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-21  9:42     ` Ananyev, Konstantin
2021-10-21 17:17       ` Medvedkin, Vladimir
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-21  9:46     ` Ananyev, Konstantin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-20 18:20   ` [dpdk-dev] [PATCH v3 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 1/5] hash: add new toeplitz hash implementation Vladimir Medvedkin
2021-10-15 16:58   ` Stephen Hemminger
2021-10-18 10:40     ` Ananyev, Konstantin
2021-10-19  1:15       ` Stephen Hemminger
2021-10-19 15:42         ` Medvedkin, Vladimir
2021-10-18 11:08     ` Medvedkin, Vladimir
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 2/5] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 3/5] doc/hash: update documentation for the thash library Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 4/5] test/thash: add tests for a new Toeplitz hash function Vladimir Medvedkin
2021-10-15  9:30 ` [dpdk-dev] [PATCH v2 5/5] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-26 20:32 ` Vladimir Medvedkin [this message]
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 2/4] hash: add bulk toeplitz " Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-26 20:32 ` [dpdk-dev] [PATCH v6 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 1/4] hash: add new toeplitz " Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 2/4] hash: add bulk " Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-10-27 16:16 ` [dpdk-dev] [PATCH v7 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 0/4] optimized Toeplitz hash implementation Vladimir Medvedkin
2021-11-04 10:20   ` Thomas Monjalon
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 1/4] hash: add new toeplitz " Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 2/4] hash: add bulk " Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 3/4] hash: enable gfni thash implementation Vladimir Medvedkin
2021-11-02 18:38 ` [dpdk-dev] [PATCH v8 4/4] test/thash: add performance tests for the Toeplitz hash Vladimir Medvedkin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1635280335-164030-2-git-send-email-vladimir.medvedkin@intel.com \
    --to=vladimir.medvedkin@intel.com \
    --cc=bruce.richardson@intel.com \
    --cc=dev@dpdk.org \
    --cc=konstantin.ananyev@intel.com \
    --cc=sameh.gobriel@intel.com \
    --cc=stephen@networkplumber.org \
    --cc=thomas@monjalon.net \
    --cc=yipeng1.wang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.