From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: jerinj@marvell.com, ruifeng.wang@arm.com,
	vladimir.medvedkin@intel.com,
	Konstantin Ananyev <konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH 20.11 7/7] acl: enhance AVX512 classify implementation
Date: Fri,  7 Aug 2020 17:28:29 +0100
Message-ID: <20200807162829.11690-8-konstantin.ananyev@intel.com>
In-Reply-To: <20200807162829.11690-1-konstantin.ananyev@intel.com>

Add search_avx512x16x2(), which mostly uses 512-bit wide
registers/instructions and can process up to 32 flows in
parallel.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---

This patch depends on:
https://patches.dpdk.org/patch/70429/
to be applied first.
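
For context, below is a minimal usage sketch (not part of this patch)
showing how the new classify path is reached through the public ACL API.
It assumes the AVX512 algorithm enum introduced by the infrastructure
patch of this series; RTE_ACL_CLASSIFY_AVX512 is used here only as a
placeholder for whatever name that patch actually defines.

    #include <rte_acl.h>

    static int
    classify_burst(struct rte_acl_ctx *ctx, const uint8_t **data,
            uint32_t *results, uint32_t num)
    {
            int rc;

            /*
             * Optionally force the AVX512 classify method instead of
             * relying on the default runtime selection.
             * RTE_ACL_CLASSIFY_AVX512 is a placeholder name, see above.
             */
            rc = rte_acl_set_ctx_classify(ctx, RTE_ACL_CLASSIFY_AVX512);
            if (rc != 0)
                    return rc;

            /* one result per packet, single category */
            return rte_acl_classify(ctx, data, results, num, 1);
    }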

 lib/librte_acl/acl_run_avx512.c    |   3 +
 lib/librte_acl/acl_run_avx512x16.h | 635 +++++++++++++++++++++++++++++
 2 files changed, 638 insertions(+)
 create mode 100644 lib/librte_acl/acl_run_avx512x16.h

diff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c
index 8ee996679..332e359fb 100644
--- a/lib/librte_acl/acl_run_avx512.c
+++ b/lib/librte_acl/acl_run_avx512.c
@@ -121,11 +121,14 @@ resolve_mcgt8_avx512x1(uint32_t result[],
 }
 
 #include "acl_run_avx512x8.h"
+#include "acl_run_avx512x16.h"
 
 int
 rte_acl_classify_avx512(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t num, uint32_t categories)
 {
+	if (num >= 2 * MAX_SEARCHES_AVX16)
+		return search_avx512x16x2(ctx, data, results, num, categories);
 	if (num >= MAX_SEARCHES_AVX16)
 		return search_avx512x8x2(ctx, data, results, num, categories);
 	if (num >= MAX_SEARCHES_SSE8)
diff --git a/lib/librte_acl/acl_run_avx512x16.h b/lib/librte_acl/acl_run_avx512x16.h
new file mode 100644
index 000000000..53216bda3
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx512x16.h
@@ -0,0 +1,635 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#define	MASK16_BIT	(sizeof(__mmask16) * CHAR_BIT)
+
+#define NUM_AVX512X16X2	(2 * MASK16_BIT)
+#define MSK_AVX512X16X2	(NUM_AVX512X16X2 - 1)
+
+static const __rte_x86_zmm_t zmm_match_mask = {
+	.u32 = {
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+		RTE_ACL_NODE_MATCH,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_index_mask = {
+	.u32 = {
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_trlo_idle = {
+	.u32 = {
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+		RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_trhi_idle = {
+	.u32 = {
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_shuffle_input = {
+	.u32 = {
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_four_32 = {
+	.u32 = {
+		4, 4, 4, 4,
+		4, 4, 4, 4,
+		4, 4, 4, 4,
+		4, 4, 4, 4,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_idx_add = {
+	.u32 = {
+		0, 1, 2, 3,
+		4, 5, 6, 7,
+		8, 9, 10, 11,
+		12, 13, 14, 15,
+	},
+};
+
+static const __rte_x86_zmm_t zmm_range_base = {
+	.u32 = {
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
+	},
+};
+
+/*
+ * Calculate the address of the next transition for
+ * all types of nodes. Note that only DFA nodes and range
+ * nodes actually transition to another node. Match
+ * nodes are not supposed to be encountered here.
+ * For quad range nodes:
+ * Calculate number of range boundaries that are less than the
+ * input value. Range boundaries for each node are signed 8-bit values,
+ * ordered from -128 to 127.
+ * This is effectively a count of the boundary bytes that are less
+ * than the input byte.
+ * Single nodes are processed in the same way as quad range nodes.
+ */
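+/*
+ * Worked example (hypothetical values, for illustration only):
+ * for a quad range node with boundaries {-128, -10, 5, 100} and input
+ * byte 7, three boundaries are less than the input (7 > -128, 7 > -10,
+ * 7 > 5), so an offset of 3 is added to the node address and the fourth
+ * transition is selected.
+ */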
+static __rte_always_inline __m512i
+calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,
+	__m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)
+{
+	__mmask64 qm;
+	__mmask16 dfa_msk;
+	__m512i addr, in, node_type, r, t;
+	__m512i dfa_ofs, quad_ofs;
+
+	t = _mm512_xor_si512(index_mask, index_mask);
+	in = _mm512_shuffle_epi8(next_input, shuffle_input);
+
+	/* Calc node type and node addr */
+	node_type = _mm512_andnot_si512(index_mask, tr_lo);
+	addr = _mm512_and_si512(index_mask, tr_lo);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);
+
+	/* DFA calculations. */
+	r = _mm512_srli_epi32(in, 30);
+	r = _mm512_add_epi8(r, range_base);
+	t = _mm512_srli_epi32(in, 24);
+	r = _mm512_shuffle_epi8(tr_hi, r);
+
+	dfa_ofs = _mm512_sub_epi32(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	qm = _mm512_cmpgt_epi8_mask(in, tr_hi);
+	t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
+	t = _mm512_lzcnt_epi32(t);
+	t = _mm512_srli_epi32(t, 3);
+	quad_ofs = _mm512_sub_epi32(four_32, t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
+
+	/* calculate address for next transitions. */
+	addr = _mm512_add_epi32(addr, t);
+	return addr;
+}
+
+/*
+ * Process 16 transitions in parallel.
+ * tr_lo contains the low 32 bits for 16 transitions.
+ * tr_hi contains the high 32 bits for 16 transitions.
+ * next_input contains up to 4 input bytes for 16 flows.
+ */
+static __rte_always_inline __m512i
+transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,
+	__m512i *tr_hi)
+{
+	const int32_t *tr;
+	__m512i addr;
+
+	tr = (const int32_t *)(uintptr_t)trans;
+
+	/* Calculate the address (array index) for all 16 transitions. */
+	addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,
+		zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);
+
+	/* load lower 32 bits of 16 transitions at once. */
+	*tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));
+
+	next_input = _mm512_srli_epi32(next_input, CHAR_BIT);
+
+	/* load high 32 bits of 16 transitions at once. */
+	*tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));
+
+	return next_input;
+}
+
+static __rte_always_inline void
+first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,
+	__mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)
+{
+	const int32_t *tr;
+	__m512i addr, root;
+
+	tr = (const int32_t *)(uintptr_t)flow->trans;
+
+	addr = _mm512_set1_epi32(UINT8_MAX);
+	root = _mm512_set1_epi32(flow->root_index);
+
+	addr = _mm512_and_si512(next_input, addr);
+	addr = _mm512_add_epi32(root, addr);
+
+	/* load lower 32 bits of 16 transitions at once. */
+	*tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,
+		sizeof(flow->trans[0]));
+
+	/* load high 32 bits of 16 transitions at once. */
+	*tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
+		sizeof(flow->trans[0]));
+}
+
+static inline __m512i
+get_next_4bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],
+	uint32_t msk, __m512i *di)
+{
+	const int32_t *div;
+	__m512i one, zero, t, p[2];
+	ymm_t inp[2];
+
+	static const __rte_x86_zmm_t zmm_pminp = {
+		.u32 = {
+			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+			0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+		},
+	};
+
+	const __mmask16 pmidx_msk = 0x5555;
+
+	static const __rte_x86_zmm_t zmm_pmidx[2] = {
+		[0] = {
+			.u32 = {
+				0, 0, 1, 0, 2, 0, 3, 0,
+				4, 0, 5, 0, 6, 0, 7, 0,
+			},
+		},
+		[1] = {
+			.u32 = {
+				8, 0, 9, 0, 10, 0, 11, 0,
+				12, 0, 13, 0, 14, 0, 15, 0,
+			},
+		},
+	};
+
+	div = (const int32_t *)flow->data_index;
+
+	one = _mm512_set1_epi32(1);
+	zero = _mm512_xor_si512(one, one);
+
+	t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
+
+	*di = _mm512_mask_add_epi32(*di, msk, *di, one);
+
+	p[0] = _mm512_maskz_permutexvar_epi32(pmidx_msk, zmm_pmidx[0].z, t);
+	p[1] = _mm512_maskz_permutexvar_epi32(pmidx_msk, zmm_pmidx[1].z, t);
+
+	p[0] = _mm512_add_epi64(p[0], pdata[0]);
+	p[1] = _mm512_add_epi64(p[1], pdata[1]);
+
+	inp[0] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero),
+		(msk & UINT8_MAX), p[0], NULL, sizeof(uint8_t));
+	inp[1] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero),
+		(msk >> CHAR_BIT), p[1], NULL, sizeof(uint8_t));
+
+	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
+			zmm_pminp.z, _mm512_castsi256_si512(inp[1]));
+}
+
+static inline void
+start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
+	__m512i pdata[2], __m512i *idx, __m512i *di)
+{
+	uint32_t n, nm[2];
+	__m512i ni, nd[2];
+
+	n = __builtin_popcount(msk & UINT8_MAX);
+	nm[0] = (1 << n) - 1;
+	nm[1] = (1 << (num - n)) - 1;
+
+	nd[0] = _mm512_maskz_loadu_epi64(nm[0],
+		flow->idata + flow->num_packets);
+	nd[1] = _mm512_maskz_loadu_epi64(nm[1],
+		flow->idata + flow->num_packets + n);
+
+	ni = _mm512_set1_epi32(flow->num_packets);
+	ni = _mm512_add_epi32(ni, zmm_idx_add.z);
+
+	pdata[0] = _mm512_mask_expand_epi64(pdata[0], (msk & UINT8_MAX), nd[0]);
+	pdata[1] = _mm512_mask_expand_epi64(pdata[1], (msk >> CHAR_BIT), nd[1]);
+
+	*idx = _mm512_mask_expand_epi32(*idx, msk, ni);
+	*di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);
+
+	flow->num_packets += num;
+}
+
+static inline uint32_t
+update_flow_mask16(const struct acl_flow_avx512 *flow, __mmask16 *fmsk,
+	__mmask16 *rmsk)
+{
+	uint32_t i, j, k, m, n;
+
+	fmsk[0] ^= rmsk[0];
+	m = rmsk[0];
+
+	k = __builtin_popcount(m);
+	n = flow->total_packets - flow->num_packets;
+
+	if (n < k) {
+		/* reduce mask */
+		for (i = k - n; i != 0; i--) {
+			j = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m);
+			m ^= 1 << j;
+		}
+	} else
+		n = k;
+
+	rmsk[0] = m;
+	fmsk[0] |= rmsk[0];
+
+	return n;
+}
+
+static inline uint32_t
+match_process_avx512x16(struct acl_flow_avx512 *flow, __mmask16 *fmsk,
+	__mmask16 *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,
+	__m512i *tr_lo, __m512i *tr_hi)
+{
+	uint32_t n;
+	__m512i res;
+
+	if (rmsk[0] == 0)
+		return 0;
+
+	/* extract match indexes */
+	res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);
+
+	/* mask matched transitions to nop */
+	tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);
+	tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);
+
+	/* save found match indexes */
+	_mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],
+		idx[0], res, sizeof(flow->matches[0]));
+
+	/* update masks and start new flows for matches */
+	n = update_flow_mask16(flow, fmsk, rmsk);
+	start_flow16(flow, n, rmsk[0], pdata, idx, di);
+
+	return n;
+}
+
+static inline void
+match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, __mmask16 fm[2],
+	__m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],
+	__m512i tr_lo[2], __m512i tr_hi[2])
+{
+	uint32_t n[2];
+	__mmask16 rm[2];
+
+	/* check for matches */
+	rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);
+	rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);
+
+	while ((rm[0] | rm[1]) != 0) {
+
+		n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],
+			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
+		n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],
+			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
+
+		if (n[0] != 0) {
+			inp[0] = get_next_4bytes_avx512x16(flow, &pdata[0],
+				rm[0], &di[0]);
+			first_trans16(flow, inp[0], rm[0], &tr_lo[0],
+				&tr_hi[0]);
+			rm[0] = _mm512_test_epi32_mask(tr_lo[0],
+				zmm_match_mask.z);
+		}
+
+		if (n[1] != 0) {
+			inp[1] = get_next_4bytes_avx512x16(flow, &pdata[2],
+				rm[1], &di[1]);
+			first_trans16(flow, inp[1], rm[1], &tr_lo[1],
+				&tr_hi[1]);
+			rm[1] = _mm512_test_epi32_mask(tr_lo[1],
+				zmm_match_mask.z);
+		}
+	}
+}
+
+static inline void
+search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
+{
+	__mmask16 fm[2];
+	__m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
+
+	/* first 1B load */
+	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);
+	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);
+
+	in[0] = get_next_4bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0]);
+	in[1] = get_next_4bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1]);
+
+	first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
+	first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);
+
+	fm[0] = UINT16_MAX;
+	fm[1] = UINT16_MAX;
+
+	/* match check */
+	match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
+		tr_lo, tr_hi);
+
+	while ((fm[0] | fm[1]) != 0) {
+
+		/* load next 4B */
+
+		in[0] = get_next_4bytes_avx512x16(flow, &pdata[0], fm[0],
+			&di[0]);
+		in[1] = get_next_4bytes_avx512x16(flow, &pdata[2], fm[1],
+			&di[1]);
+
+		/* main 4B loop */
+
+		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		/* check for matches */
+		match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
+			tr_lo, tr_hi);
+	}
+}
+
+static inline __m512i
+resolve_match_idx_avx512x16(__m512i mi)
+{
+	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
+		1 << (match_log + 2));
+	return _mm512_slli_epi32(mi, match_log);
+}
+
+static inline __m512i
+resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],
+	const uint32_t match[], __mmask16 msk, uint32_t nb_trie,
+	uint32_t nb_skip)
+{
+	uint32_t i;
+	const uint32_t *pm;
+	__mmask16 m;
+	__m512i cp, cr, np, nr, mch;
+
+	const __m512i zero = _mm512_set1_epi32(0);
+
+	mch = _mm512_maskz_loadu_epi32(msk, match);
+	mch = resolve_match_idx_avx512x16(mch);
+
+	cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
+	cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
+
+	for (i = 1, pm = match + nb_skip; i != nb_trie;
+			i++, pm += nb_skip) {
+
+		mch = _mm512_maskz_loadu_epi32(msk, pm);
+		mch = resolve_match_idx_avx512x16(mch);
+
+		nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,
+			sizeof(res[0]));
+		np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,
+			sizeof(pri[0]));
+
+		m = _mm512_cmpgt_epi32_mask(cp, np);
+		cr = _mm512_mask_mov_epi32(nr, m, cr);
+		cp = _mm512_mask_mov_epi32(np, m, cp);
+	}
+
+	return cr;
+}
+
+/*
+ * Resolve num (<= 16) matches for single category
+ */
+static inline void
+resolve_sc_avx512x16(uint32_t result[], const int32_t res[],
+	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
+	uint32_t nb_trie, uint32_t nb_skip)
+{
+	__mmask16 msk;
+	__m512i cr;
+
+	msk = (1 << nb_pkt) - 1;
+	cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);
+	_mm512_mask_storeu_epi32(result, msk, cr);
+}
+
+/*
+ * Resolve matches for single category
+ */
+static inline void
+resolve_sc_avx512x16x2(uint32_t result[],
+	const struct rte_acl_match_results pr[], const uint32_t match[],
+	uint32_t nb_pkt, uint32_t nb_trie)
+{
+	uint32_t i, j, k, n;
+	const uint32_t *pm;
+	const int32_t *res, *pri;
+	__mmask16 m[2];
+	__m512i cp[2], cr[2], np[2], nr[2], mch[2];
+
+	res = (const int32_t *)pr->results;
+	pri = pr->priority;
+
+	for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {
+
+		j = k + MASK16_BIT;
+
+		/* load match indexes for first trie */
+		mch[0] = _mm512_loadu_si512(match + k);
+		mch[1] = _mm512_loadu_si512(match + j);
+
+		mch[0] = resolve_match_idx_avx512x16(mch[0]);
+		mch[1] = resolve_match_idx_avx512x16(mch[1]);
+
+		/* load matches and their priorities for first trie */
+
+		cr[0] = _mm512_i32gather_epi32(mch[0], res, sizeof(res[0]));
+		cr[1] = _mm512_i32gather_epi32(mch[1], res, sizeof(res[0]));
+
+		cp[0] = _mm512_i32gather_epi32(mch[0], pri, sizeof(pri[0]));
+		cp[1] = _mm512_i32gather_epi32(mch[1], pri, sizeof(pri[0]));
+
+		/* select match with highest priority */
+		for (i = 1, pm = match + nb_pkt; i != nb_trie;
+				i++, pm += nb_pkt) {
+
+			mch[0] = _mm512_loadu_si512(pm + k);
+			mch[1] = _mm512_loadu_si512(pm + j);
+
+			mch[0] = resolve_match_idx_avx512x16(mch[0]);
+			mch[1] = resolve_match_idx_avx512x16(mch[1]);
+
+			nr[0] = _mm512_i32gather_epi32(mch[0], res,
+				sizeof(res[0]));
+			nr[1] = _mm512_i32gather_epi32(mch[1], res,
+				sizeof(res[0]));
+
+			np[0] = _mm512_i32gather_epi32(mch[0], pri,
+				sizeof(pri[0]));
+			np[1] = _mm512_i32gather_epi32(mch[1], pri,
+				sizeof(pri[0]));
+
+			m[0] = _mm512_cmpgt_epi32_mask(cp[0], np[0]);
+			m[1] = _mm512_cmpgt_epi32_mask(cp[1], np[1]);
+
+			cr[0] = _mm512_mask_mov_epi32(nr[0], m[0], cr[0]);
+			cr[1] = _mm512_mask_mov_epi32(nr[1], m[1], cr[1]);
+
+			cp[0] = _mm512_mask_mov_epi32(np[0], m[0], cp[0]);
+			cp[1] = _mm512_mask_mov_epi32(np[1], m[1], cp[1]);
+		}
+
+		_mm512_storeu_si512(result + k, cr[0]);
+		_mm512_storeu_si512(result + j, cr[1]);
+	}
+
+	n = nb_pkt - k;
+	if (n != 0) {
+		if (n > MASK16_BIT) {
+			resolve_sc_avx512x16(result + k, res, pri, match + k,
+				MASK16_BIT, nb_trie, nb_pkt);
+			k += MASK16_BIT;
+			n -= MASK16_BIT;
+		}
+		resolve_sc_avx512x16(result + k, res, pri, match + k, n,
+				nb_trie, nb_pkt);
+	}
+}
+
+static inline int
+search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	uint32_t i, *pm;
+	const struct rte_acl_match_results *pr;
+	struct acl_flow_avx512 flow;
+	uint32_t match[ctx->num_tries * total_packets];
+
+	for (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {
+
+		/* setup for next trie */
+		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
+
+		/* process the trie */
+		search_trie_avx512x16x2(&flow);
+	}
+
+	/* resolve matches */
+	pr = (const struct rte_acl_match_results *)
+		(ctx->trans_table + ctx->match_index);
+
+	if (categories == 1)
+		resolve_sc_avx512x16x2(results, pr, match, total_packets,
+			ctx->num_tries);
+	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
+		resolve_mcle8_avx512x1(results, pr, match, total_packets,
+			categories, ctx->num_tries);
+	else
+		resolve_mcgt8_avx512x1(results, pr, match, total_packets,
+			categories, ctx->num_tries);
+
+	return 0;
+}
-- 
2.17.1


