From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: jerinj@marvell.com, ruifeng.wang@arm.com,
	vladimir.medvedkin@intel.com,
	Konstantin Ananyev <konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths
Date: Tue,  6 Oct 2020 16:03:14 +0100	[thread overview]
Message-ID: <20201006150316.5776-13-konstantin.ananyev@intel.com> (raw)
In-Reply-To: <20201006150316.5776-1-konstantin.ananyev@intel.com>

Current rte_acl_classify_avx512x32() and rte_acl_classify_avx512x16()
code paths are very similar. The only differences are due to the
256-bit/512-bit register and intrinsics naming conventions.
So to deduplicate the code:
  - Move common code into "acl_run_avx512_common.h"
  - Use macros to hide the differences in naming conventions (sketched below)
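
As an illustration of the approach, a minimal sketch of how the naming
macros resolve per flavour (the macro definitions and the usage line are
taken from the patch below):

    /* 512-bit flavour (acl_run_avx512x16.h) */
    #define _T_simd  __m512i
    #define _M_I_(x) _mm512_##x

    /* 256-bit flavour (acl_run_avx512x8.h) */
    #define _T_simd  __m256i
    #define _M_I_(x) _mm256_##x

    /* common code (acl_run_avx512_common.h), e.g.: */
    addr = _M_I_(add_epi32)(addr, t);
    /* expands to _mm512_add_epi32() or _mm256_add_epi32() respectively */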

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl_run_avx512_common.h | 477 +++++++++++++++++++++
 lib/librte_acl/acl_run_avx512x16.h     | 569 ++++---------------------
 lib/librte_acl/acl_run_avx512x8.h      | 565 ++++--------------------
 3 files changed, 654 insertions(+), 957 deletions(-)
 create mode 100644 lib/librte_acl/acl_run_avx512_common.h

diff --git a/lib/librte_acl/acl_run_avx512_common.h b/lib/librte_acl/acl_run_avx512_common.h
new file mode 100644
index 0000000000..1baf79b7ae
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx512_common.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+/*
+ * WARNING: It is not recommended to include this file directly.
+ * Please include "acl_run_avx512x*.h" instead.
+ * To make this file generate proper code, the includer has to
+ * define several macros; refer to "acl_run_avx512x*.h" for more details.
+ */
+
+/*
+ * Calculate the address of the next transition for
+ * all types of nodes. Note that only DFA nodes and range
+ * nodes actually transition to another node. Match
+ * nodes are not supposed to be encountered here.
+ * For quad range nodes:
+ * Calculate number of range boundaries that are less than the
+ * input value. Range boundaries for each node are in signed 8 bit,
+ * ordered from -128 to 127.
+ * This is effectively a popcnt of bytes that are greater than the
+ * input byte.
+ * Single nodes are processed in the same way as quad range nodes.
+ */
+static __rte_always_inline _T_simd
+_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
+	_T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
+{
+	__mmask64 qm;
+	_T_mask dfa_msk;
+	_T_simd addr, in, node_type, r, t;
+	_T_simd dfa_ofs, quad_ofs;
+
+	t = _M_SI_(xor)(index_mask, index_mask);
+	in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
+
+	/* Calc node type and node addr */
+	node_type = _M_SI_(andnot)(index_mask, tr_lo);
+	addr = _M_SI_(and)(index_mask, tr_lo);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
+
+	/* DFA calculations. */
+	r = _M_I_(srli_epi32)(in, 30);
+	r = _M_I_(add_epi8)(r, range_base);
+	t = _M_I_(srli_epi32)(in, 24);
+	r = _M_I_(shuffle_epi8)(tr_hi, r);
+
+	dfa_ofs = _M_I_(sub_epi32)(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
+	t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
+	t = _M_I_(lzcnt_epi32)(t);
+	t = _M_I_(srli_epi32)(t, 3);
+	quad_ofs = _M_I_(sub_epi32)(four_32, t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
+
+	/* calculate address for next transitions. */
+	addr = _M_I_(add_epi32)(addr, t);
+	return addr;
+}
+
+/*
+ * Process _N_ transitions in parallel.
+ * tr_lo contains low 32 bits for _N_ transitions.
+ * tr_hi contains high 32 bits for _N_ transitions.
+ * next_input contains up to 4 input bytes for _N_ flows.
+ */
+static __rte_always_inline _T_simd
+_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
+	_T_simd *tr_hi)
+{
+	const int32_t *tr;
+	_T_simd addr;
+
+	tr = (const int32_t *)(uintptr_t)trans;
+
+	/* Calculate the address (array index) for all _N_ transitions. */
+	addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
+		_SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);
+
+	/* load lower 32 bits of _N_ transitions at once. */
+	*tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));
+
+	next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
+
+	/* load high 32 bits of _N_ transitions at once. */
+	*tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));
+
+	return next_input;
+}
+
+/*
+ * Execute first transition for up to _N_ flows in parallel.
+ * next_input should contain one input byte for up to _N_ flows.
+ * msk - mask of active flows.
+ * tr_lo contains low 32 bits for up to _N_ transitions.
+ * tr_hi contains high 32 bits for up to _N_ transitions.
+ */
+static __rte_always_inline void
+_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
+	_T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
+{
+	const int32_t *tr;
+	_T_simd addr, root;
+
+	tr = (const int32_t *)(uintptr_t)flow->trans;
+
+	addr = _M_I_(set1_epi32)(UINT8_MAX);
+	root = _M_I_(set1_epi32)(flow->root_index);
+
+	addr = _M_SI_(and)(next_input, addr);
+	addr = _M_I_(add_epi32)(root, addr);
+
+	/* load lower 32 bits of _N_ transitions at once. */
+	*tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
+		sizeof(flow->trans[0]));
+
+	/* load high 32 bits of _N_ transitions at once. */
+	*tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
+		sizeof(flow->trans[0]));
+}
+
+/*
+ * Load and return next 4 input bytes for up to _N_ flows in parallel.
+ * pdata - two SIMD registers with pointers to flow input data
+ * msk - mask of active flows.
+ * di - data indexes for these _N_ flows.
+ */
+static inline _T_simd
+_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
+	uint32_t msk, _T_simd *di, uint32_t bnum)
+{
+	const int32_t *div;
+	uint32_t m[2];
+	_T_simd one, zero, t, p[2];
+
+	div = (const int32_t *)flow->data_index;
+
+	one = _M_I_(set1_epi32)(1);
+	zero = _M_SI_(xor)(one, one);
+
+	/* load data offsets for given indexes */
+	t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));
+
+	/* increment data indexes */
+	*di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
+
+	/*
+	 * unsigned expand 32-bit indexes to 64-bit
+	 * (for later pointer arithmetic), i.e:
+	 * for (i = 0; i != _N_; i++)
+	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
+	 */
+	p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
+			t);
+	p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
+			t);
+
+	p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
+	p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
+
+	/* load input byte(s), either one or four */
+
+	m[0] = msk & _SIMD_PTR_MSK_;
+	m[1] = msk >> _SIMD_PTR_NUM_;
+
+	return _F_(gather_bytes)(zero, p, m, bnum);
+}
+
+/*
+ * Start up to _N_ new flows.
+ * num - number of flows to start
+ * msk - mask of new flows.
+ * pdata - pointers to flow input data
+ * idx - match indexes for given flows
+ * di - data indexes for these flows.
+ */
+static inline void
+_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
+	_T_simd pdata[2], _T_simd *idx, _T_simd *di)
+{
+	uint32_t n, m[2], nm[2];
+	_T_simd ni, nd[2];
+
+	/* split mask into two - one for each pdata[] */
+	m[0] = msk & _SIMD_PTR_MSK_;
+	m[1] = msk >> _SIMD_PTR_NUM_;
+
+	/* calculate masks for new flows */
+	n = __builtin_popcount(m[0]);
+	nm[0] = (1 << n) - 1;
+	nm[1] = (1 << (num - n)) - 1;
+
+	/* load input data pointers for new flows */
+	nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
+			flow->idata + flow->num_packets);
+	nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
+			flow->idata + flow->num_packets + n);
+
+	/* calculate match indexes of new flows */
+	ni = _M_I_(set1_epi32)(flow->num_packets);
+	ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
+
+	/* merge new and existing flows data */
+	pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
+	pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
+
+	/* update match and data indexes */
+	*idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
+	*di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
+
+	flow->num_packets += num;
+}
+
+/*
+ * Process found matches for up to _N_ flows.
+ * fmsk - mask of active flows
+ * rmsk - mask of found matches
+ * pdata - pointers to flow input data
+ * di - data indexes for these flows
+ * idx - match indexes for given flows
+ * tr_lo contains low 32 bits for up to _N_ transitions.
+ * tr_hi contains high 32 bits for up to _N_ transitions.
+ */
+static inline uint32_t
+_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
+	uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
+	_T_simd *tr_lo, _T_simd *tr_hi)
+{
+	uint32_t n;
+	_T_simd res;
+
+	if (rmsk[0] == 0)
+		return 0;
+
+	/* extract match indexes */
+	res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
+
+	/* mask  matched transitions to nop */
+	tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
+	tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
+
+	/* save found match indexes */
+	_M_I_(mask_i32scatter_epi32)(flow->matches, rmsk[0], idx[0], res,
+			sizeof(flow->matches[0]));
+
+	/* update masks and start new flows for matches */
+	n = update_flow_mask(flow, fmsk, rmsk);
+	_F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
+
+	return n;
+}
+
+/*
+ * Test for matches up to (2 * _N_) flows at once,
+ * if matches exist - process them and start new flows.
+ */
+static inline void
+_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
+	_T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
+	_T_simd tr_lo[2], _T_simd tr_hi[2])
+{
+	uint32_t n[2];
+	uint32_t rm[2];
+
+	/* check for matches */
+	rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
+	rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
+
+	/* till unprocessed matches exist */
+	while ((rm[0] | rm[1]) != 0) {
+
+		/* process matches and start new flows */
+		n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
+			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
+		n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
+			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
+
+		/* execute first transition for new flows, if any */
+
+		if (n[0] != 0) {
+			inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
+					rm[0], &di[0], flow->first_load_sz);
+			_F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
+					&tr_hi[0]);
+			rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
+					_SV_(match_mask));
+		}
+
+		if (n[1] != 0) {
+			inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
+					rm[1], &di[1], flow->first_load_sz);
+			_F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
+					&tr_hi[1]);
+			rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
+					_SV_(match_mask));
+		}
+	}
+}
+
+/*
+ * Perform search for up to (2 * _N_) flows in parallel.
+ * Use two sets of metadata, each serves _N_ flows max.
+ */
+static inline void
+_F_(search_trie)(struct acl_flow_avx512 *flow)
+{
+	uint32_t fm[2];
+	_T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
+
+	/* first 1B load */
+	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
+			&pdata[0], &idx[0], &di[0]);
+	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
+			&pdata[2], &idx[1], &di[1]);
+
+	in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
+			flow->first_load_sz);
+	in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
+			flow->first_load_sz);
+
+	_F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
+	_F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
+
+	fm[0] = _SIMD_MASK_MAX_;
+	fm[1] = _SIMD_MASK_MAX_;
+
+	/* match check */
+	_F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
+
+	while ((fm[0] | fm[1]) != 0) {
+
+		/* load next 4B */
+
+		in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
+				&di[0], sizeof(uint32_t));
+		in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
+				&di[1], sizeof(uint32_t));
+
+		/* main 4B loop */
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		/* check for matches */
+		_F_(match_check_process)(flow, fm, pdata, di, idx, in,
+			tr_lo, tr_hi);
+	}
+}
+
+/*
+ * resolve match index to actual result/priority offset.
+ */
+static inline _T_simd
+_F_(resolve_match_idx)(_T_simd mi)
+{
+	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
+		1 << (match_log + 2));
+	return _M_I_(slli_epi32)(mi, match_log);
+}
+
+/*
+ * Resolve multiple matches for the same flow based on priority.
+ */
+static inline _T_simd
+_F_(resolve_pri)(const int32_t res[], const int32_t pri[],
+	const uint32_t match[], _T_mask msk, uint32_t nb_trie,
+	uint32_t nb_skip)
+{
+	uint32_t i;
+	const uint32_t *pm;
+	_T_mask m;
+	_T_simd cp, cr, np, nr, mch;
+
+	const _T_simd zero = _M_I_(set1_epi32)(0);
+
+	/* get match indexes */
+	mch = _M_I_(maskz_loadu_epi32)(msk, match);
+	mch = _F_(resolve_match_idx)(mch);
+
+	/* read result and priority values for first trie */
+	cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
+	cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));
+
+	/*
+	 * read result and priority values for next tries and select one
+	 * with highest priority.
+	 */
+	for (i = 1, pm = match + nb_skip; i != nb_trie;
+			i++, pm += nb_skip) {
+
+		mch = _M_I_(maskz_loadu_epi32)(msk, pm);
+		mch = _F_(resolve_match_idx)(mch);
+
+		nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
+				sizeof(res[0]));
+		np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
+				sizeof(pri[0]));
+
+		m = _M_I_(cmpgt_epi32_mask)(cp, np);
+		cr = _M_I_(mask_mov_epi32)(nr, m, cr);
+		cp = _M_I_(mask_mov_epi32)(np, m, cp);
+	}
+
+	return cr;
+}
+
+/*
+ * Resolve num (<= _N_) matches for single category
+ */
+static inline void
+_F_(resolve_sc)(uint32_t result[], const int32_t res[],
+	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
+	uint32_t nb_trie, uint32_t nb_skip)
+{
+	_T_mask msk;
+	_T_simd cr;
+
+	msk = (1 << nb_pkt) - 1;
+	cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
+	_M_I_(mask_storeu_epi32)(result, msk, cr);
+}
+
+/*
+ * Resolve matches for single category
+ */
+static inline void
+_F_(resolve_single_cat)(uint32_t result[],
+	const struct rte_acl_match_results pr[], const uint32_t match[],
+	uint32_t nb_pkt, uint32_t nb_trie)
+{
+	uint32_t j, k, n;
+	const int32_t *res, *pri;
+	_T_simd cr[2];
+
+	res = (const int32_t *)pr->results;
+	pri = pr->priority;
+
+	for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
+
+		j = k + _SIMD_MASK_BIT_;
+
+		cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
+				nb_trie, nb_pkt);
+		cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
+				nb_trie, nb_pkt);
+
+		_M_SI_(storeu)((void *)(result + k), cr[0]);
+		_M_SI_(storeu)((void *)(result + j), cr[1]);
+	}
+
+	n = nb_pkt - k;
+	if (n != 0) {
+		if (n > _SIMD_MASK_BIT_) {
+			_F_(resolve_sc)(result + k, res, pri, match + k,
+				_SIMD_MASK_BIT_, nb_trie, nb_pkt);
+			k += _SIMD_MASK_BIT_;
+			n -= _SIMD_MASK_BIT_;
+		}
+		_F_(resolve_sc)(result + k, res, pri, match + k, n,
+				nb_trie, nb_pkt);
+	}
+}
diff --git a/lib/librte_acl/acl_run_avx512x16.h b/lib/librte_acl/acl_run_avx512x16.h
index a39df8f3c0..da244bc257 100644
--- a/lib/librte_acl/acl_run_avx512x16.h
+++ b/lib/librte_acl/acl_run_avx512x16.h
@@ -2,16 +2,57 @@
  * Copyright(c) 2020 Intel Corporation
  */
 
-#define	MASK16_BIT	(sizeof(__mmask16) * CHAR_BIT)
+/*
+ * Defines required by "acl_run_avx512_common.h".
+ * Note that all of them have to be undefined by the end
+ * of this file, as "acl_run_avx512_common.h" can be included several
+ * times from different *.h files for the same *.c.
+ */
+
+/*
+ * This implementation uses 512-bit registers (zmm) and intrinsics.
+ * So our main SIMD type is 512 bits wide and each such variable can
+ * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
+ */
+#define _T_simd		__m512i
+#define _T_mask		__mmask16
+
+/* Naming convention for static const variables. */
+#define _SC_(x)		zmm_##x
+#define _SV_(x)		(zmm_##x.z)
+
+/* Naming convention for internal functions. */
+#define _F_(x)		x##_avx512x16
+
+/*
+ * The same intrinsics have different syntax (depending on the bit-width),
+ * so to overcome that a few macros need to be defined.
+ */
+
+/* Naming convention for generic epi (packed integers) type intrinsics. */
+#define _M_I_(x)	_mm512_##x
+
+/* Naming convention for si (whole SIMD integer) type intrinsics. */
+#define _M_SI_(x)	_mm512_##x##_si512
+
+/* Naming convention for masked gather type intrinsics. */
+#define _M_MGI_(x)	_mm512_##x
+
+/* Naming convention for gather type intrinsics. */
+#define _M_GI_(name, idx, base, scale)	_mm512_##name(idx, base, scale)
 
-#define NUM_AVX512X16X2	(2 * MASK16_BIT)
-#define MSK_AVX512X16X2	(NUM_AVX512X16X2 - 1)
+/* num/mask of transitions per SIMD regs */
+#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
+#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
+
+#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
+#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)
 
 /* num/mask of pointers per SIMD regs */
-#define ZMM_PTR_NUM	(sizeof(__m512i) / sizeof(uintptr_t))
-#define ZMM_PTR_MSK	RTE_LEN2MASK(ZMM_PTR_NUM, uint32_t)
+#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
+#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
 
-static const __rte_x86_zmm_t zmm_match_mask = {
+static const __rte_x86_zmm_t _SC_(match_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
 		RTE_ACL_NODE_MATCH,
@@ -32,7 +73,7 @@ static const __rte_x86_zmm_t zmm_match_mask = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_index_mask = {
+static const __rte_x86_zmm_t _SC_(index_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
 		RTE_ACL_NODE_INDEX,
@@ -53,7 +94,7 @@ static const __rte_x86_zmm_t zmm_index_mask = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_trlo_idle = {
+static const __rte_x86_zmm_t _SC_(trlo_idle) = {
 	.u32 = {
 		RTE_ACL_IDLE_NODE,
 		RTE_ACL_IDLE_NODE,
@@ -74,7 +115,7 @@ static const __rte_x86_zmm_t zmm_trlo_idle = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_trhi_idle = {
+static const __rte_x86_zmm_t _SC_(trhi_idle) = {
 	.u32 = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
@@ -83,7 +124,7 @@ static const __rte_x86_zmm_t zmm_trhi_idle = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_shuffle_input = {
+static const __rte_x86_zmm_t _SC_(shuffle_input) = {
 	.u32 = {
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
@@ -92,7 +133,7 @@ static const __rte_x86_zmm_t zmm_shuffle_input = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_four_32 = {
+static const __rte_x86_zmm_t _SC_(four_32) = {
 	.u32 = {
 		4, 4, 4, 4,
 		4, 4, 4, 4,
@@ -101,7 +142,7 @@ static const __rte_x86_zmm_t zmm_four_32 = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_idx_add = {
+static const __rte_x86_zmm_t _SC_(idx_add) = {
 	.u32 = {
 		0, 1, 2, 3,
 		4, 5, 6, 7,
@@ -110,7 +151,7 @@ static const __rte_x86_zmm_t zmm_idx_add = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_range_base = {
+static const __rte_x86_zmm_t _SC_(range_base) = {
 	.u32 = {
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
@@ -119,16 +160,16 @@ static const __rte_x86_zmm_t zmm_range_base = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_pminp = {
+static const __rte_x86_zmm_t _SC_(pminp) = {
 	.u32 = {
 		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
 	},
 };
 
-static const __mmask16 zmm_pmidx_msk = 0x5555;
+static const _T_mask _SC_(pmidx_msk) = 0x5555;
 
-static const __rte_x86_zmm_t zmm_pmidx[2] = {
+static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
 	[0] = {
 		.u32 = {
 			0, 0, 1, 0, 2, 0, 3, 0,
@@ -148,7 +189,7 @@ static const __rte_x86_zmm_t zmm_pmidx[2] = {
  * gather load on a byte quantity. So we have to mimic it in SW,
  * by doing 8x1B scalar loads.
  */
-static inline ymm_t
+static inline __m256i
 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 {
 	rte_ymm_t v;
@@ -156,7 +197,7 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 
 	static const uint32_t zero;
 
-	p.z = _mm512_mask_set1_epi64(pdata, mask ^ ZMM_PTR_MSK,
+	p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
 		(uintptr_t)&zero);
 
 	v.u32[0] = *(uint8_t *)p.u64[0];
@@ -172,369 +213,29 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 }
 
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes not supposed to be encountered here.
- * For quad range nodes:
- * Calculate number of range boundaries that are less than the
- * input value. Range boundaries for each node are in signed 8 bit,
- * ordered from -128 to 127.
- * This is effectively a popcnt of bytes that are greater than the
- * input byte.
- * Single nodes are processed in the same ways as quad range nodes.
- */
-static __rte_always_inline __m512i
-calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,
-	__m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)
-{
-	__mmask64 qm;
-	__mmask16 dfa_msk;
-	__m512i addr, in, node_type, r, t;
-	__m512i dfa_ofs, quad_ofs;
-
-	t = _mm512_xor_si512(index_mask, index_mask);
-	in = _mm512_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm512_andnot_si512(index_mask, tr_lo);
-	addr = _mm512_and_si512(index_mask, tr_lo);
-
-	/* mask for DFA type(0) nodes */
-	dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);
-
-	/* DFA calculations. */
-	r = _mm512_srli_epi32(in, 30);
-	r = _mm512_add_epi8(r, range_base);
-	t = _mm512_srli_epi32(in, 24);
-	r = _mm512_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm512_sub_epi32(t, r);
-
-	/* QUAD/SINGLE calculations. */
-	qm = _mm512_cmpgt_epi8_mask(in, tr_hi);
-	t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
-	t = _mm512_lzcnt_epi32(t);
-	t = _mm512_srli_epi32(t, 3);
-	quad_ofs = _mm512_sub_epi32(four_32, t);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
-
-	/* calculate address for next transitions. */
-	addr = _mm512_add_epi32(addr, t);
-	return addr;
-}
-
-/*
- * Process 16 transitions in parallel.
- * tr_lo contains low 32 bits for 16 transition.
- * tr_hi contains high 32 bits for 16 transition.
- * next_input contains up to 4 input bytes for 16 flows.
+ * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
  */
 static __rte_always_inline __m512i
-transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,
-	__m512i *tr_hi)
-{
-	const int32_t *tr;
-	__m512i addr;
-
-	tr = (const int32_t *)(uintptr_t)trans;
-
-	/* Calculate the address (array index) for all 16 transitions. */
-	addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,
-		zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));
-
-	next_input = _mm512_srli_epi32(next_input, CHAR_BIT);
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));
-
-	return next_input;
-}
-
-/*
- * Execute first transition for up to 16 flows in parallel.
- * next_input should contain one input byte for up to 16 flows.
- * msk - mask of active flows.
- * tr_lo contains low 32 bits for up to 16 transitions.
- * tr_hi contains high 32 bits for up to 16 transitions.
- */
-static __rte_always_inline void
-first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,
-	__mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)
+_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
+	uint32_t bnum)
 {
-	const int32_t *tr;
-	__m512i addr, root;
-
-	tr = (const int32_t *)(uintptr_t)flow->trans;
-
-	addr = _mm512_set1_epi32(UINT8_MAX);
-	root = _mm512_set1_epi32(flow->root_index);
-
-	addr = _mm512_and_si512(next_input, addr);
-	addr = _mm512_add_epi32(root, addr);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,
-		sizeof(flow->trans[0]));
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
-		sizeof(flow->trans[0]));
-}
-
-/*
- * Load and return next 4 input bytes for up to 16 flows in parallel.
- * pdata - 8x2 pointers to flow input data
- * mask - mask of active flows.
- * di - data indexes for these 16 flows.
- */
-static inline __m512i
-get_next_bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],
-	uint32_t msk, __m512i *di, uint32_t bnum)
-{
-	const int32_t *div;
-	uint32_t m[2];
-	__m512i one, zero, t, p[2];
-	ymm_t inp[2];
-
-	div = (const int32_t *)flow->data_index;
-
-	one = _mm512_set1_epi32(1);
-	zero = _mm512_xor_si512(one, one);
-
-	/* load data offsets for given indexes */
-	t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
-
-	/* increment data indexes */
-	*di = _mm512_mask_add_epi32(*di, msk, *di, one);
-
-	/*
-	 * unsigned expand 32-bit indexes to 64-bit
-	 * (for later pointer arithmetic), i.e:
-	 * for (i = 0; i != 16; i++)
-	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
-	 */
-	p[0] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[0].z, t);
-	p[1] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[1].z, t);
-
-	p[0] = _mm512_add_epi64(p[0], pdata[0]);
-	p[1] = _mm512_add_epi64(p[1], pdata[1]);
-
-	/* load input byte(s), either one or four */
-
-	m[0] = msk & ZMM_PTR_MSK;
-	m[1] = msk >> ZMM_PTR_NUM;
+	__m256i inp[2];
 
 	if (bnum == sizeof(uint8_t)) {
 		inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
 		inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
 	} else {
 		inp[0] = _mm512_mask_i64gather_epi32(
-				_mm512_castsi512_si256(zero), m[0], p[0],
-				NULL, sizeof(uint8_t));
+				_mm512_castsi512_si256(zero),
+				m[0], p[0], NULL, sizeof(uint8_t));
 		inp[1] = _mm512_mask_i64gather_epi32(
-				_mm512_castsi512_si256(zero), m[1], p[1],
-				NULL, sizeof(uint8_t));
+				_mm512_castsi512_si256(zero),
+				m[1], p[1], NULL, sizeof(uint8_t));
 	}
 
 	/* squeeze input into one 512-bit register */
 	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
-			zmm_pminp.z, _mm512_castsi256_si512(inp[1]));
-}
-
-/*
- * Start up to 16 new flows.
- * num - number of flows to start
- * msk - mask of new flows.
- * pdata - pointers to flow input data
- * idx - match indexed for given flows
- * di - data indexes for these flows.
- */
-static inline void
-start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
-	__m512i pdata[2], __m512i *idx, __m512i *di)
-{
-	uint32_t n, m[2], nm[2];
-	__m512i ni, nd[2];
-
-	/* split mask into two - one for each pdata[] */
-	m[0] = msk & ZMM_PTR_MSK;
-	m[1] = msk >> ZMM_PTR_NUM;
-
-	/* calculate masks for new flows */
-	n = __builtin_popcount(m[0]);
-	nm[0] = (1 << n) - 1;
-	nm[1] = (1 << (num - n)) - 1;
-
-	/* load input data pointers for new flows */
-	nd[0] = _mm512_maskz_loadu_epi64(nm[0],
-		flow->idata + flow->num_packets);
-	nd[1] = _mm512_maskz_loadu_epi64(nm[1],
-		flow->idata + flow->num_packets + n);
-
-	/* calculate match indexes of new flows */
-	ni = _mm512_set1_epi32(flow->num_packets);
-	ni = _mm512_add_epi32(ni, zmm_idx_add.z);
-
-	/* merge new and existing flows data */
-	pdata[0] = _mm512_mask_expand_epi64(pdata[0], m[0], nd[0]);
-	pdata[1] = _mm512_mask_expand_epi64(pdata[1], m[1], nd[1]);
-
-	/* update match and data indexes */
-	*idx = _mm512_mask_expand_epi32(*idx, msk, ni);
-	*di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);
-
-	flow->num_packets += num;
-}
-
-/*
- * Process found matches for up to 16 flows.
- * fmsk - mask of active flows
- * rmsk - mask of found matches
- * pdata - pointers to flow input data
- * di - data indexes for these flows
- * idx - match indexed for given flows
- * tr_lo contains low 32 bits for up to 8 transitions.
- * tr_hi contains high 32 bits for up to 8 transitions.
- */
-static inline uint32_t
-match_process_avx512x16(struct acl_flow_avx512 *flow, uint32_t *fmsk,
-	uint32_t *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,
-	__m512i *tr_lo, __m512i *tr_hi)
-{
-	uint32_t n;
-	__m512i res;
-
-	if (rmsk[0] == 0)
-		return 0;
-
-	/* extract match indexes */
-	res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);
-
-	/* mask  matched transitions to nop */
-	tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);
-	tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);
-
-	/* save found match indexes */
-	_mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],
-		idx[0], res, sizeof(flow->matches[0]));
-
-	/* update masks and start new flows for matches */
-	n = update_flow_mask(flow, fmsk, rmsk);
-	start_flow16(flow, n, rmsk[0], pdata, idx, di);
-
-	return n;
-}
-
-/*
- * Test for matches ut to 32 (2x16) flows at once,
- * if matches exist - process them and start new flows.
- */
-static inline void
-match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
-	__m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],
-	__m512i tr_lo[2], __m512i tr_hi[2])
-{
-	uint32_t n[2];
-	uint32_t rm[2];
-
-	/* check for matches */
-	rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);
-	rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);
-
-	/* till unprocessed matches exist */
-	while ((rm[0] | rm[1]) != 0) {
-
-		/* process matches and start new flows */
-		n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],
-			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
-		n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],
-			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
-
-		/* execute first transition for new flows, if any */
-
-		if (n[0] != 0) {
-			inp[0] = get_next_bytes_avx512x16(flow, &pdata[0],
-				rm[0], &di[0], flow->first_load_sz);
-			first_trans16(flow, inp[0], rm[0], &tr_lo[0],
-				&tr_hi[0]);
-			rm[0] = _mm512_test_epi32_mask(tr_lo[0],
-				zmm_match_mask.z);
-		}
-
-		if (n[1] != 0) {
-			inp[1] = get_next_bytes_avx512x16(flow, &pdata[2],
-				rm[1], &di[1], flow->first_load_sz);
-			first_trans16(flow, inp[1], rm[1], &tr_lo[1],
-				&tr_hi[1]);
-			rm[1] = _mm512_test_epi32_mask(tr_lo[1],
-				zmm_match_mask.z);
-		}
-	}
-}
-
-/*
- * Perform search for up to 32 flows in parallel.
- * Use two sets of metadata, each serves 16 flows max.
- * So in fact we perform search for 2x16 flows.
- */
-static inline void
-search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
-{
-	uint32_t fm[2];
-	__m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
-
-	/* first 1B load */
-	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);
-	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);
-
-	in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0],
-			flow->first_load_sz);
-	in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1],
-			flow->first_load_sz);
-
-	first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
-	first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);
-
-	fm[0] = UINT16_MAX;
-	fm[1] = UINT16_MAX;
-
-	/* match check */
-	match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
-		tr_lo, tr_hi);
-
-	while ((fm[0] | fm[1]) != 0) {
-
-		/* load next 4B */
-
-		in[0] = get_next_bytes_avx512x16(flow, &pdata[0], fm[0],
-			&di[0], sizeof(uint32_t));
-		in[1] = get_next_bytes_avx512x16(flow, &pdata[2], fm[1],
-			&di[1], sizeof(uint32_t));
-
-		/* main 4B loop */
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		/* check for matches */
-		match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
-			tr_lo, tr_hi);
-	}
+			_SV_(pminp), _mm512_castsi256_si512(inp[1]));
 }
 
 /*
@@ -582,120 +283,12 @@ resolve_mcgt8_avx512x1(uint32_t result[],
 	}
 }
 
-/*
- * resolve match index to actual result/priority offset.
- */
-static inline __m512i
-resolve_match_idx_avx512x16(__m512i mi)
-{
-	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
-		1 << (match_log + 2));
-	return _mm512_slli_epi32(mi, match_log);
-}
-
-/*
- * Resolve multiple matches for the same flow based on priority.
- */
-static inline __m512i
-resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],
-	const uint32_t match[], __mmask16 msk, uint32_t nb_trie,
-	uint32_t nb_skip)
-{
-	uint32_t i;
-	const uint32_t *pm;
-	__mmask16 m;
-	__m512i cp, cr, np, nr, mch;
-
-	const __m512i zero = _mm512_set1_epi32(0);
-
-	/* get match indexes */
-	mch = _mm512_maskz_loadu_epi32(msk, match);
-	mch = resolve_match_idx_avx512x16(mch);
-
-	/* read result and priority values for first trie */
-	cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
-	cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
-
-	/*
-	 * read result and priority values for next tries and select one
-	 * with highest priority.
-	 */
-	for (i = 1, pm = match + nb_skip; i != nb_trie;
-			i++, pm += nb_skip) {
-
-		mch = _mm512_maskz_loadu_epi32(msk, pm);
-		mch = resolve_match_idx_avx512x16(mch);
-
-		nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,
-			sizeof(res[0]));
-		np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,
-			sizeof(pri[0]));
-
-		m = _mm512_cmpgt_epi32_mask(cp, np);
-		cr = _mm512_mask_mov_epi32(nr, m, cr);
-		cp = _mm512_mask_mov_epi32(np, m, cp);
-	}
-
-	return cr;
-}
-
-/*
- * Resolve num (<= 16) matches for single category
- */
-static inline void
-resolve_sc_avx512x16(uint32_t result[], const int32_t res[],
-	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
-	uint32_t nb_trie, uint32_t nb_skip)
-{
-	__mmask16 msk;
-	__m512i cr;
-
-	msk = (1 << nb_pkt) - 1;
-	cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);
-	_mm512_mask_storeu_epi32(result, msk, cr);
-}
+#include "acl_run_avx512_common.h"
 
 /*
- * Resolve matches for single category
+ * Perform search for up to (2 * 16) flows in parallel.
+ * Use two sets of metadata, each serves 16 flows max.
  */
-static inline void
-resolve_sc_avx512x16x2(uint32_t result[],
-	const struct rte_acl_match_results pr[], const uint32_t match[],
-	uint32_t nb_pkt, uint32_t nb_trie)
-{
-	uint32_t j, k, n;
-	const int32_t *res, *pri;
-	__m512i cr[2];
-
-	res = (const int32_t *)pr->results;
-	pri = pr->priority;
-
-	for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {
-
-		j = k + MASK16_BIT;
-
-		cr[0] = resolve_pri_avx512x16(res, pri, match + k, UINT16_MAX,
-				nb_trie, nb_pkt);
-		cr[1] = resolve_pri_avx512x16(res, pri, match + j, UINT16_MAX,
-				nb_trie, nb_pkt);
-
-		_mm512_storeu_si512(result + k, cr[0]);
-		_mm512_storeu_si512(result + j, cr[1]);
-	}
-
-	n = nb_pkt - k;
-	if (n != 0) {
-		if (n > MASK16_BIT) {
-			resolve_sc_avx512x16(result + k, res, pri, match + k,
-				MASK16_BIT, nb_trie, nb_pkt);
-			k += MASK16_BIT;
-			n -= MASK16_BIT;
-		}
-		resolve_sc_avx512x16(result + k, res, pri, match + k, n,
-				nb_trie, nb_pkt);
-	}
-}
-
 static inline int
 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -711,7 +304,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 
 		/* process the trie */
-		search_trie_avx512x16x2(&flow);
+		_F_(search_trie)(&flow);
 	}
 
 	/* resolve matches */
@@ -719,7 +312,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		(ctx->trans_table + ctx->match_index);
 
 	if (categories == 1)
-		resolve_sc_avx512x16x2(results, pr, match, total_packets,
+		_F_(resolve_single_cat)(results, pr, match, total_packets,
 			ctx->num_tries);
 	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
 		resolve_mcle8_avx512x1(results, pr, match, total_packets,
@@ -730,3 +323,19 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
+
+#undef _SIMD_PTR_MSK_
+#undef _SIMD_PTR_NUM_
+#undef _SIMD_FLOW_MSK_
+#undef _SIMD_FLOW_NUM_
+#undef _SIMD_MASK_MAX_
+#undef _SIMD_MASK_BIT_
+#undef _M_GI_
+#undef _M_MGI_
+#undef _M_SI_
+#undef _M_I_
+#undef _F_
+#undef _SV_
+#undef _SC_
+#undef _T_mask
+#undef _T_simd
diff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h
index fedd79b9ae..61ac9d1b47 100644
--- a/lib/librte_acl/acl_run_avx512x8.h
+++ b/lib/librte_acl/acl_run_avx512x8.h
@@ -2,16 +2,57 @@
  * Copyright(c) 2020 Intel Corporation
  */
 
-#define MASK8_BIT	(sizeof(__mmask8) * CHAR_BIT)
+/*
+ * Defines required by "acl_run_avx512_common.h".
+ * Note that all of them have to be undefined by the end
+ * of this file, as "acl_run_avx512_common.h" can be included several
+ * times from different *.h files for the same *.c.
+ */
+
+/*
+ * This implementation uses 256-bit registers (ymm) and intrinsics.
+ * So our main SIMD type is 256 bits wide and each such variable can
+ * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
+ */
+#define _T_simd		__m256i
+#define _T_mask		__mmask8
+
+/* Naming convention for static const variables. */
+#define _SC_(x)		ymm_##x
+#define _SV_(x)		(ymm_##x.y)
+
+/* Naming convention for internal functions. */
+#define _F_(x)		x##_avx512x8
+
+/*
+ * The same intrinsics have different syntax (depending on the bit-width),
+ * so to overcome that a few macros need to be defined.
+ */
+
+/* Naming convention for generic epi (packed integers) type intrinsics. */
+#define _M_I_(x)	_mm256_##x
+
+/* Naming convention for si (whole SIMD integer) type intrinsics. */
+#define _M_SI_(x)	_mm256_##x##_si256
 
-#define NUM_AVX512X8X2	(2 * MASK8_BIT)
-#define MSK_AVX512X8X2	(NUM_AVX512X8X2 - 1)
+/* Naming convention for masked gather type intrinsics. */
+#define _M_MGI_(x)	_mm256_m##x
+
+/* Naming convention for gather type intrinsics. */
+#define _M_GI_(name, idx, base, scale)	_mm256_##name(base, idx, scale)
+
+/* num/mask of transitions per SIMD regs */
+#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
+#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
+
+#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
+#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)
 
 /* num/mask of pointers per SIMD regs */
-#define YMM_PTR_NUM	(sizeof(__m256i) / sizeof(uintptr_t))
-#define YMM_PTR_MSK	RTE_LEN2MASK(YMM_PTR_NUM, uint32_t)
+#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
+#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
 
-static const rte_ymm_t ymm_match_mask = {
+static const rte_ymm_t _SC_(match_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
 		RTE_ACL_NODE_MATCH,
@@ -24,7 +65,7 @@ static const rte_ymm_t ymm_match_mask = {
 	},
 };
 
-static const rte_ymm_t ymm_index_mask = {
+static const rte_ymm_t _SC_(index_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
 		RTE_ACL_NODE_INDEX,
@@ -37,7 +78,7 @@ static const rte_ymm_t ymm_index_mask = {
 	},
 };
 
-static const rte_ymm_t ymm_trlo_idle = {
+static const rte_ymm_t _SC_(trlo_idle) = {
 	.u32 = {
 		RTE_ACL_IDLE_NODE,
 		RTE_ACL_IDLE_NODE,
@@ -50,51 +91,51 @@ static const rte_ymm_t ymm_trlo_idle = {
 	},
 };
 
-static const rte_ymm_t ymm_trhi_idle = {
+static const rte_ymm_t _SC_(trhi_idle) = {
 	.u32 = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
 	},
 };
 
-static const rte_ymm_t ymm_shuffle_input = {
+static const rte_ymm_t _SC_(shuffle_input) = {
 	.u32 = {
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 	},
 };
 
-static const rte_ymm_t ymm_four_32 = {
+static const rte_ymm_t _SC_(four_32) = {
 	.u32 = {
 		4, 4, 4, 4,
 		4, 4, 4, 4,
 	},
 };
 
-static const rte_ymm_t ymm_idx_add = {
+static const rte_ymm_t _SC_(idx_add) = {
 	.u32 = {
 		0, 1, 2, 3,
 		4, 5, 6, 7,
 	},
 };
 
-static const rte_ymm_t ymm_range_base = {
+static const rte_ymm_t _SC_(range_base) = {
 	.u32 = {
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 	},
 };
 
-static const rte_ymm_t ymm_pminp = {
+static const rte_ymm_t _SC_(pminp) = {
 	.u32 = {
 		0x00, 0x01, 0x02, 0x03,
 		0x08, 0x09, 0x0a, 0x0b,
 	},
 };
 
-static const __mmask16 ymm_pmidx_msk = 0x55;
+static const __mmask16 _SC_(pmidx_msk) = 0x55;
 
-static const rte_ymm_t ymm_pmidx[2] = {
+static const rte_ymm_t _SC_(pmidx[2]) = {
 	[0] = {
 		.u32 = {
 			0, 0, 1, 0, 2, 0, 3, 0,
@@ -120,7 +161,7 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
 
 	static const uint32_t zero;
 
-	p.y = _mm256_mask_set1_epi64(pdata, mask ^ YMM_PTR_MSK,
+	p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
 		(uintptr_t)&zero);
 
 	v.u32[0] = *(uint8_t *)p.u64[0];
@@ -132,483 +173,37 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
 }
 
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes not supposed to be encountered here.
- * For quad range nodes:
- * Calculate number of range boundaries that are less than the
- * input value. Range boundaries for each node are in signed 8 bit,
- * ordered from -128 to 127.
- * This is effectively a popcnt of bytes that are greater than the
- * input byte.
- * Single nodes are processed in the same ways as quad range nodes.
- */
-static __rte_always_inline __m256i
-calc_addr8(__m256i index_mask, __m256i next_input, __m256i shuffle_input,
-	__m256i four_32, __m256i range_base, __m256i tr_lo, __m256i tr_hi)
-{
-	__mmask32 qm;
-	__mmask8 dfa_msk;
-	__m256i addr, in, node_type, r, t;
-	__m256i dfa_ofs, quad_ofs;
-
-	t = _mm256_xor_si256(index_mask, index_mask);
-	in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm256_andnot_si256(index_mask, tr_lo);
-	addr = _mm256_and_si256(index_mask, tr_lo);
-
-	/* mask for DFA type(0) nodes */
-	dfa_msk = _mm256_cmpeq_epi32_mask(node_type, t);
-
-	/* DFA calculations. */
-	r = _mm256_srli_epi32(in, 30);
-	r = _mm256_add_epi8(r, range_base);
-	t = _mm256_srli_epi32(in, 24);
-	r = _mm256_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm256_sub_epi32(t, r);
-
-	/* QUAD/SINGLE calculations. */
-	qm = _mm256_cmpgt_epi8_mask(in, tr_hi);
-	t = _mm256_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
-	t = _mm256_lzcnt_epi32(t);
-	t = _mm256_srli_epi32(t, 3);
-	quad_ofs = _mm256_sub_epi32(four_32, t);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm256_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
-
-	/* calculate address for next transitions. */
-	addr = _mm256_add_epi32(addr, t);
-	return addr;
-}
-
-/*
- * Process 16 transitions in parallel.
- * tr_lo contains low 32 bits for 16 transition.
- * tr_hi contains high 32 bits for 16 transition.
- * next_input contains up to 4 input bytes for 16 flows.
+ * Gather 4/1 input bytes for up to 8 (2*4) locations in parallel.
  */
 static __rte_always_inline __m256i
-transition8(__m256i next_input, const uint64_t *trans, __m256i *tr_lo,
-	__m256i *tr_hi)
-{
-	const int32_t *tr;
-	__m256i addr;
-
-	tr = (const int32_t *)(uintptr_t)trans;
-
-	/* Calculate the address (array index) for all 8 transitions. */
-	addr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-		ymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
-
-	next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
-
-	return next_input;
-}
-
-/*
- * Execute first transition for up to 16 flows in parallel.
- * next_input should contain one input byte for up to 16 flows.
- * msk - mask of active flows.
- * tr_lo contains low 32 bits for up to 16 transitions.
- * tr_hi contains high 32 bits for up to 16 transitions.
- */
-static __rte_always_inline void
-first_trans8(const struct acl_flow_avx512 *flow, __m256i next_input,
-	__mmask8 msk, __m256i *tr_lo, __m256i *tr_hi)
+_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
+	uint32_t bnum)
 {
-	const int32_t *tr;
-	__m256i addr, root;
-
-	tr = (const int32_t *)(uintptr_t)flow->trans;
-
-	addr = _mm256_set1_epi32(UINT8_MAX);
-	root = _mm256_set1_epi32(flow->root_index);
-
-	addr = _mm256_and_si256(next_input, addr);
-	addr = _mm256_add_epi32(root, addr);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr,
-		sizeof(flow->trans[0]));
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
-		sizeof(flow->trans[0]));
-}
-
-/*
- * Load and return next 4 input bytes for up to 16 flows in parallel.
- * pdata - 8x2 pointers to flow input data
- * mask - mask of active flows.
- * di - data indexes for these 16 flows.
- */
-static inline __m256i
-get_next_bytes_avx512x8(const struct acl_flow_avx512 *flow, __m256i pdata[2],
-	uint32_t msk, __m256i *di, uint32_t bnum)
-{
-	const int32_t *div;
-	uint32_t m[2];
-	__m256i one, zero, t, p[2];
 	__m128i inp[2];
 
-	div = (const int32_t *)flow->data_index;
-
-	one = _mm256_set1_epi32(1);
-	zero = _mm256_xor_si256(one, one);
-
-	/* load data offsets for given indexes */
-	t = _mm256_mmask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
-
-	/* increment data indexes */
-	*di = _mm256_mask_add_epi32(*di, msk, *di, one);
-
-	/*
-	 * unsigned expand 32-bit indexes to 64-bit
-	 * (for later pointer arithmetic), i.e:
-	 * for (i = 0; i != 16; i++)
-	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
-	 */
-	p[0] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[0].y, t);
-	p[1] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[1].y, t);
-
-	p[0] = _mm256_add_epi64(p[0], pdata[0]);
-	p[1] = _mm256_add_epi64(p[1], pdata[1]);
-
-	/* load input byte(s), either one or four */
-
-	m[0] = msk & YMM_PTR_MSK;
-	m[1] = msk >> YMM_PTR_NUM;
-
 	if (bnum == sizeof(uint8_t)) {
 		inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
 		inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
 	} else {
 		inp[0] = _mm256_mmask_i64gather_epi32(
-				_mm256_castsi256_si128(zero), m[0], p[0],
-				NULL, sizeof(uint8_t));
+				_mm256_castsi256_si128(zero),
+				m[0], p[0], NULL, sizeof(uint8_t));
 		inp[1] = _mm256_mmask_i64gather_epi32(
-				_mm256_castsi256_si128(zero), m[1], p[1],
-				NULL, sizeof(uint8_t));
+				_mm256_castsi256_si128(zero),
+				m[1], p[1], NULL, sizeof(uint8_t));
 	}
 
-	/* squeeze input into one 512-bit register */
+	/* squeeze input into one 256-bit register */
 	return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
-			ymm_pminp.y,  _mm256_castsi128_si256(inp[1]));
-}
-
-/*
- * Start up to 16 new flows.
- * num - number of flows to start
- * msk - mask of new flows.
- * pdata - pointers to flow input data
- * idx - match indexed for given flows
- * di - data indexes for these flows.
- */
-static inline void
-start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
-	__m256i pdata[2], __m256i *idx, __m256i *di)
-{
-	uint32_t n, m[2], nm[2];
-	__m256i ni, nd[2];
-
-	m[0] = msk & YMM_PTR_MSK;
-	m[1] = msk >> YMM_PTR_NUM;
-
-	n = __builtin_popcount(m[0]);
-	nm[0] = (1 << n) - 1;
-	nm[1] = (1 << (num - n)) - 1;
-
-	/* load input data pointers for new flows */
-	nd[0] = _mm256_maskz_loadu_epi64(nm[0],
-		flow->idata + flow->num_packets);
-	nd[1] = _mm256_maskz_loadu_epi64(nm[1],
-		flow->idata + flow->num_packets + n);
-
-	/* calculate match indexes of new flows */
-	ni = _mm256_set1_epi32(flow->num_packets);
-	ni = _mm256_add_epi32(ni, ymm_idx_add.y);
-
-	/* merge new and existing flows data */
-	pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
-	pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
-
-	/* update match and data indexes */
-	*idx = _mm256_mask_expand_epi32(*idx, msk, ni);
-	*di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di);
-
-	flow->num_packets += num;
-}
-
-/*
- * Process found matches for up to 16 flows.
- * fmsk - mask of active flows
- * rmsk - mask of found matches
- * pdata - pointers to flow input data
- * di - data indexes for these flows
- * idx - match indexed for given flows
- * tr_lo contains low 32 bits for up to 8 transitions.
- * tr_hi contains high 32 bits for up to 8 transitions.
- */
-static inline uint32_t
-match_process_avx512x8(struct acl_flow_avx512 *flow, uint32_t *fmsk,
-	uint32_t *rmsk, __m256i pdata[2], __m256i *di, __m256i *idx,
-	__m256i *tr_lo, __m256i *tr_hi)
-{
-	uint32_t n;
-	__m256i res;
-
-	if (rmsk[0] == 0)
-		return 0;
-
-	/* extract match indexes */
-	res = _mm256_and_si256(tr_lo[0], ymm_index_mask.y);
-
-	/* mask  matched transitions to nop */
-	tr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y);
-	tr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y);
-
-	/* save found match indexes */
-	_mm256_mask_i32scatter_epi32(flow->matches, rmsk[0],
-		idx[0], res, sizeof(flow->matches[0]));
-
-	/* update masks and start new flows for matches */
-	n = update_flow_mask(flow, fmsk, rmsk);
-	start_flow8(flow, n, rmsk[0], pdata, idx, di);
-
-	return n;
-}
-
-/*
- * Test for matches ut to 32 (2x16) flows at once,
- * if matches exist - process them and start new flows.
- */
-static inline void
-match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
-	__m256i pdata[4], __m256i di[2], __m256i idx[2], __m256i inp[2],
-	__m256i tr_lo[2], __m256i tr_hi[2])
-{
-	uint32_t n[2];
-	uint32_t rm[2];
-
-	/* check for matches */
-	rm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y);
-	rm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y);
-
-	/* till unprocessed matches exist */
-	while ((rm[0] | rm[1]) != 0) {
-
-		/* process matches and start new flows */
-		n[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0],
-			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
-		n[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[2],
-			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
-
-		/* execute first transition for new flows, if any */
-
-		if (n[0] != 0) {
-			inp[0] = get_next_bytes_avx512x8(flow, &pdata[0],
-				rm[0], &di[0], flow->first_load_sz);
-			first_trans8(flow, inp[0], rm[0], &tr_lo[0],
-				&tr_hi[0]);
-			rm[0] = _mm256_test_epi32_mask(tr_lo[0],
-				ymm_match_mask.y);
-		}
-
-		if (n[1] != 0) {
-			inp[1] = get_next_bytes_avx512x8(flow, &pdata[2],
-				rm[1], &di[1], flow->first_load_sz);
-			first_trans8(flow, inp[1], rm[1], &tr_lo[1],
-				&tr_hi[1]);
-			rm[1] = _mm256_test_epi32_mask(tr_lo[1],
-				ymm_match_mask.y);
-		}
-	}
-}
-
-/*
- * Perform search for up to 32 flows in parallel.
- * Use two sets of metadata, each serves 16 flows max.
- * So in fact we perform search for 2x16 flows.
- */
-static inline void
-search_trie_avx512x8x2(struct acl_flow_avx512 *flow)
-{
-	uint32_t fm[2];
-	__m256i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
-
-	/* first 1B load */
-	start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]);
-	start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[2], &idx[1], &di[1]);
-
-	in[0] = get_next_bytes_avx512x8(flow, &pdata[0], UINT8_MAX, &di[0],
-			flow->first_load_sz);
-	in[1] = get_next_bytes_avx512x8(flow, &pdata[2], UINT8_MAX, &di[1],
-			flow->first_load_sz);
-
-	first_trans8(flow, in[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]);
-	first_trans8(flow, in[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]);
-
-	fm[0] = UINT8_MAX;
-	fm[1] = UINT8_MAX;
-
-	/* match check */
-	match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,
-		tr_lo, tr_hi);
-
-	while ((fm[0] | fm[1]) != 0) {
-
-		/* load next 4B */
-
-		in[0] = get_next_bytes_avx512x8(flow, &pdata[0], fm[0],
-			&di[0], sizeof(uint32_t));
-		in[1] = get_next_bytes_avx512x8(flow, &pdata[2], fm[1],
-			&di[1], sizeof(uint32_t));
-
-		/* main 4B loop */
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		/* check for matches */
-		match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,
-			tr_lo, tr_hi);
-	}
-}
-
-/*
- * resolve match index to actual result/priority offset.
- */
-static inline __m256i
-resolve_match_idx_avx512x8(__m256i mi)
-{
-	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
-		1 << (match_log + 2));
-	return _mm256_slli_epi32(mi, match_log);
+			_SV_(pminp), _mm256_castsi128_si256(inp[1]));
 }
 
-/*
- * Resolve multiple matches for the same flow based on priority.
- */
-static inline __m256i
-resolve_pri_avx512x8(const int32_t res[], const int32_t pri[],
-	const uint32_t match[], __mmask8 msk, uint32_t nb_trie,
-	uint32_t nb_skip)
-{
-	uint32_t i;
-	const uint32_t *pm;
-	__mmask16 m;
-	__m256i cp, cr, np, nr, mch;
-
-	const __m256i zero = _mm256_set1_epi32(0);
-
-	/* get match indexes */
-	mch = _mm256_maskz_loadu_epi32(msk, match);
-	mch = resolve_match_idx_avx512x8(mch);
-
-	/* read result and priority values for first trie */
-	cr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
-	cp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
-
-	/*
-	 * read result and priority values for next tries and select one
-	 * with highest priority.
-	 */
-	for (i = 1, pm = match + nb_skip; i != nb_trie;
-			i++, pm += nb_skip) {
-
-		mch = _mm256_maskz_loadu_epi32(msk, pm);
-		mch = resolve_match_idx_avx512x8(mch);
-
-		nr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res,
-			sizeof(res[0]));
-		np = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri,
-			sizeof(pri[0]));
-
-		m = _mm256_cmpgt_epi32_mask(cp, np);
-		cr = _mm256_mask_mov_epi32(nr, m, cr);
-		cp = _mm256_mask_mov_epi32(np, m, cp);
-	}
-
-	return cr;
-}
-
-/*
- * Resolve num (<= 8) matches for single category
- */
-static inline void
-resolve_sc_avx512x8(uint32_t result[], const int32_t res[],
-	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
-	uint32_t nb_trie, uint32_t nb_skip)
-{
-	__mmask8 msk;
-	__m256i cr;
-
-	msk = (1 << nb_pkt) - 1;
-	cr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip);
-	_mm256_mask_storeu_epi32(result, msk, cr);
-}
+#include "acl_run_avx512_common.h"
 
 /*
- * Resolve matches for single category
+ * Perform search for up to (2 * 8) flows in parallel.
+ * Use two sets of metadata, each serves 8 flows max.
  */
-static inline void
-resolve_sc_avx512x8x2(uint32_t result[],
-	const struct rte_acl_match_results pr[], const uint32_t match[],
-	uint32_t nb_pkt, uint32_t nb_trie)
-{
-	uint32_t j, k, n;
-	const int32_t *res, *pri;
-	__m256i cr[2];
-
-	res = (const int32_t *)pr->results;
-	pri = pr->priority;
-
-	for (k = 0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) {
-
-		j = k + MASK8_BIT;
-
-		cr[0] = resolve_pri_avx512x8(res, pri, match + k, UINT8_MAX,
-				nb_trie, nb_pkt);
-		cr[1] = resolve_pri_avx512x8(res, pri, match + j, UINT8_MAX,
-				nb_trie, nb_pkt);
-
-		_mm256_storeu_si256((void *)(result + k), cr[0]);
-		_mm256_storeu_si256((void *)(result + j), cr[1]);
-	}
-
-	n = nb_pkt - k;
-	if (n != 0) {
-		if (n > MASK8_BIT) {
-			resolve_sc_avx512x8(result + k, res, pri, match + k,
-				MASK8_BIT, nb_trie, nb_pkt);
-			k += MASK8_BIT;
-			n -= MASK8_BIT;
-		}
-		resolve_sc_avx512x8(result + k, res, pri, match + k, n,
-				nb_trie, nb_pkt);
-	}
-}
-
 static inline int
 search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -624,7 +219,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 
 		/* process the trie */
-		search_trie_avx512x8x2(&flow);
+		_F_(search_trie)(&flow);
 	}
 
 	/* resolve matches */
@@ -632,7 +227,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		(ctx->trans_table + ctx->match_index);
 
 	if (categories == 1)
-		resolve_sc_avx512x8x2(results, pr, match, total_packets,
+		_F_(resolve_single_cat)(results, pr, match, total_packets,
 			ctx->num_tries);
 	else
 		resolve_mcle8_avx512x1(results, pr, match, total_packets,
@@ -640,3 +235,19 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
+
+#undef _SIMD_PTR_MSK_
+#undef _SIMD_PTR_NUM_
+#undef _SIMD_FLOW_MSK_
+#undef _SIMD_FLOW_NUM_
+#undef _SIMD_MASK_MAX_
+#undef _SIMD_MASK_BIT_
+#undef _M_GI_
+#undef _M_MGI_
+#undef _M_SI_
+#undef _M_I_
+#undef _F_
+#undef _SV_
+#undef _SC_
+#undef _T_mask
+#undef _T_simd
-- 
2.17.1
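
As a rough illustration of how the deduplication works: each flavour-specific header is expected to define the macro layer that acl_run_avx512_common.h consumes (the same names that are #undef'd at the end of the hunk above), include the common body once, and then clear the definitions so the other register width can repeat the process. The minimal sketch below shows only a handful of the full macro set for a 256-bit (8-lane) includer; the type and intrinsic aliases are assumptions made for illustration, and the _avx512x8 function suffix is hypothetical rather than the exact spelling used in acl_run_avx512x8.h.

#include <immintrin.h>			/* __m256i, __mmask8, AVX-512VL intrinsics */

/* 256-bit (8 flows per vector) flavour -- illustrative values only */
#define _T_simd		__m256i		/* vector type used by the common code */
#define _T_mask		__mmask8	/* one predicate bit per 32-bit lane */
#define _F_(x)		x##_avx512x8	/* hypothetical per-flavour function suffix */
#define _M_I_(x)	_mm256_##x	/* e.g. _M_I_(srli_epi32) -> _mm256_srli_epi32 */
#define _M_SI_(x)	_mm256_##x##_si256	/* e.g. _M_SI_(xor) -> _mm256_xor_si256 */

#include "acl_run_avx512_common.h"	/* shared body; emits the flavour-specific
					 * helpers such as _F_(search_trie) and
					 * _F_(resolve_single_cat) used by the
					 * top-level search routine */

#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _T_mask
#undef _T_simd

The 512-bit path would presumably repeat the same pattern with __m512i, a wider mask type and _mm512_* aliases, which is why the common header is not meant to be included directly and why every macro is #undef'd once the flavour-specific code has been generated.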


Thread overview: 70+ messages
2020-08-07 16:28 [dpdk-dev] [PATCH 20.11 0/7] acl: introduce AVX512 classify method Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 1/7] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 2/7] app/acl: few small improvements Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 3/7] acl: remove of unused enum value Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 4/7] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 5/7] app/acl: add AVX512 classify support Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 6/7] acl: introduce AVX512 classify implementation Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 7/7] acl: enhance " Konstantin Ananyev
2020-09-15 16:50 ` [dpdk-dev] [PATCH v2 00/12] acl: introduce AVX512 classify method Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 01/12] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 02/12] doc: fix mixing classify methods in ACL guide Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 03/12] acl: remove of unused enum value Konstantin Ananyev
2020-09-27  3:27     ` Ruifeng Wang
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 04/12] acl: remove library constructor Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 05/12] app/acl: few small improvements Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 06/12] test/acl: expand classify test coverage Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 07/12] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-09-16  9:11     ` Bruce Richardson
2020-09-16  9:36       ` Medvedkin, Vladimir
2020-09-16  9:49         ` Bruce Richardson
2020-09-16 10:06           ` Ananyev, Konstantin
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 08/12] acl: introduce AVX512 classify implementation Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 09/12] acl: enhance " Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 10/12] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 11/12] test/acl: add AVX512 classify support Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 12/12] app/acl: " Konstantin Ananyev
2020-10-05 18:45   ` [dpdk-dev] [PATCH v3 00/14] acl: introduce AVX512 classify methods Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 01/14] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 02/14] doc: fix missing classify methods in ACL guide Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 03/14] acl: remove of unused enum value Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 04/14] acl: remove library constructor Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 05/14] app/acl: few small improvements Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 06/14] test/acl: expand classify test coverage Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 07/14] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 08/14] acl: introduce 256-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 09/14] acl: update default classify algorithm selection Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 10/14] acl: introduce 512-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 11/14] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 12/14] acl: deduplicate AVX512 code paths Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 13/14] test/acl: add AVX512 classify support Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 14/14] app/acl: " Konstantin Ananyev
2020-10-06 15:03     ` [dpdk-dev] [PATCH v4 00/14] acl: introduce AVX512 classify methods Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 01/14] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-10-08 13:42         ` [dpdk-dev] [dpdk-stable] " David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 02/14] doc: fix missing classify methods in ACL guide Konstantin Ananyev
2020-10-08 13:42         ` David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 03/14] acl: remove of unused enum value Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 04/14] acl: remove library constructor Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 05/14] app/acl: few small improvements Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 06/14] test/acl: expand classify test coverage Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 07/14] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-10-13 19:17         ` David Marchand
2020-10-13 22:26           ` Ananyev, Konstantin
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 08/14] acl: introduce 256-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 09/14] acl: update default classify algorithm selection Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 10/14] acl: introduce 512-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 11/14] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths Konstantin Ananyev [this message]
2020-10-16 15:56         ` [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths Ferruh Yigit
2020-10-16 16:20           ` Thomas Monjalon
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 13/14] test/acl: add AVX512 classify support Konstantin Ananyev
2020-10-14 10:26         ` David Marchand
2020-10-14 10:32           ` Ananyev, Konstantin
2020-10-14 10:35             ` David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 14/14] app/acl: " Konstantin Ananyev
2020-10-14 12:40       ` [dpdk-dev] [PATCH v4 00/14] acl: introduce AVX512 classify methods David Marchand
2020-10-06 15:05     ` [dpdk-dev] [PATCH v3 " David Marchand
2020-10-06 16:07       ` Ananyev, Konstantin
2020-10-08 10:49         ` David Marchand
2020-10-14  9:23         ` Kinsella, Ray
