From: Konstantin Ananyev <konstantin.ananyev@intel.com>
To: dev@dpdk.org
Cc: jerinj@marvell.com, ruifeng.wang@arm.com,
	vladimir.medvedkin@intel.com,
	Konstantin Ananyev <konstantin.ananyev@intel.com>
Subject: [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths
Date: Tue,  6 Oct 2020 16:03:14 +0100	[thread overview]
Message-ID: <20201006150316.5776-13-konstantin.ananyev@intel.com> (raw)
In-Reply-To: <20201006150316.5776-1-konstantin.ananyev@intel.com>

Current rte_acl_classify_avx512x32() and rte_acl_classify_avx512x16()
code paths are very similar. The only differences are due to the
256-bit/512-bit register and intrinsics naming conventions.
So to deduplicate the code:
  - Move common code into "acl_run_avx512_common.h"
  - Use macros to hide the differences in naming conventions (sketched below)
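
As an illustration of the approach, a minimal sketch of how the naming
macros resolve per flavour (the macro definitions and the usage line are
taken from the patch below):

    /* 512-bit flavour (acl_run_avx512x16.h) */
    #define _T_simd  __m512i
    #define _M_I_(x) _mm512_##x

    /* 256-bit flavour (acl_run_avx512x8.h) */
    #define _T_simd  __m256i
    #define _M_I_(x) _mm256_##x

    /* common code (acl_run_avx512_common.h), e.g.: */
    addr = _M_I_(add_epi32)(addr, t);
    /* expands to _mm512_add_epi32() or _mm256_add_epi32() respectively */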

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
---
 lib/librte_acl/acl_run_avx512_common.h | 477 +++++++++++++++++++++
 lib/librte_acl/acl_run_avx512x16.h     | 569 ++++---------------------
 lib/librte_acl/acl_run_avx512x8.h      | 565 ++++--------------------
 3 files changed, 654 insertions(+), 957 deletions(-)
 create mode 100644 lib/librte_acl/acl_run_avx512_common.h

diff --git a/lib/librte_acl/acl_run_avx512_common.h b/lib/librte_acl/acl_run_avx512_common.h
new file mode 100644
index 0000000000..1baf79b7ae
--- /dev/null
+++ b/lib/librte_acl/acl_run_avx512_common.h
@@ -0,0 +1,477 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+/*
+ * WARNING: It is not recommended to include this file directly.
+ * Please include "acl_run_avx512x*.h" instead.
+ * To make this file generate proper code, the includer has to
+ * define several macros; refer to "acl_run_avx512x*.h" for more details.
+ */
+
+/*
+ * Calculate the address of the next transition for
+ * all types of nodes. Note that only DFA nodes and range
+ * nodes actually transition to another node. Match
+ * nodes are not supposed to be encountered here.
+ * For quad range nodes:
+ * Calculate number of range boundaries that are less than the
+ * input value. Range boundaries for each node are in signed 8 bit,
+ * ordered from -128 to 127.
+ * This is effectively a popcnt of bytes that are greater than the
+ * input byte.
+ * Single nodes are processed in the same way as quad range nodes.
+ */
+static __rte_always_inline _T_simd
+_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
+	_T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
+{
+	__mmask64 qm;
+	_T_mask dfa_msk;
+	_T_simd addr, in, node_type, r, t;
+	_T_simd dfa_ofs, quad_ofs;
+
+	t = _M_SI_(xor)(index_mask, index_mask);
+	in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
+
+	/* Calc node type and node addr */
+	node_type = _M_SI_(andnot)(index_mask, tr_lo);
+	addr = _M_SI_(and)(index_mask, tr_lo);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
+
+	/* DFA calculations. */
+	r = _M_I_(srli_epi32)(in, 30);
+	r = _M_I_(add_epi8)(r, range_base);
+	t = _M_I_(srli_epi32)(in, 24);
+	r = _M_I_(shuffle_epi8)(tr_hi, r);
+
+	dfa_ofs = _M_I_(sub_epi32)(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
+	t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
+	t = _M_I_(lzcnt_epi32)(t);
+	t = _M_I_(srli_epi32)(t, 3);
+	quad_ofs = _M_I_(sub_epi32)(four_32, t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
+
+	/* calculate address for next transitions. */
+	addr = _M_I_(add_epi32)(addr, t);
+	return addr;
+}
+
+/*
+ * Process _N_ transitions in parallel.
+ * tr_lo contains low 32 bits for _N_ transitions.
+ * tr_hi contains high 32 bits for _N_ transitions.
+ * next_input contains up to 4 input bytes for _N_ flows.
+ */
+static __rte_always_inline _T_simd
+_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
+	_T_simd *tr_hi)
+{
+	const int32_t *tr;
+	_T_simd addr;
+
+	tr = (const int32_t *)(uintptr_t)trans;
+
+	/* Calculate the address (array index) for all _N_ transitions. */
+	addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
+		_SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);
+
+	/* load lower 32 bits of _N_ transitions at once. */
+	*tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));
+
+	next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
+
+	/* load high 32 bits of _N_ transitions at once. */
+	*tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));
+
+	return next_input;
+}
+
+/*
+ * Execute first transition for up to _N_ flows in parallel.
+ * next_input should contain one input byte for up to _N_ flows.
+ * msk - mask of active flows.
+ * tr_lo contains low 32 bits for up to _N_ transitions.
+ * tr_hi contains high 32 bits for up to _N_ transitions.
+ */
+static __rte_always_inline void
+_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
+	_T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
+{
+	const int32_t *tr;
+	_T_simd addr, root;
+
+	tr = (const int32_t *)(uintptr_t)flow->trans;
+
+	addr = _M_I_(set1_epi32)(UINT8_MAX);
+	root = _M_I_(set1_epi32)(flow->root_index);
+
+	addr = _M_SI_(and)(next_input, addr);
+	addr = _M_I_(add_epi32)(root, addr);
+
+	/* load lower 32 bits of _N_ transitions at once. */
+	*tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
+		sizeof(flow->trans[0]));
+
+	/* load high 32 bits of _N_ transitions at once. */
+	*tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
+		sizeof(flow->trans[0]));
+}
+
+/*
+ * Load and return next 4 input bytes for up to _N_ flows in parallel.
+ * pdata - two SIMD registers with pointers to flow input data
+ * msk - mask of active flows.
+ * di - data indexes for these _N_ flows.
+ */
+static inline _T_simd
+_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
+	uint32_t msk, _T_simd *di, uint32_t bnum)
+{
+	const int32_t *div;
+	uint32_t m[2];
+	_T_simd one, zero, t, p[2];
+
+	div = (const int32_t *)flow->data_index;
+
+	one = _M_I_(set1_epi32)(1);
+	zero = _M_SI_(xor)(one, one);
+
+	/* load data offsets for given indexes */
+	t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));
+
+	/* increment data indexes */
+	*di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
+
+	/*
+	 * unsigned expand 32-bit indexes to 64-bit
+	 * (for later pointer arithmetic), i.e:
+	 * for (i = 0; i != _N_; i++)
+	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
+	 */
+	p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
+			t);
+	p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
+			t);
+
+	p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
+	p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
+
+	/* load input byte(s), either one or four */
+
+	m[0] = msk & _SIMD_PTR_MSK_;
+	m[1] = msk >> _SIMD_PTR_NUM_;
+
+	return _F_(gather_bytes)(zero, p, m, bnum);
+}
+
+/*
+ * Start up to _N_ new flows.
+ * num - number of flows to start
+ * msk - mask of new flows.
+ * pdata - pointers to flow input data
+ * idx - match indexes for given flows
+ * di - data indexes for these flows.
+ */
+static inline void
+_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
+	_T_simd pdata[2], _T_simd *idx, _T_simd *di)
+{
+	uint32_t n, m[2], nm[2];
+	_T_simd ni, nd[2];
+
+	/* split mask into two - one for each pdata[] */
+	m[0] = msk & _SIMD_PTR_MSK_;
+	m[1] = msk >> _SIMD_PTR_NUM_;
+
+	/* calculate masks for new flows */
+	n = __builtin_popcount(m[0]);
+	nm[0] = (1 << n) - 1;
+	nm[1] = (1 << (num - n)) - 1;
+
+	/* load input data pointers for new flows */
+	nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
+			flow->idata + flow->num_packets);
+	nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
+			flow->idata + flow->num_packets + n);
+
+	/* calculate match indexes of new flows */
+	ni = _M_I_(set1_epi32)(flow->num_packets);
+	ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
+
+	/* merge new and existing flows data */
+	pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
+	pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
+
+	/* update match and data indexes */
+	*idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
+	*di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
+
+	flow->num_packets += num;
+}
+
+/*
+ * Process found matches for up to _N_ flows.
+ * fmsk - mask of active flows
+ * rmsk - mask of found matches
+ * pdata - pointers to flow input data
+ * di - data indexes for these flows
+ * idx - match indexes for given flows
+ * tr_lo contains low 32 bits for up to _N_ transitions.
+ * tr_hi contains high 32 bits for up to _N_ transitions.
+ */
+static inline uint32_t
+_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
+	uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
+	_T_simd *tr_lo, _T_simd *tr_hi)
+{
+	uint32_t n;
+	_T_simd res;
+
+	if (rmsk[0] == 0)
+		return 0;
+
+	/* extract match indexes */
+	res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
+
+	/* mask  matched transitions to nop */
+	tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
+	tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
+
+	/* save found match indexes */
+	_M_I_(mask_i32scatter_epi32)(flow->matches, rmsk[0], idx[0], res,
+			sizeof(flow->matches[0]));
+
+	/* update masks and start new flows for matches */
+	n = update_flow_mask(flow, fmsk, rmsk);
+	_F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
+
+	return n;
+}
+
+/*
+ * Test for matches up to (2 * _N_) flows at once,
+ * if matches exist - process them and start new flows.
+ */
+static inline void
+_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
+	_T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
+	_T_simd tr_lo[2], _T_simd tr_hi[2])
+{
+	uint32_t n[2];
+	uint32_t rm[2];
+
+	/* check for matches */
+	rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
+	rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
+
+	/* till unprocessed matches exist */
+	while ((rm[0] | rm[1]) != 0) {
+
+		/* process matches and start new flows */
+		n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
+			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
+		n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
+			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
+
+		/* execute first transition for new flows, if any */
+
+		if (n[0] != 0) {
+			inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
+					rm[0], &di[0], flow->first_load_sz);
+			_F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
+					&tr_hi[0]);
+			rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
+					_SV_(match_mask));
+		}
+
+		if (n[1] != 0) {
+			inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
+					rm[1], &di[1], flow->first_load_sz);
+			_F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
+					&tr_hi[1]);
+			rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
+					_SV_(match_mask));
+		}
+	}
+}
+
+/*
+ * Perform search for up to (2 * _N_) flows in parallel.
+ * Use two sets of metadata, each serves _N_ flows max.
+ */
+static inline void
+_F_(search_trie)(struct acl_flow_avx512 *flow)
+{
+	uint32_t fm[2];
+	_T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
+
+	/* first 1B load */
+	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
+			&pdata[0], &idx[0], &di[0]);
+	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
+			&pdata[2], &idx[1], &di[1]);
+
+	in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
+			flow->first_load_sz);
+	in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
+			flow->first_load_sz);
+
+	_F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
+	_F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
+
+	fm[0] = _SIMD_MASK_MAX_;
+	fm[1] = _SIMD_MASK_MAX_;
+
+	/* match check */
+	_F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
+
+	while ((fm[0] | fm[1]) != 0) {
+
+		/* load next 4B */
+
+		in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
+				&di[0], sizeof(uint32_t));
+		in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
+				&di[1], sizeof(uint32_t));
+
+		/* main 4B loop */
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
+		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
+
+		/* check for matches */
+		_F_(match_check_process)(flow, fm, pdata, di, idx, in,
+			tr_lo, tr_hi);
+	}
+}
+
+/*
+ * resolve match index to actual result/priority offset.
+ */
+static inline _T_simd
+_F_(resolve_match_idx)(_T_simd mi)
+{
+	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
+		1 << (match_log + 2));
+	return _M_I_(slli_epi32)(mi, match_log);
+}
+
+/*
+ * Resolve multiple matches for the same flow based on priority.
+ */
+static inline _T_simd
+_F_(resolve_pri)(const int32_t res[], const int32_t pri[],
+	const uint32_t match[], _T_mask msk, uint32_t nb_trie,
+	uint32_t nb_skip)
+{
+	uint32_t i;
+	const uint32_t *pm;
+	_T_mask m;
+	_T_simd cp, cr, np, nr, mch;
+
+	const _T_simd zero = _M_I_(set1_epi32)(0);
+
+	/* get match indexes */
+	mch = _M_I_(maskz_loadu_epi32)(msk, match);
+	mch = _F_(resolve_match_idx)(mch);
+
+	/* read result and priority values for first trie */
+	cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
+	cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));
+
+	/*
+	 * read result and priority values for next tries and select one
+	 * with highest priority.
+	 */
+	for (i = 1, pm = match + nb_skip; i != nb_trie;
+			i++, pm += nb_skip) {
+
+		mch = _M_I_(maskz_loadu_epi32)(msk, pm);
+		mch = _F_(resolve_match_idx)(mch);
+
+		nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
+				sizeof(res[0]));
+		np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
+				sizeof(pri[0]));
+
+		m = _M_I_(cmpgt_epi32_mask)(cp, np);
+		cr = _M_I_(mask_mov_epi32)(nr, m, cr);
+		cp = _M_I_(mask_mov_epi32)(np, m, cp);
+	}
+
+	return cr;
+}
+
+/*
+ * Resolve num (<= _N_) matches for single category
+ */
+static inline void
+_F_(resolve_sc)(uint32_t result[], const int32_t res[],
+	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
+	uint32_t nb_trie, uint32_t nb_skip)
+{
+	_T_mask msk;
+	_T_simd cr;
+
+	msk = (1 << nb_pkt) - 1;
+	cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
+	_M_I_(mask_storeu_epi32)(result, msk, cr);
+}
+
+/*
+ * Resolve matches for single category
+ */
+static inline void
+_F_(resolve_single_cat)(uint32_t result[],
+	const struct rte_acl_match_results pr[], const uint32_t match[],
+	uint32_t nb_pkt, uint32_t nb_trie)
+{
+	uint32_t j, k, n;
+	const int32_t *res, *pri;
+	_T_simd cr[2];
+
+	res = (const int32_t *)pr->results;
+	pri = pr->priority;
+
+	for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
+
+		j = k + _SIMD_MASK_BIT_;
+
+		cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
+				nb_trie, nb_pkt);
+		cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
+				nb_trie, nb_pkt);
+
+		_M_SI_(storeu)((void *)(result + k), cr[0]);
+		_M_SI_(storeu)((void *)(result + j), cr[1]);
+	}
+
+	n = nb_pkt - k;
+	if (n != 0) {
+		if (n > _SIMD_MASK_BIT_) {
+			_F_(resolve_sc)(result + k, res, pri, match + k,
+				_SIMD_MASK_BIT_, nb_trie, nb_pkt);
+			k += _SIMD_MASK_BIT_;
+			n -= _SIMD_MASK_BIT_;
+		}
+		_F_(resolve_sc)(result + k, res, pri, match + k, n,
+				nb_trie, nb_pkt);
+	}
+}
diff --git a/lib/librte_acl/acl_run_avx512x16.h b/lib/librte_acl/acl_run_avx512x16.h
index a39df8f3c0..da244bc257 100644
--- a/lib/librte_acl/acl_run_avx512x16.h
+++ b/lib/librte_acl/acl_run_avx512x16.h
@@ -2,16 +2,57 @@
  * Copyright(c) 2020 Intel Corporation
  */
 
-#define	MASK16_BIT	(sizeof(__mmask16) * CHAR_BIT)
+/*
+ * Defines required by "acl_run_avx512_common.h".
+ * Note that all of them have to be undefined by the end
+ * of this file, as "acl_run_avx512_common.h" can be included several
+ * times from different *.h files for the same *.c.
+ */
+
+/*
+ * This implementation uses 512-bit registers (zmm) and intrinsics.
+ * So our main SIMD type is 512 bits wide and each such variable can
+ * process sizeof(__m512i) / sizeof(uint32_t) == 16 entries in parallel.
+ */
+#define _T_simd		__m512i
+#define _T_mask		__mmask16
+
+/* Naming convention for static const variables. */
+#define _SC_(x)		zmm_##x
+#define _SV_(x)		(zmm_##x.z)
+
+/* Naming convention for internal functions. */
+#define _F_(x)		x##_avx512x16
+
+/*
+ * The same intrinsics have different syntax (depending on the bit-width),
+ * so to overcome that a few macros need to be defined.
+ */
+
+/* Naming convention for generic epi (packed integers) type intrinsics. */
+#define _M_I_(x)	_mm512_##x
+
+/* Naming convention for si (whole SIMD integer) type intrinsics. */
+#define _M_SI_(x)	_mm512_##x##_si512
+
+/* Naming convention for masked gather type intrinsics. */
+#define _M_MGI_(x)	_mm512_##x
+
+/* Naming convention for gather type intrinsics. */
+#define _M_GI_(name, idx, base, scale)	_mm512_##name(idx, base, scale)
 
-#define NUM_AVX512X16X2	(2 * MASK16_BIT)
-#define MSK_AVX512X16X2	(NUM_AVX512X16X2 - 1)
+/* num/mask of transitions per SIMD regs */
+#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
+#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
+
+#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
+#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)
 
 /* num/mask of pointers per SIMD regs */
-#define ZMM_PTR_NUM	(sizeof(__m512i) / sizeof(uintptr_t))
-#define ZMM_PTR_MSK	RTE_LEN2MASK(ZMM_PTR_NUM, uint32_t)
+#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
+#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
 
-static const __rte_x86_zmm_t zmm_match_mask = {
+static const __rte_x86_zmm_t _SC_(match_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
 		RTE_ACL_NODE_MATCH,
@@ -32,7 +73,7 @@ static const __rte_x86_zmm_t zmm_match_mask = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_index_mask = {
+static const __rte_x86_zmm_t _SC_(index_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
 		RTE_ACL_NODE_INDEX,
@@ -53,7 +94,7 @@ static const __rte_x86_zmm_t zmm_index_mask = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_trlo_idle = {
+static const __rte_x86_zmm_t _SC_(trlo_idle) = {
 	.u32 = {
 		RTE_ACL_IDLE_NODE,
 		RTE_ACL_IDLE_NODE,
@@ -74,7 +115,7 @@ static const __rte_x86_zmm_t zmm_trlo_idle = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_trhi_idle = {
+static const __rte_x86_zmm_t _SC_(trhi_idle) = {
 	.u32 = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
@@ -83,7 +124,7 @@ static const __rte_x86_zmm_t zmm_trhi_idle = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_shuffle_input = {
+static const __rte_x86_zmm_t _SC_(shuffle_input) = {
 	.u32 = {
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
@@ -92,7 +133,7 @@ static const __rte_x86_zmm_t zmm_shuffle_input = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_four_32 = {
+static const __rte_x86_zmm_t _SC_(four_32) = {
 	.u32 = {
 		4, 4, 4, 4,
 		4, 4, 4, 4,
@@ -101,7 +142,7 @@ static const __rte_x86_zmm_t zmm_four_32 = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_idx_add = {
+static const __rte_x86_zmm_t _SC_(idx_add) = {
 	.u32 = {
 		0, 1, 2, 3,
 		4, 5, 6, 7,
@@ -110,7 +151,7 @@ static const __rte_x86_zmm_t zmm_idx_add = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_range_base = {
+static const __rte_x86_zmm_t _SC_(range_base) = {
 	.u32 = {
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
@@ -119,16 +160,16 @@ static const __rte_x86_zmm_t zmm_range_base = {
 	},
 };
 
-static const __rte_x86_zmm_t zmm_pminp = {
+static const __rte_x86_zmm_t _SC_(pminp) = {
 	.u32 = {
 		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
 	},
 };
 
-static const __mmask16 zmm_pmidx_msk = 0x5555;
+static const _T_mask _SC_(pmidx_msk) = 0x5555;
 
-static const __rte_x86_zmm_t zmm_pmidx[2] = {
+static const __rte_x86_zmm_t _SC_(pmidx[2]) = {
 	[0] = {
 		.u32 = {
 			0, 0, 1, 0, 2, 0, 3, 0,
@@ -148,7 +189,7 @@ static const __rte_x86_zmm_t zmm_pmidx[2] = {
  * gather load on a byte quantity. So we have to mimic it in SW,
  * by doing 8x1B scalar loads.
  */
-static inline ymm_t
+static inline __m256i
 _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 {
 	rte_ymm_t v;
@@ -156,7 +197,7 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 
 	static const uint32_t zero;
 
-	p.z = _mm512_mask_set1_epi64(pdata, mask ^ ZMM_PTR_MSK,
+	p.z = _mm512_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
 		(uintptr_t)&zero);
 
 	v.u32[0] = *(uint8_t *)p.u64[0];
@@ -172,369 +213,29 @@ _m512_mask_gather_epi8x8(__m512i pdata, __mmask8 mask)
 }
 
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes not supposed to be encountered here.
- * For quad range nodes:
- * Calculate number of range boundaries that are less than the
- * input value. Range boundaries for each node are in signed 8 bit,
- * ordered from -128 to 127.
- * This is effectively a popcnt of bytes that are greater than the
- * input byte.
- * Single nodes are processed in the same ways as quad range nodes.
- */
-static __rte_always_inline __m512i
-calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,
-	__m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)
-{
-	__mmask64 qm;
-	__mmask16 dfa_msk;
-	__m512i addr, in, node_type, r, t;
-	__m512i dfa_ofs, quad_ofs;
-
-	t = _mm512_xor_si512(index_mask, index_mask);
-	in = _mm512_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm512_andnot_si512(index_mask, tr_lo);
-	addr = _mm512_and_si512(index_mask, tr_lo);
-
-	/* mask for DFA type(0) nodes */
-	dfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);
-
-	/* DFA calculations. */
-	r = _mm512_srli_epi32(in, 30);
-	r = _mm512_add_epi8(r, range_base);
-	t = _mm512_srli_epi32(in, 24);
-	r = _mm512_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm512_sub_epi32(t, r);
-
-	/* QUAD/SINGLE calculations. */
-	qm = _mm512_cmpgt_epi8_mask(in, tr_hi);
-	t = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
-	t = _mm512_lzcnt_epi32(t);
-	t = _mm512_srli_epi32(t, 3);
-	quad_ofs = _mm512_sub_epi32(four_32, t);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
-
-	/* calculate address for next transitions. */
-	addr = _mm512_add_epi32(addr, t);
-	return addr;
-}
-
-/*
- * Process 16 transitions in parallel.
- * tr_lo contains low 32 bits for 16 transition.
- * tr_hi contains high 32 bits for 16 transition.
- * next_input contains up to 4 input bytes for 16 flows.
+ * Gather 4/1 input bytes for up to 16 (2*8) locations in parallel.
  */
 static __rte_always_inline __m512i
-transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,
-	__m512i *tr_hi)
-{
-	const int32_t *tr;
-	__m512i addr;
-
-	tr = (const int32_t *)(uintptr_t)trans;
-
-	/* Calculate the address (array index) for all 16 transitions. */
-	addr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,
-		zmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));
-
-	next_input = _mm512_srli_epi32(next_input, CHAR_BIT);
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));
-
-	return next_input;
-}
-
-/*
- * Execute first transition for up to 16 flows in parallel.
- * next_input should contain one input byte for up to 16 flows.
- * msk - mask of active flows.
- * tr_lo contains low 32 bits for up to 16 transitions.
- * tr_hi contains high 32 bits for up to 16 transitions.
- */
-static __rte_always_inline void
-first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,
-	__mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)
+_F_(gather_bytes)(__m512i zero, const __m512i p[2], const uint32_t m[2],
+	uint32_t bnum)
 {
-	const int32_t *tr;
-	__m512i addr, root;
-
-	tr = (const int32_t *)(uintptr_t)flow->trans;
-
-	addr = _mm512_set1_epi32(UINT8_MAX);
-	root = _mm512_set1_epi32(flow->root_index);
-
-	addr = _mm512_and_si512(next_input, addr);
-	addr = _mm512_add_epi32(root, addr);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,
-		sizeof(flow->trans[0]));
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
-		sizeof(flow->trans[0]));
-}
-
-/*
- * Load and return next 4 input bytes for up to 16 flows in parallel.
- * pdata - 8x2 pointers to flow input data
- * mask - mask of active flows.
- * di - data indexes for these 16 flows.
- */
-static inline __m512i
-get_next_bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],
-	uint32_t msk, __m512i *di, uint32_t bnum)
-{
-	const int32_t *div;
-	uint32_t m[2];
-	__m512i one, zero, t, p[2];
-	ymm_t inp[2];
-
-	div = (const int32_t *)flow->data_index;
-
-	one = _mm512_set1_epi32(1);
-	zero = _mm512_xor_si512(one, one);
-
-	/* load data offsets for given indexes */
-	t = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
-
-	/* increment data indexes */
-	*di = _mm512_mask_add_epi32(*di, msk, *di, one);
-
-	/*
-	 * unsigned expand 32-bit indexes to 64-bit
-	 * (for later pointer arithmetic), i.e:
-	 * for (i = 0; i != 16; i++)
-	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
-	 */
-	p[0] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[0].z, t);
-	p[1] = _mm512_maskz_permutexvar_epi32(zmm_pmidx_msk, zmm_pmidx[1].z, t);
-
-	p[0] = _mm512_add_epi64(p[0], pdata[0]);
-	p[1] = _mm512_add_epi64(p[1], pdata[1]);
-
-	/* load input byte(s), either one or four */
-
-	m[0] = msk & ZMM_PTR_MSK;
-	m[1] = msk >> ZMM_PTR_NUM;
+	__m256i inp[2];
 
 	if (bnum == sizeof(uint8_t)) {
 		inp[0] = _m512_mask_gather_epi8x8(p[0], m[0]);
 		inp[1] = _m512_mask_gather_epi8x8(p[1], m[1]);
 	} else {
 		inp[0] = _mm512_mask_i64gather_epi32(
-				_mm512_castsi512_si256(zero), m[0], p[0],
-				NULL, sizeof(uint8_t));
+				_mm512_castsi512_si256(zero),
+				m[0], p[0], NULL, sizeof(uint8_t));
 		inp[1] = _mm512_mask_i64gather_epi32(
-				_mm512_castsi512_si256(zero), m[1], p[1],
-				NULL, sizeof(uint8_t));
+				_mm512_castsi512_si256(zero),
+				m[1], p[1], NULL, sizeof(uint8_t));
 	}
 
 	/* squeeze input into one 512-bit register */
 	return _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),
-			zmm_pminp.z, _mm512_castsi256_si512(inp[1]));
-}
-
-/*
- * Start up to 16 new flows.
- * num - number of flows to start
- * msk - mask of new flows.
- * pdata - pointers to flow input data
- * idx - match indexed for given flows
- * di - data indexes for these flows.
- */
-static inline void
-start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
-	__m512i pdata[2], __m512i *idx, __m512i *di)
-{
-	uint32_t n, m[2], nm[2];
-	__m512i ni, nd[2];
-
-	/* split mask into two - one for each pdata[] */
-	m[0] = msk & ZMM_PTR_MSK;
-	m[1] = msk >> ZMM_PTR_NUM;
-
-	/* calculate masks for new flows */
-	n = __builtin_popcount(m[0]);
-	nm[0] = (1 << n) - 1;
-	nm[1] = (1 << (num - n)) - 1;
-
-	/* load input data pointers for new flows */
-	nd[0] = _mm512_maskz_loadu_epi64(nm[0],
-		flow->idata + flow->num_packets);
-	nd[1] = _mm512_maskz_loadu_epi64(nm[1],
-		flow->idata + flow->num_packets + n);
-
-	/* calculate match indexes of new flows */
-	ni = _mm512_set1_epi32(flow->num_packets);
-	ni = _mm512_add_epi32(ni, zmm_idx_add.z);
-
-	/* merge new and existing flows data */
-	pdata[0] = _mm512_mask_expand_epi64(pdata[0], m[0], nd[0]);
-	pdata[1] = _mm512_mask_expand_epi64(pdata[1], m[1], nd[1]);
-
-	/* update match and data indexes */
-	*idx = _mm512_mask_expand_epi32(*idx, msk, ni);
-	*di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);
-
-	flow->num_packets += num;
-}
-
-/*
- * Process found matches for up to 16 flows.
- * fmsk - mask of active flows
- * rmsk - mask of found matches
- * pdata - pointers to flow input data
- * di - data indexes for these flows
- * idx - match indexed for given flows
- * tr_lo contains low 32 bits for up to 8 transitions.
- * tr_hi contains high 32 bits for up to 8 transitions.
- */
-static inline uint32_t
-match_process_avx512x16(struct acl_flow_avx512 *flow, uint32_t *fmsk,
-	uint32_t *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,
-	__m512i *tr_lo, __m512i *tr_hi)
-{
-	uint32_t n;
-	__m512i res;
-
-	if (rmsk[0] == 0)
-		return 0;
-
-	/* extract match indexes */
-	res = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);
-
-	/* mask  matched transitions to nop */
-	tr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);
-	tr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);
-
-	/* save found match indexes */
-	_mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],
-		idx[0], res, sizeof(flow->matches[0]));
-
-	/* update masks and start new flows for matches */
-	n = update_flow_mask(flow, fmsk, rmsk);
-	start_flow16(flow, n, rmsk[0], pdata, idx, di);
-
-	return n;
-}
-
-/*
- * Test for matches ut to 32 (2x16) flows at once,
- * if matches exist - process them and start new flows.
- */
-static inline void
-match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
-	__m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],
-	__m512i tr_lo[2], __m512i tr_hi[2])
-{
-	uint32_t n[2];
-	uint32_t rm[2];
-
-	/* check for matches */
-	rm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);
-	rm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);
-
-	/* till unprocessed matches exist */
-	while ((rm[0] | rm[1]) != 0) {
-
-		/* process matches and start new flows */
-		n[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],
-			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
-		n[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],
-			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
-
-		/* execute first transition for new flows, if any */
-
-		if (n[0] != 0) {
-			inp[0] = get_next_bytes_avx512x16(flow, &pdata[0],
-				rm[0], &di[0], flow->first_load_sz);
-			first_trans16(flow, inp[0], rm[0], &tr_lo[0],
-				&tr_hi[0]);
-			rm[0] = _mm512_test_epi32_mask(tr_lo[0],
-				zmm_match_mask.z);
-		}
-
-		if (n[1] != 0) {
-			inp[1] = get_next_bytes_avx512x16(flow, &pdata[2],
-				rm[1], &di[1], flow->first_load_sz);
-			first_trans16(flow, inp[1], rm[1], &tr_lo[1],
-				&tr_hi[1]);
-			rm[1] = _mm512_test_epi32_mask(tr_lo[1],
-				zmm_match_mask.z);
-		}
-	}
-}
-
-/*
- * Perform search for up to 32 flows in parallel.
- * Use two sets of metadata, each serves 16 flows max.
- * So in fact we perform search for 2x16 flows.
- */
-static inline void
-search_trie_avx512x16x2(struct acl_flow_avx512 *flow)
-{
-	uint32_t fm[2];
-	__m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
-
-	/* first 1B load */
-	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);
-	start_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);
-
-	in[0] = get_next_bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0],
-			flow->first_load_sz);
-	in[1] = get_next_bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1],
-			flow->first_load_sz);
-
-	first_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);
-	first_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);
-
-	fm[0] = UINT16_MAX;
-	fm[1] = UINT16_MAX;
-
-	/* match check */
-	match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
-		tr_lo, tr_hi);
-
-	while ((fm[0] | fm[1]) != 0) {
-
-		/* load next 4B */
-
-		in[0] = get_next_bytes_avx512x16(flow, &pdata[0], fm[0],
-			&di[0], sizeof(uint32_t));
-		in[1] = get_next_bytes_avx512x16(flow, &pdata[2], fm[1],
-			&di[1], sizeof(uint32_t));
-
-		/* main 4B loop */
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		/* check for matches */
-		match_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,
-			tr_lo, tr_hi);
-	}
+			_SV_(pminp), _mm512_castsi256_si512(inp[1]));
 }
 
 /*
@@ -582,120 +283,12 @@ resolve_mcgt8_avx512x1(uint32_t result[],
 	}
 }
 
-/*
- * resolve match index to actual result/priority offset.
- */
-static inline __m512i
-resolve_match_idx_avx512x16(__m512i mi)
-{
-	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
-		1 << (match_log + 2));
-	return _mm512_slli_epi32(mi, match_log);
-}
-
-/*
- * Resolve multiple matches for the same flow based on priority.
- */
-static inline __m512i
-resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],
-	const uint32_t match[], __mmask16 msk, uint32_t nb_trie,
-	uint32_t nb_skip)
-{
-	uint32_t i;
-	const uint32_t *pm;
-	__mmask16 m;
-	__m512i cp, cr, np, nr, mch;
-
-	const __m512i zero = _mm512_set1_epi32(0);
-
-	/* get match indexes */
-	mch = _mm512_maskz_loadu_epi32(msk, match);
-	mch = resolve_match_idx_avx512x16(mch);
-
-	/* read result and priority values for first trie */
-	cr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
-	cp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
-
-	/*
-	 * read result and priority values for next tries and select one
-	 * with highest priority.
-	 */
-	for (i = 1, pm = match + nb_skip; i != nb_trie;
-			i++, pm += nb_skip) {
-
-		mch = _mm512_maskz_loadu_epi32(msk, pm);
-		mch = resolve_match_idx_avx512x16(mch);
-
-		nr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,
-			sizeof(res[0]));
-		np = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,
-			sizeof(pri[0]));
-
-		m = _mm512_cmpgt_epi32_mask(cp, np);
-		cr = _mm512_mask_mov_epi32(nr, m, cr);
-		cp = _mm512_mask_mov_epi32(np, m, cp);
-	}
-
-	return cr;
-}
-
-/*
- * Resolve num (<= 16) matches for single category
- */
-static inline void
-resolve_sc_avx512x16(uint32_t result[], const int32_t res[],
-	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
-	uint32_t nb_trie, uint32_t nb_skip)
-{
-	__mmask16 msk;
-	__m512i cr;
-
-	msk = (1 << nb_pkt) - 1;
-	cr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);
-	_mm512_mask_storeu_epi32(result, msk, cr);
-}
+#include "acl_run_avx512_common.h"
 
 /*
- * Resolve matches for single category
+ * Perform search for up to (2 * 16) flows in parallel.
+ * Use two sets of metadata, each serves 16 flows max.
  */
-static inline void
-resolve_sc_avx512x16x2(uint32_t result[],
-	const struct rte_acl_match_results pr[], const uint32_t match[],
-	uint32_t nb_pkt, uint32_t nb_trie)
-{
-	uint32_t j, k, n;
-	const int32_t *res, *pri;
-	__m512i cr[2];
-
-	res = (const int32_t *)pr->results;
-	pri = pr->priority;
-
-	for (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {
-
-		j = k + MASK16_BIT;
-
-		cr[0] = resolve_pri_avx512x16(res, pri, match + k, UINT16_MAX,
-				nb_trie, nb_pkt);
-		cr[1] = resolve_pri_avx512x16(res, pri, match + j, UINT16_MAX,
-				nb_trie, nb_pkt);
-
-		_mm512_storeu_si512(result + k, cr[0]);
-		_mm512_storeu_si512(result + j, cr[1]);
-	}
-
-	n = nb_pkt - k;
-	if (n != 0) {
-		if (n > MASK16_BIT) {
-			resolve_sc_avx512x16(result + k, res, pri, match + k,
-				MASK16_BIT, nb_trie, nb_pkt);
-			k += MASK16_BIT;
-			n -= MASK16_BIT;
-		}
-		resolve_sc_avx512x16(result + k, res, pri, match + k, n,
-				nb_trie, nb_pkt);
-	}
-}
-
 static inline int
 search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -711,7 +304,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 
 		/* process the trie */
-		search_trie_avx512x16x2(&flow);
+		_F_(search_trie)(&flow);
 	}
 
 	/* resolve matches */
@@ -719,7 +312,7 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		(ctx->trans_table + ctx->match_index);
 
 	if (categories == 1)
-		resolve_sc_avx512x16x2(results, pr, match, total_packets,
+		_F_(resolve_single_cat)(results, pr, match, total_packets,
 			ctx->num_tries);
 	else if (categories <= RTE_ACL_MAX_CATEGORIES / 2)
 		resolve_mcle8_avx512x1(results, pr, match, total_packets,
@@ -730,3 +323,19 @@ search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
+
+#undef _SIMD_PTR_MSK_
+#undef _SIMD_PTR_NUM_
+#undef _SIMD_FLOW_MSK_
+#undef _SIMD_FLOW_NUM_
+#undef _SIMD_MASK_MAX_
+#undef _SIMD_MASK_BIT_
+#undef _M_GI_
+#undef _M_MGI_
+#undef _M_SI_
+#undef _M_I_
+#undef _F_
+#undef _SV_
+#undef _SC_
+#undef _T_mask
+#undef _T_simd
diff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h
index fedd79b9ae..61ac9d1b47 100644
--- a/lib/librte_acl/acl_run_avx512x8.h
+++ b/lib/librte_acl/acl_run_avx512x8.h
@@ -2,16 +2,57 @@
  * Copyright(c) 2020 Intel Corporation
  */
 
-#define MASK8_BIT	(sizeof(__mmask8) * CHAR_BIT)
+/*
+ * Defines required by "acl_run_avx512_common.h".
+ * Note that all of them have to be undefined by the end
+ * of this file, as "acl_run_avx512_common.h" can be included several
+ * times from different *.h files for the same *.c.
+ */
+
+/*
+ * This implementation uses 256-bit registers (ymm) and intrinsics.
+ * So our main SIMD type is 256 bits wide and each such variable can
+ * process sizeof(__m256i) / sizeof(uint32_t) == 8 entries in parallel.
+ */
+#define _T_simd		__m256i
+#define _T_mask		__mmask8
+
+/* Naming convention for static const variables. */
+#define _SC_(x)		ymm_##x
+#define _SV_(x)		(ymm_##x.y)
+
+/* Naming convention for internal functions. */
+#define _F_(x)		x##_avx512x8
+
+/*
+ * The same intrinsics have different syntax (depending on the bit-width),
+ * so to overcome that a few macros need to be defined.
+ */
+
+/* Naming convention for generic epi (packed integers) type intrinsics. */
+#define _M_I_(x)	_mm256_##x
+
+/* Naming convention for si (whole SIMD integer) type intrinsics. */
+#define _M_SI_(x)	_mm256_##x##_si256
 
-#define NUM_AVX512X8X2	(2 * MASK8_BIT)
-#define MSK_AVX512X8X2	(NUM_AVX512X8X2 - 1)
+/* Naming convention for masked gather type intrinsics. */
+#define _M_MGI_(x)	_mm256_m##x
+
+/* Naming convention for gather type intrinsics. */
+#define _M_GI_(name, idx, base, scale)	_mm256_##name(base, idx, scale)
+
+/* num/mask of transitions per SIMD regs */
+#define _SIMD_MASK_BIT_	(sizeof(_T_simd) / sizeof(uint32_t))
+#define _SIMD_MASK_MAX_	RTE_LEN2MASK(_SIMD_MASK_BIT_, uint32_t)
+
+#define _SIMD_FLOW_NUM_	(2 * _SIMD_MASK_BIT_)
+#define _SIMD_FLOW_MSK_	(_SIMD_FLOW_NUM_ - 1)
 
 /* num/mask of pointers per SIMD regs */
-#define YMM_PTR_NUM	(sizeof(__m256i) / sizeof(uintptr_t))
-#define YMM_PTR_MSK	RTE_LEN2MASK(YMM_PTR_NUM, uint32_t)
+#define _SIMD_PTR_NUM_	(sizeof(_T_simd) / sizeof(uintptr_t))
+#define _SIMD_PTR_MSK_	RTE_LEN2MASK(_SIMD_PTR_NUM_, uint32_t)
 
-static const rte_ymm_t ymm_match_mask = {
+static const rte_ymm_t _SC_(match_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_MATCH,
 		RTE_ACL_NODE_MATCH,
@@ -24,7 +65,7 @@ static const rte_ymm_t ymm_match_mask = {
 	},
 };
 
-static const rte_ymm_t ymm_index_mask = {
+static const rte_ymm_t _SC_(index_mask) = {
 	.u32 = {
 		RTE_ACL_NODE_INDEX,
 		RTE_ACL_NODE_INDEX,
@@ -37,7 +78,7 @@ static const rte_ymm_t ymm_index_mask = {
 	},
 };
 
-static const rte_ymm_t ymm_trlo_idle = {
+static const rte_ymm_t _SC_(trlo_idle) = {
 	.u32 = {
 		RTE_ACL_IDLE_NODE,
 		RTE_ACL_IDLE_NODE,
@@ -50,51 +91,51 @@ static const rte_ymm_t ymm_trlo_idle = {
 	},
 };
 
-static const rte_ymm_t ymm_trhi_idle = {
+static const rte_ymm_t _SC_(trhi_idle) = {
 	.u32 = {
 		0, 0, 0, 0,
 		0, 0, 0, 0,
 	},
 };
 
-static const rte_ymm_t ymm_shuffle_input = {
+static const rte_ymm_t _SC_(shuffle_input) = {
 	.u32 = {
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 		0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
 	},
 };
 
-static const rte_ymm_t ymm_four_32 = {
+static const rte_ymm_t _SC_(four_32) = {
 	.u32 = {
 		4, 4, 4, 4,
 		4, 4, 4, 4,
 	},
 };
 
-static const rte_ymm_t ymm_idx_add = {
+static const rte_ymm_t _SC_(idx_add) = {
 	.u32 = {
 		0, 1, 2, 3,
 		4, 5, 6, 7,
 	},
 };
 
-static const rte_ymm_t ymm_range_base = {
+static const rte_ymm_t _SC_(range_base) = {
 	.u32 = {
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 		0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
 	},
 };
 
-static const rte_ymm_t ymm_pminp = {
+static const rte_ymm_t _SC_(pminp) = {
 	.u32 = {
 		0x00, 0x01, 0x02, 0x03,
 		0x08, 0x09, 0x0a, 0x0b,
 	},
 };
 
-static const __mmask16 ymm_pmidx_msk = 0x55;
+static const __mmask16 _SC_(pmidx_msk) = 0x55;
 
-static const rte_ymm_t ymm_pmidx[2] = {
+static const rte_ymm_t _SC_(pmidx[2]) = {
 	[0] = {
 		.u32 = {
 			0, 0, 1, 0, 2, 0, 3, 0,
@@ -120,7 +161,7 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
 
 	static const uint32_t zero;
 
-	p.y = _mm256_mask_set1_epi64(pdata, mask ^ YMM_PTR_MSK,
+	p.y = _mm256_mask_set1_epi64(pdata, mask ^ _SIMD_PTR_MSK_,
 		(uintptr_t)&zero);
 
 	v.u32[0] = *(uint8_t *)p.u64[0];
@@ -132,483 +173,37 @@ _m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)
 }
 
 /*
- * Calculate the address of the next transition for
- * all types of nodes. Note that only DFA nodes and range
- * nodes actually transition to another node. Match
- * nodes not supposed to be encountered here.
- * For quad range nodes:
- * Calculate number of range boundaries that are less than the
- * input value. Range boundaries for each node are in signed 8 bit,
- * ordered from -128 to 127.
- * This is effectively a popcnt of bytes that are greater than the
- * input byte.
- * Single nodes are processed in the same ways as quad range nodes.
- */
-static __rte_always_inline __m256i
-calc_addr8(__m256i index_mask, __m256i next_input, __m256i shuffle_input,
-	__m256i four_32, __m256i range_base, __m256i tr_lo, __m256i tr_hi)
-{
-	__mmask32 qm;
-	__mmask8 dfa_msk;
-	__m256i addr, in, node_type, r, t;
-	__m256i dfa_ofs, quad_ofs;
-
-	t = _mm256_xor_si256(index_mask, index_mask);
-	in = _mm256_shuffle_epi8(next_input, shuffle_input);
-
-	/* Calc node type and node addr */
-	node_type = _mm256_andnot_si256(index_mask, tr_lo);
-	addr = _mm256_and_si256(index_mask, tr_lo);
-
-	/* mask for DFA type(0) nodes */
-	dfa_msk = _mm256_cmpeq_epi32_mask(node_type, t);
-
-	/* DFA calculations. */
-	r = _mm256_srli_epi32(in, 30);
-	r = _mm256_add_epi8(r, range_base);
-	t = _mm256_srli_epi32(in, 24);
-	r = _mm256_shuffle_epi8(tr_hi, r);
-
-	dfa_ofs = _mm256_sub_epi32(t, r);
-
-	/* QUAD/SINGLE calculations. */
-	qm = _mm256_cmpgt_epi8_mask(in, tr_hi);
-	t = _mm256_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);
-	t = _mm256_lzcnt_epi32(t);
-	t = _mm256_srli_epi32(t, 3);
-	quad_ofs = _mm256_sub_epi32(four_32, t);
-
-	/* blend DFA and QUAD/SINGLE. */
-	t = _mm256_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);
-
-	/* calculate address for next transitions. */
-	addr = _mm256_add_epi32(addr, t);
-	return addr;
-}
-
-/*
- * Process 16 transitions in parallel.
- * tr_lo contains low 32 bits for 16 transition.
- * tr_hi contains high 32 bits for 16 transition.
- * next_input contains up to 4 input bytes for 16 flows.
+ * Gather 4/1 input bytes for up to 8 (2*4) locations in parallel.
  */
 static __rte_always_inline __m256i
-transition8(__m256i next_input, const uint64_t *trans, __m256i *tr_lo,
-	__m256i *tr_hi)
-{
-	const int32_t *tr;
-	__m256i addr;
-
-	tr = (const int32_t *)(uintptr_t)trans;
-
-	/* Calculate the address (array index) for all 8 transitions. */
-	addr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y,
-		ymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
-
-	next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
-
-	return next_input;
-}
-
-/*
- * Execute first transition for up to 16 flows in parallel.
- * next_input should contain one input byte for up to 16 flows.
- * msk - mask of active flows.
- * tr_lo contains low 32 bits for up to 16 transitions.
- * tr_hi contains high 32 bits for up to 16 transitions.
- */
-static __rte_always_inline void
-first_trans8(const struct acl_flow_avx512 *flow, __m256i next_input,
-	__mmask8 msk, __m256i *tr_lo, __m256i *tr_hi)
+_F_(gather_bytes)(__m256i zero, const __m256i p[2], const uint32_t m[2],
+	uint32_t bnum)
 {
-	const int32_t *tr;
-	__m256i addr, root;
-
-	tr = (const int32_t *)(uintptr_t)flow->trans;
-
-	addr = _mm256_set1_epi32(UINT8_MAX);
-	root = _mm256_set1_epi32(flow->root_index);
-
-	addr = _mm256_and_si256(next_input, addr);
-	addr = _mm256_add_epi32(root, addr);
-
-	/* load lower 32 bits of 16 transactions at once. */
-	*tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr,
-		sizeof(flow->trans[0]));
-
-	/* load high 32 bits of 16 transactions at once. */
-	*tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),
-		sizeof(flow->trans[0]));
-}
-
-/*
- * Load and return next 4 input bytes for up to 16 flows in parallel.
- * pdata - 8x2 pointers to flow input data
- * mask - mask of active flows.
- * di - data indexes for these 16 flows.
- */
-static inline __m256i
-get_next_bytes_avx512x8(const struct acl_flow_avx512 *flow, __m256i pdata[2],
-	uint32_t msk, __m256i *di, uint32_t bnum)
-{
-	const int32_t *div;
-	uint32_t m[2];
-	__m256i one, zero, t, p[2];
 	__m128i inp[2];
 
-	div = (const int32_t *)flow->data_index;
-
-	one = _mm256_set1_epi32(1);
-	zero = _mm256_xor_si256(one, one);
-
-	/* load data offsets for given indexes */
-	t = _mm256_mmask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));
-
-	/* increment data indexes */
-	*di = _mm256_mask_add_epi32(*di, msk, *di, one);
-
-	/*
-	 * unsigned expand 32-bit indexes to 64-bit
-	 * (for later pointer arithmetic), i.e:
-	 * for (i = 0; i != 16; i++)
-	 *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
-	 */
-	p[0] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[0].y, t);
-	p[1] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[1].y, t);
-
-	p[0] = _mm256_add_epi64(p[0], pdata[0]);
-	p[1] = _mm256_add_epi64(p[1], pdata[1]);
-
-	/* load input byte(s), either one or four */
-
-	m[0] = msk & YMM_PTR_MSK;
-	m[1] = msk >> YMM_PTR_NUM;
-
 	if (bnum == sizeof(uint8_t)) {
 		inp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);
 		inp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);
 	} else {
 		inp[0] = _mm256_mmask_i64gather_epi32(
-				_mm256_castsi256_si128(zero), m[0], p[0],
-				NULL, sizeof(uint8_t));
+				_mm256_castsi256_si128(zero),
+				m[0], p[0], NULL, sizeof(uint8_t));
 		inp[1] = _mm256_mmask_i64gather_epi32(
-				_mm256_castsi256_si128(zero), m[1], p[1],
-				NULL, sizeof(uint8_t));
+				_mm256_castsi256_si128(zero),
+				m[1], p[1], NULL, sizeof(uint8_t));
 	}
 
-	/* squeeze input into one 512-bit register */
+	/* squeeze input into one 256-bit register */
 	return _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),
-			ymm_pminp.y,  _mm256_castsi128_si256(inp[1]));
-}
-
-/*
- * Start up to 16 new flows.
- * num - number of flows to start
- * msk - mask of new flows.
- * pdata - pointers to flow input data
- * idx - match indexed for given flows
- * di - data indexes for these flows.
- */
-static inline void
-start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
-	__m256i pdata[2], __m256i *idx, __m256i *di)
-{
-	uint32_t n, m[2], nm[2];
-	__m256i ni, nd[2];
-
-	m[0] = msk & YMM_PTR_MSK;
-	m[1] = msk >> YMM_PTR_NUM;
-
-	n = __builtin_popcount(m[0]);
-	nm[0] = (1 << n) - 1;
-	nm[1] = (1 << (num - n)) - 1;
-
-	/* load input data pointers for new flows */
-	nd[0] = _mm256_maskz_loadu_epi64(nm[0],
-		flow->idata + flow->num_packets);
-	nd[1] = _mm256_maskz_loadu_epi64(nm[1],
-		flow->idata + flow->num_packets + n);
-
-	/* calculate match indexes of new flows */
-	ni = _mm256_set1_epi32(flow->num_packets);
-	ni = _mm256_add_epi32(ni, ymm_idx_add.y);
-
-	/* merge new and existing flows data */
-	pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
-	pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
-
-	/* update match and data indexes */
-	*idx = _mm256_mask_expand_epi32(*idx, msk, ni);
-	*di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di);
-
-	flow->num_packets += num;
-}
-
-/*
- * Process found matches for up to 16 flows.
- * fmsk - mask of active flows
- * rmsk - mask of found matches
- * pdata - pointers to flow input data
- * di - data indexes for these flows
- * idx - match indexed for given flows
- * tr_lo contains low 32 bits for up to 8 transitions.
- * tr_hi contains high 32 bits for up to 8 transitions.
- */
-static inline uint32_t
-match_process_avx512x8(struct acl_flow_avx512 *flow, uint32_t *fmsk,
-	uint32_t *rmsk, __m256i pdata[2], __m256i *di, __m256i *idx,
-	__m256i *tr_lo, __m256i *tr_hi)
-{
-	uint32_t n;
-	__m256i res;
-
-	if (rmsk[0] == 0)
-		return 0;
-
-	/* extract match indexes */
-	res = _mm256_and_si256(tr_lo[0], ymm_index_mask.y);
-
-	/* mask  matched transitions to nop */
-	tr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y);
-	tr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y);
-
-	/* save found match indexes */
-	_mm256_mask_i32scatter_epi32(flow->matches, rmsk[0],
-		idx[0], res, sizeof(flow->matches[0]));
-
-	/* update masks and start new flows for matches */
-	n = update_flow_mask(flow, fmsk, rmsk);
-	start_flow8(flow, n, rmsk[0], pdata, idx, di);
-
-	return n;
-}
-
-/*
- * Test for matches ut to 32 (2x16) flows at once,
- * if matches exist - process them and start new flows.
- */
-static inline void
-match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2],
-	__m256i pdata[4], __m256i di[2], __m256i idx[2], __m256i inp[2],
-	__m256i tr_lo[2], __m256i tr_hi[2])
-{
-	uint32_t n[2];
-	uint32_t rm[2];
-
-	/* check for matches */
-	rm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y);
-	rm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y);
-
-	/* till unprocessed matches exist */
-	while ((rm[0] | rm[1]) != 0) {
-
-		/* process matches and start new flows */
-		n[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0],
-			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
-		n[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[2],
-			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
-
-		/* execute first transition for new flows, if any */
-
-		if (n[0] != 0) {
-			inp[0] = get_next_bytes_avx512x8(flow, &pdata[0],
-				rm[0], &di[0], flow->first_load_sz);
-			first_trans8(flow, inp[0], rm[0], &tr_lo[0],
-				&tr_hi[0]);
-			rm[0] = _mm256_test_epi32_mask(tr_lo[0],
-				ymm_match_mask.y);
-		}
-
-		if (n[1] != 0) {
-			inp[1] = get_next_bytes_avx512x8(flow, &pdata[2],
-				rm[1], &di[1], flow->first_load_sz);
-			first_trans8(flow, inp[1], rm[1], &tr_lo[1],
-				&tr_hi[1]);
-			rm[1] = _mm256_test_epi32_mask(tr_lo[1],
-				ymm_match_mask.y);
-		}
-	}
-}
-
-/*
- * Perform search for up to 32 flows in parallel.
- * Use two sets of metadata, each serves 16 flows max.
- * So in fact we perform search for 2x16 flows.
- */
-static inline void
-search_trie_avx512x8x2(struct acl_flow_avx512 *flow)
-{
-	uint32_t fm[2];
-	__m256i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
-
-	/* first 1B load */
-	start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]);
-	start_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[2], &idx[1], &di[1]);
-
-	in[0] = get_next_bytes_avx512x8(flow, &pdata[0], UINT8_MAX, &di[0],
-			flow->first_load_sz);
-	in[1] = get_next_bytes_avx512x8(flow, &pdata[2], UINT8_MAX, &di[1],
-			flow->first_load_sz);
-
-	first_trans8(flow, in[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]);
-	first_trans8(flow, in[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]);
-
-	fm[0] = UINT8_MAX;
-	fm[1] = UINT8_MAX;
-
-	/* match check */
-	match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,
-		tr_lo, tr_hi);
-
-	while ((fm[0] | fm[1]) != 0) {
-
-		/* load next 4B */
-
-		in[0] = get_next_bytes_avx512x8(flow, &pdata[0], fm[0],
-			&di[0], sizeof(uint32_t));
-		in[1] = get_next_bytes_avx512x8(flow, &pdata[2], fm[1],
-			&di[1], sizeof(uint32_t));
-
-		/* main 4B loop */
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		in[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
-		in[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
-
-		/* check for matches */
-		match_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,
-			tr_lo, tr_hi);
-	}
-}
-
-/*
- * resolve match index to actual result/priority offset.
- */
-static inline __m256i
-resolve_match_idx_avx512x8(__m256i mi)
-{
-	RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
-		1 << (match_log + 2));
-	return _mm256_slli_epi32(mi, match_log);
+			_SV_(pminp), _mm256_castsi128_si256(inp[1]));
 }
 
-/*
- * Resolve multiple matches for the same flow based on priority.
- */
-static inline __m256i
-resolve_pri_avx512x8(const int32_t res[], const int32_t pri[],
-	const uint32_t match[], __mmask8 msk, uint32_t nb_trie,
-	uint32_t nb_skip)
-{
-	uint32_t i;
-	const uint32_t *pm;
-	__mmask16 m;
-	__m256i cp, cr, np, nr, mch;
-
-	const __m256i zero = _mm256_set1_epi32(0);
-
-	/* get match indexes */
-	mch = _mm256_maskz_loadu_epi32(msk, match);
-	mch = resolve_match_idx_avx512x8(mch);
-
-	/* read result and priority values for first trie */
-	cr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));
-	cp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));
-
-	/*
-	 * read result and priority values for next tries and select one
-	 * with highest priority.
-	 */
-	for (i = 1, pm = match + nb_skip; i != nb_trie;
-			i++, pm += nb_skip) {
-
-		mch = _mm256_maskz_loadu_epi32(msk, pm);
-		mch = resolve_match_idx_avx512x8(mch);
-
-		nr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res,
-			sizeof(res[0]));
-		np = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri,
-			sizeof(pri[0]));
-
-		m = _mm256_cmpgt_epi32_mask(cp, np);
-		cr = _mm256_mask_mov_epi32(nr, m, cr);
-		cp = _mm256_mask_mov_epi32(np, m, cp);
-	}
-
-	return cr;
-}
-
-/*
- * Resolve num (<= 8) matches for single category
- */
-static inline void
-resolve_sc_avx512x8(uint32_t result[], const int32_t res[],
-	const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
-	uint32_t nb_trie, uint32_t nb_skip)
-{
-	__mmask8 msk;
-	__m256i cr;
-
-	msk = (1 << nb_pkt) - 1;
-	cr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip);
-	_mm256_mask_storeu_epi32(result, msk, cr);
-}
+#include "acl_run_avx512_common.h"
 
 /*
- * Resolve matches for single category
+ * Perform search for up to (2 * 8) flows in parallel.
+ * Use two sets of metadata, each serves 8 flows max.
  */
-static inline void
-resolve_sc_avx512x8x2(uint32_t result[],
-	const struct rte_acl_match_results pr[], const uint32_t match[],
-	uint32_t nb_pkt, uint32_t nb_trie)
-{
-	uint32_t j, k, n;
-	const int32_t *res, *pri;
-	__m256i cr[2];
-
-	res = (const int32_t *)pr->results;
-	pri = pr->priority;
-
-	for (k = 0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) {
-
-		j = k + MASK8_BIT;
-
-		cr[0] = resolve_pri_avx512x8(res, pri, match + k, UINT8_MAX,
-				nb_trie, nb_pkt);
-		cr[1] = resolve_pri_avx512x8(res, pri, match + j, UINT8_MAX,
-				nb_trie, nb_pkt);
-
-		_mm256_storeu_si256((void *)(result + k), cr[0]);
-		_mm256_storeu_si256((void *)(result + j), cr[1]);
-	}
-
-	n = nb_pkt - k;
-	if (n != 0) {
-		if (n > MASK8_BIT) {
-			resolve_sc_avx512x8(result + k, res, pri, match + k,
-				MASK8_BIT, nb_trie, nb_pkt);
-			k += MASK8_BIT;
-			n -= MASK8_BIT;
-		}
-		resolve_sc_avx512x8(result + k, res, pri, match + k, n,
-				nb_trie, nb_pkt);
-	}
-}
-
 static inline int
 search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t total_packets, uint32_t categories)
@@ -624,7 +219,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		acl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);
 
 		/* process the trie */
-		search_trie_avx512x8x2(&flow);
+		_F_(search_trie)(&flow);
 	}
 
 	/* resolve matches */
@@ -632,7 +227,7 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 		(ctx->trans_table + ctx->match_index);
 
 	if (categories == 1)
-		resolve_sc_avx512x8x2(results, pr, match, total_packets,
+		_F_(resolve_single_cat)(results, pr, match, total_packets,
 			ctx->num_tries);
 	else
 		resolve_mcle8_avx512x1(results, pr, match, total_packets,
@@ -640,3 +235,19 @@ search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	return 0;
 }
+
+#undef _SIMD_PTR_MSK_
+#undef _SIMD_PTR_NUM_
+#undef _SIMD_FLOW_MSK_
+#undef _SIMD_FLOW_NUM_
+#undef _SIMD_MASK_MAX_
+#undef _SIMD_MASK_BIT_
+#undef _M_GI_
+#undef _M_MGI_
+#undef _M_SI_
+#undef _M_I_
+#undef _F_
+#undef _SV_
+#undef _SC_
+#undef _T_mask
+#undef _T_simd
-- 
2.17.1
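
As a rough illustration of how the deduplication works: each flavour-specific header is expected to define the macro layer that acl_run_avx512_common.h consumes (the same names that are #undef'd at the end of the hunk above), include the common body once, and then clear the definitions so the other register width can repeat the process. The minimal sketch below shows only a handful of the full macro set for a 256-bit (8-lane) includer; the type and intrinsic aliases are assumptions made for illustration, and the _avx512x8 function suffix is hypothetical rather than the exact spelling used in acl_run_avx512x8.h.

#include <immintrin.h>			/* __m256i, __mmask8, AVX-512VL intrinsics */

/* 256-bit (8 flows per vector) flavour -- illustrative values only */
#define _T_simd		__m256i		/* vector type used by the common code */
#define _T_mask		__mmask8	/* one predicate bit per 32-bit lane */
#define _F_(x)		x##_avx512x8	/* hypothetical per-flavour function suffix */
#define _M_I_(x)	_mm256_##x	/* e.g. _M_I_(srli_epi32) -> _mm256_srli_epi32 */
#define _M_SI_(x)	_mm256_##x##_si256	/* e.g. _M_SI_(xor) -> _mm256_xor_si256 */

#include "acl_run_avx512_common.h"	/* shared body; emits the flavour-specific
					 * helpers such as _F_(search_trie) and
					 * _F_(resolve_single_cat) used by the
					 * top-level search routine */

#undef _M_SI_
#undef _M_I_
#undef _F_
#undef _T_mask
#undef _T_simd

The 512-bit path would presumably repeat the same pattern with __m512i, a wider mask type and _mm512_* aliases, which is why the common header is not meant to be included directly and why every macro is #undef'd once the flavour-specific code has been generated.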


Thread overview: 70+ messages
2020-08-07 16:28 [dpdk-dev] [PATCH 20.11 0/7] acl: introduce AVX512 classify method Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 1/7] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 2/7] app/acl: few small improvements Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 3/7] acl: remove of unused enum value Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 4/7] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 5/7] app/acl: add AVX512 classify support Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 6/7] acl: introduce AVX512 classify implementation Konstantin Ananyev
2020-08-07 16:28 ` [dpdk-dev] [PATCH 20.11 7/7] acl: enhance " Konstantin Ananyev
2020-09-15 16:50 ` [dpdk-dev] [PATCH v2 00/12] acl: introduce AVX512 classify method Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 01/12] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 02/12] doc: fix mixing classify methods in ACL guide Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 03/12] acl: remove of unused enum value Konstantin Ananyev
2020-09-27  3:27     ` Ruifeng Wang
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 04/12] acl: remove library constructor Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 05/12] app/acl: few small improvements Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 06/12] test/acl: expand classify test coverage Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 07/12] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-09-16  9:11     ` Bruce Richardson
2020-09-16  9:36       ` Medvedkin, Vladimir
2020-09-16  9:49         ` Bruce Richardson
2020-09-16 10:06           ` Ananyev, Konstantin
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 08/12] acl: introduce AVX512 classify implementation Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 09/12] acl: enhance " Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 10/12] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 11/12] test/acl: add AVX512 classify support Konstantin Ananyev
2020-09-15 16:50   ` [dpdk-dev] [PATCH v2 12/12] app/acl: " Konstantin Ananyev
2020-10-05 18:45   ` [dpdk-dev] [PATCH v3 00/14] acl: introduce AVX512 classify methods Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 01/14] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 02/14] doc: fix missing classify methods in ACL guide Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 03/14] acl: remove of unused enum value Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 04/14] acl: remove library constructor Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 05/14] app/acl: few small improvements Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 06/14] test/acl: expand classify test coverage Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 07/14] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 08/14] acl: introduce 256-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 09/14] acl: update default classify algorithm selection Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 10/14] acl: introduce 512-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 11/14] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 12/14] acl: deduplicate AVX512 code paths Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 13/14] test/acl: add AVX512 classify support Konstantin Ananyev
2020-10-05 18:45     ` [dpdk-dev] [PATCH v3 14/14] app/acl: " Konstantin Ananyev
2020-10-06 15:03     ` [dpdk-dev] [PATCH v4 00/14] acl: introduce AVX512 classify methods Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 01/14] acl: fix x86 build when compiler doesn't support AVX2 Konstantin Ananyev
2020-10-08 13:42         ` [dpdk-dev] [dpdk-stable] " David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 02/14] doc: fix missing classify methods in ACL guide Konstantin Ananyev
2020-10-08 13:42         ` David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 03/14] acl: remove of unused enum value Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 04/14] acl: remove library constructor Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 05/14] app/acl: few small improvements Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 06/14] test/acl: expand classify test coverage Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 07/14] acl: add infrastructure to support AVX512 classify Konstantin Ananyev
2020-10-13 19:17         ` David Marchand
2020-10-13 22:26           ` Ananyev, Konstantin
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 08/14] acl: introduce 256-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 09/14] acl: update default classify algorithm selection Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 10/14] acl: introduce 512-bit width AVX512 classify implementation Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 11/14] acl: for AVX512 classify use 4B load whenever possible Konstantin Ananyev
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths Konstantin Ananyev [this message]
2020-10-16 15:56         ` [dpdk-dev] [PATCH v4 12/14] acl: deduplicate AVX512 code paths Ferruh Yigit
2020-10-16 16:20           ` Thomas Monjalon
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 13/14] test/acl: add AVX512 classify support Konstantin Ananyev
2020-10-14 10:26         ` David Marchand
2020-10-14 10:32           ` Ananyev, Konstantin
2020-10-14 10:35             ` David Marchand
2020-10-06 15:03       ` [dpdk-dev] [PATCH v4 14/14] app/acl: " Konstantin Ananyev
2020-10-14 12:40       ` [dpdk-dev] [PATCH v4 00/14] acl: introduce AVX512 classify methods David Marchand
2020-10-06 15:05     ` [dpdk-dev] [PATCH v3 " David Marchand
2020-10-06 16:07       ` Ananyev, Konstantin
2020-10-08 10:49         ` David Marchand
2020-10-14  9:23         ` Kinsella, Ray
