[PATCH 0/3] ARM64: NEON ACL implementation

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 0/3] ARM64: NEON ACL implementation
@ 2015-11-02 14:18 Jerin Jacob
  2015-11-02 14:18 ` [PATCH 1/3] arm: ret_vector.h improvements Jerin Jacob
  0 siblings, 1 reply; 12+ messages in thread
From: Jerin Jacob @ 2015-11-02 14:18 UTC (permalink / raw)
  To: dev

ARM64 Neon based ACL implementation

This patch is based on David's armv8 v3 patch set


Jerin Jacob (3):
  arm: ret_vector.h improvements
  arm64: acl: add neon based acl implementation
  arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL

 app/test-acl/main.c                               |   4 +
 config/defconfig_arm64-armv8a-linuxapp-gcc        |   1 -
 lib/librte_acl/Makefile                           |   5 +
 lib/librte_acl/acl.h                              |   4 +
 lib/librte_acl/acl_run_neon.c                     |  46 ++++
 lib/librte_acl/acl_run_neon.h                     | 290 ++++++++++++++++++++++
 lib/librte_acl/rte_acl.c                          |  25 ++
 lib/librte_acl/rte_acl.h                          |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  58 ++---
 9 files changed, 392 insertions(+), 42 deletions(-)
 create mode 100644 lib/librte_acl/acl_run_neon.c
 create mode 100644 lib/librte_acl/acl_run_neon.h

-- 
2.1.0

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/3] arm: ret_vector.h improvements
  2015-11-02 14:18 [PATCH 0/3] ARM64: NEON ACL implementation Jerin Jacob
@ 2015-11-02 14:18 ` Jerin Jacob
  2015-11-02 14:18   ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jerin Jacob
  0 siblings, 1 reply; 12+ messages in thread
From: Jerin Jacob @ 2015-11-02 14:18 UTC (permalink / raw)
  To: dev

added the definition of rte_xmm and xmm_t for acl noen implementation
removed the emulated _mm_* functions

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 lib/librte_eal/common/include/arch/arm/rte_vect.h | 58 +++++++----------------
 1 file changed, 17 insertions(+), 41 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 7d5de97..21cdb4d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -1,7 +1,7 @@
 /*-
  *   BSD LICENSE
  *
- *   Copyright(c) 2015 RehiveTech. All rights reserved.
+ *   Copyright(c) 2015 Cavium Networks. All rights reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -13,7 +13,7 @@
  *       notice, this list of conditions and the following disclaimer in
  *       the documentation and/or other materials provided with the
  *       distribution.
- *     * Neither the name of RehiveTech nor the names of its
+ *     * Neither the name of Cavium Networks nor the names of its
  *       contributors may be used to endorse or promote products derived
  *       from this software without specific prior written permission.
  *
@@ -33,49 +33,25 @@
 #ifndef _RTE_VECT_ARM_H_
 #define _RTE_VECT_ARM_H_
 
+#include "arm_neon.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define XMM_SIZE 16
-#define XMM_MASK (XMM_MASK - 1)
-
-typedef struct {
-	union uint128 {
-		uint8_t uint8[16];
-		uint32_t uint32[4];
-	} val;
-} __m128i;
-
-static inline __m128i
-_mm_set_epi32(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3)
-{
-	__m128i res;
-
-	res.val.uint32[0] = v0;
-	res.val.uint32[1] = v1;
-	res.val.uint32[2] = v2;
-	res.val.uint32[3] = v3;
-	return res;
-}
-
-static inline __m128i
-_mm_loadu_si128(__m128i *v)
-{
-	__m128i res;
-
-	res = *v;
-	return res;
-}
-
-static inline __m128i
-_mm_load_si128(__m128i *v)
-{
-	__m128i res;
-
-	res = *v;
-	return res;
-}
+typedef int32x4_t xmm_t;
+
+#define	XMM_SIZE	(sizeof(xmm_t))
+#define	XMM_MASK	(XMM_SIZE - 1)
+
+typedef union rte_xmm {
+	xmm_t    x;
+	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
+	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
+	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
+	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
+	double   pd[XMM_SIZE / sizeof(double)];
+} __attribute__((aligned(16))) rte_xmm_t;
 
 #ifdef __cplusplus
 }
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 14:18 ` [PATCH 1/3] arm: ret_vector.h improvements Jerin Jacob
@ 2015-11-02 14:18   ` Jerin Jacob
  2015-11-02 14:18     ` [PATCH 3/3] arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL Jerin Jacob
                       ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Jerin Jacob @ 2015-11-02 14:18 UTC (permalink / raw)
  To: dev

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 app/test-acl/main.c           |   4 +
 lib/librte_acl/Makefile       |   5 +
 lib/librte_acl/acl.h          |   4 +
 lib/librte_acl/acl_run_neon.c |  46 +++++++
 lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
 lib/librte_acl/rte_acl.c      |  25 ++++
 lib/librte_acl/rte_acl.h      |   1 +
 7 files changed, 375 insertions(+)
 create mode 100644 lib/librte_acl/acl_run_neon.c
 create mode 100644 lib/librte_acl/acl_run_neon.h

diff --git a/app/test-acl/main.c b/app/test-acl/main.c
index 72ce83c..0b0c093 100644
--- a/app/test-acl/main.c
+++ b/app/test-acl/main.c
@@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
 		.name = "avx2",
 		.alg = RTE_ACL_CLASSIFY_AVX2,
 	},
+	{
+		.name = "neon",
+		.alg = RTE_ACL_CLASSIFY_NEON,
+	},
 };
 
 static struct {
diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
index 7a1cf8a..27f91d5 100644
--- a/lib/librte_acl/Makefile
+++ b/lib/librte_acl/Makefile
@@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c
+else
 SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
+endif
 
 CFLAGS_acl_run_sse.o += -msse4.1
+CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized
 
 #
 # If the compiler supports AVX2 instructions,
diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
index eb4930c..09d6784 100644
--- a/lib/librte_acl/acl.h
+++ b/lib/librte_acl/acl.h
@@ -230,6 +230,10 @@ int
 rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint32_t *results, uint32_t num, uint32_t categories);
 
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	uint32_t *results, uint32_t num, uint32_t categories);
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
diff --git a/lib/librte_acl/acl_run_neon.c b/lib/librte_acl/acl_run_neon.c
new file mode 100644
index 0000000..b014451
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.c
@@ -0,0 +1,46 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run_neon.h"
+
+int
+rte_acl_classify_neon(const struct rte_acl_ctx *ctx, const uint8_t **data,
+		      uint32_t *results, uint32_t num, uint32_t categories)
+{
+	if (likely(num >= 8))
+		return search_neon_8(ctx, data, results, num, categories);
+	else if (num >= 4)
+		return search_neon_4(ctx, data, results, num, categories);
+	else
+		return rte_acl_classify_scalar(ctx, data, results, num,
+			categories);
+}
diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
new file mode 100644
index 0000000..4579476
--- /dev/null
+++ b/lib/librte_acl/acl_run_neon.h
@@ -0,0 +1,290 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2015.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "acl_run.h"
+#include "acl_vect.h"
+
+struct _neon_acl_const {
+	rte_xmm_t xmm_shuffle_input;
+	rte_xmm_t xmm_index_mask;
+	rte_xmm_t range_base;
+} neon_acl_const  __attribute__((aligned(RTE_CACHE_LINE_SIZE))) = {
+	{
+		.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}
+	},
+	{
+		.u32 = {RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX,
+		RTE_ACL_NODE_INDEX, RTE_ACL_NODE_INDEX}
+	},
+	{
+		.u32 = {0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c}
+	},
+};
+
+/*
+ * Resolve priority for multiple results (neon version).
+ * This consists comparing the priority of the current traversal with the
+ * running set of results for the packet.
+ * For each result, keep a running array of the result (rule number) and
+ * its priority for each category.
+ */
+static inline void
+resolve_priority_neon(uint64_t transition, int n, const struct rte_acl_ctx *ctx,
+		      struct parms *parms,
+		      const struct rte_acl_match_results *p,
+		      uint32_t categories)
+{
+	uint32_t x;
+	int32x4_t results, priority, results1, priority1;
+	uint32x4_t selector;
+	int32_t *saved_results, *saved_priority;
+
+	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
+		saved_results = (int32_t *)(&parms[n].cmplt->results[x]);
+		saved_priority = (int32_t *)(&parms[n].cmplt->priority[x]);
+
+		/* get results and priorities for completed trie */
+		results = vld1q_s32(
+			(const int32_t *)&p[transition].results[x]);
+		priority = vld1q_s32(
+			(const int32_t *)&p[transition].priority[x]);
+
+		/* if this is not the first completed trie */
+		if (parms[n].cmplt->count != ctx->num_tries) {
+			/* get running best results and their priorities */
+			results1 = vld1q_s32(saved_results);
+			priority1 = vld1q_s32(saved_priority);
+
+			/* select results that are highest priority */
+			selector = vcgtq_s32(priority1, priority);
+			results = vbslq_s32(selector, results1, results);
+			priority = vbslq_s32(selector, priority1, priority);
+		}
+
+		/* save running best results and their priorities */
+		vst1q_s32(saved_results, results);
+		vst1q_s32(saved_priority, priority);
+	}
+}
+
+/*
+ * Check for any match in 4 transitions
+ */
+static inline __attribute__((always_inline)) uint32_t
+check_any_match_x4(uint64_t val[])
+{
+	return ((val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH);
+}
+
+static inline __attribute__((always_inline)) void
+acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
+		   struct acl_flow_data *flows, uint64_t transitions[])
+{
+	while (check_any_match_x4(transitions)) {
+		transitions[0] = acl_match_check(transitions[0], slot, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
+			parms, flows, resolve_priority_neon);
+		transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
+			parms, flows, resolve_priority_neon);
+	}
+}
+
+/*
+ * Process 4 transitions (in 2 NEON Q registers) in parallel
+ */
+static inline __attribute__((always_inline)) int32x4_t
+transition4(int32x4_t next_input, const uint64_t *trans, uint64_t transitions[])
+{
+	int32x4x2_t tr_hi_lo;
+	int32x4_t t, in, r;
+	uint32x4_t index_msk, node_type, addr;
+	uint32x4_t dfa_msk, mask, quad_ofs, dfa_ofs;
+
+	/* Move low 32 into tr_hi_lo.val[0] and high 32 into tr_hi_lo.val[1] */
+	tr_hi_lo = vld2q_s32((const int32_t *)transitions);
+
+	/* Calculate the address (array index) for all 4 transitions. */
+
+	index_msk = vld1q_u32((const uint32_t *)&neon_acl_const.xmm_index_mask);
+
+	/* Calc node type and node addr */
+	node_type = vbicq_s32(tr_hi_lo.val[0], index_msk);
+	addr = vandq_s32(tr_hi_lo.val[0], index_msk);
+
+	/* t = 0 */
+	t = veorq_s32(node_type, node_type);
+
+	/* mask for DFA type(0) nodes */
+	dfa_msk = vceqq_u32(node_type, t);
+
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.xmm_shuffle_input);
+	in = vqtbl1q_u8((uint8x16_t)next_input, (uint8x16_t)mask);
+
+	/* DFA calculations. */
+	r = vshrq_n_u32(in, 30); /* div by 64 */
+	mask = vld1q_s32((const int32_t *)&neon_acl_const.range_base);
+	r = vaddq_u8(r, mask);
+	t = vshrq_n_u32(in, 24);
+	r = vqtbl1q_u8((uint8x16_t)tr_hi_lo.val[1], (uint8x16_t)r);
+	dfa_ofs = vsubq_s32(t, r);
+
+	/* QUAD/SINGLE calculations. */
+	t = vcgtq_s8(in, tr_hi_lo.val[1]);
+	t = vabsq_s8(t);
+	t = vpaddlq_u8(t);
+	quad_ofs = vpaddlq_u16(t);
+
+	/* blend DFA and QUAD/SINGLE. */
+	t = vbslq_u8(dfa_msk, dfa_ofs, quad_ofs);
+
+	/* calculate address for next transitions */
+	addr = vaddq_u32(addr, t);
+
+	/* Fill next transitions */
+	transitions[0] = trans[vgetq_lane_u32(addr, 0)];
+	transitions[1] = trans[vgetq_lane_u32(addr, 1)];
+	transitions[2] = trans[vgetq_lane_u32(addr, 2)];
+	transitions[3] = trans[vgetq_lane_u32(addr, 3)];
+
+	return vshrq_n_u32(next_input, CHAR_BIT);
+}
+
+/*
+ * Execute trie traversal with 8 traversals in parallel
+ */
+static inline int
+search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, uint32_t total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[8];
+	struct completion cmplt[8];
+	struct parms parms[8];
+	int32x4_t input0, input1;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 8; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	 /* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+	acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
+
+		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
+		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		input0 = transition4(input0, flows.trans, &index_array[0]);
+		input1 = transition4(input1, flows.trans, &index_array[4]);
+
+		 /* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, &index_array[0]);
+		acl_match_check_x4(4, ctx, parms, &flows, &index_array[4]);
+	}
+
+	return 0;
+}
+
+/*
+ * Execute trie traversal with 4 traversals in parallel
+ */
+static inline int
+search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
+	      uint32_t *results, int total_packets, uint32_t categories)
+{
+	int n;
+	struct acl_flow_data flows;
+	uint64_t index_array[4];
+	struct completion cmplt[4];
+	struct parms parms[4];
+	int32x4_t input;
+
+	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
+		     total_packets, categories, ctx->trans_table);
+
+	for (n = 0; n < 4; n++) {
+		cmplt[n].count = 0;
+		index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
+	}
+
+	/* Check for any matches. */
+	acl_match_check_x4(0, ctx, parms, &flows, index_array);
+
+	while (flows.started > 0) {
+		/* Gather 4 bytes of input data for each stream. */
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
+		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
+
+		/* Process the 4 bytes of input on each stream. */
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+		input = transition4(input, flows.trans, index_array);
+
+		/* Check for any matches. */
+		acl_match_check_x4(0, ctx, parms, &flows, index_array);
+	}
+
+	return 0;
+}
+
diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
index d60219f..e2fdebd 100644
--- a/lib/librte_acl/rte_acl.c
+++ b/lib/librte_acl/rte_acl.c
@@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
 	return -ENOTSUP;
 }
 
+int __attribute__ ((weak))
+rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
+int __attribute__ ((weak))
+rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
+	__rte_unused const uint8_t **data,
+	__rte_unused uint32_t *results,
+	__rte_unused uint32_t num,
+	__rte_unused uint32_t categories)
+{
+	return -ENOTSUP;
+}
+
 static const rte_acl_classify_t classify_fns[] = {
 	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
 	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
 	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
 	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
+	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
 };
 
 /* by default, use always available scalar code path. */
@@ -93,6 +114,9 @@ rte_acl_init(void)
 {
 	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
 
+#ifdef RTE_ARCH_ARM64
+	alg =  RTE_ACL_CLASSIFY_NEON;
+#else
 #ifdef CC_AVX2_SUPPORT
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))
 		alg = RTE_ACL_CLASSIFY_AVX2;
@@ -102,6 +126,7 @@ rte_acl_init(void)
 #endif
 		alg = RTE_ACL_CLASSIFY_SSE;
 
+#endif
 	rte_acl_set_default_classify(alg);
 }
 
diff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h
index 98ef2fc..0979a09 100644
--- a/lib/librte_acl/rte_acl.h
+++ b/lib/librte_acl/rte_acl.h
@@ -270,6 +270,7 @@ enum rte_acl_classify_alg {
 	RTE_ACL_CLASSIFY_SCALAR = 1,  /**< generic implementation. */
 	RTE_ACL_CLASSIFY_SSE = 2,     /**< requires SSE4.1 support. */
 	RTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */
+	RTE_ACL_CLASSIFY_NEON = 4,    /**< requires NEON support. */
 	RTE_ACL_CLASSIFY_NUM          /* should always be the last one. */
 };
 
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 3/3] arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL
  2015-11-02 14:18   ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jerin Jacob
@ 2015-11-02 14:18     ` Jerin Jacob
  2015-11-02 15:39     ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jan Viktorin
  2015-11-02 16:54     ` Ananyev, Konstantin
  2 siblings, 0 replies; 12+ messages in thread
From: Jerin Jacob @ 2015-11-02 14:18 UTC (permalink / raw)
  To: dev

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 79a9533..3e3bf12 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -48,7 +48,6 @@ CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_EAL_HOTPLUG=n
 
 CONFIG_RTE_LIBRTE_LPM=n
-CONFIG_RTE_LIBRTE_ACL=n
 CONFIG_RTE_LIBRTE_TABLE=n
 CONFIG_RTE_LIBRTE_PIPELINE=n
 
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 14:18   ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jerin Jacob
  2015-11-02 14:18     ` [PATCH 3/3] arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL Jerin Jacob
@ 2015-11-02 15:39     ` Jan Viktorin
  2015-11-02 16:19       ` Jerin Jacob
  2015-11-02 16:54     ` Ananyev, Konstantin
  2 siblings, 1 reply; 12+ messages in thread
From: Jan Viktorin @ 2015-11-02 15:39 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev

On Mon, 2 Nov 2015 19:48:40 +0530
Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:

> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  app/test-acl/main.c           |   4 +
>  lib/librte_acl/Makefile       |   5 +
>  lib/librte_acl/acl.h          |   4 +
>  lib/librte_acl/acl_run_neon.c |  46 +++++++
>  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
>  lib/librte_acl/rte_acl.c      |  25 ++++
>  lib/librte_acl/rte_acl.h      |   1 +
>  7 files changed, 375 insertions(+)
>  create mode 100644 lib/librte_acl/acl_run_neon.c
>  create mode 100644 lib/librte_acl/acl_run_neon.h
> 
> diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> index 72ce83c..0b0c093 100644
> --- a/app/test-acl/main.c
> +++ b/app/test-acl/main.c
> @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
>  		.name = "avx2",
>  		.alg = RTE_ACL_CLASSIFY_AVX2,
>  	},
> +	{
> +		.name = "neon",
> +		.alg = RTE_ACL_CLASSIFY_NEON,
> +	},
>  };
>  
>  static struct {
> diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> index 7a1cf8a..27f91d5 100644
> --- a/lib/librte_acl/Makefile
> +++ b/lib/librte_acl/Makefile
> @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c

Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
something like acl_run_neonv8.c...

> +else
>  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> +endif
>  
>  CFLAGS_acl_run_sse.o += -msse4.1
> +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized

From man gcc:

-flax-vector-conversions
 Allow implicit conversions between vectors with differing numbers of elements and/or
 incompatible element types.  This option should not be used for new code.

I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
What is the purpose? Is it necessary?

Jan

>  
>  #
>  # If the compiler supports AVX2 instructions,
> diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> index eb4930c..09d6784 100644
> --- a/lib/librte_acl/acl.h
> +++ b/lib/librte_acl/acl.h
> @@ -230,6 +230,10 @@ int
>  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  	uint32_t *results, uint32_t num, uint32_t categories);
>  
--snip--

-- 
   Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
   System Architect              Web:    www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 15:39     ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jan Viktorin
@ 2015-11-02 16:19       ` Jerin Jacob
  2015-11-02 17:31         ` Jan Viktorin
  0 siblings, 1 reply; 12+ messages in thread
From: Jerin Jacob @ 2015-11-02 16:19 UTC (permalink / raw)
  To: Jan Viktorin; +Cc: dev

On Mon, Nov 02, 2015 at 04:39:37PM +0100, Jan Viktorin wrote:
> On Mon, 2 Nov 2015 19:48:40 +0530
> Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> 
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > ---
> >  app/test-acl/main.c           |   4 +
> >  lib/librte_acl/Makefile       |   5 +
> >  lib/librte_acl/acl.h          |   4 +
> >  lib/librte_acl/acl_run_neon.c |  46 +++++++
> >  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
> >  lib/librte_acl/rte_acl.c      |  25 ++++
> >  lib/librte_acl/rte_acl.h      |   1 +
> >  7 files changed, 375 insertions(+)
> >  create mode 100644 lib/librte_acl/acl_run_neon.c
> >  create mode 100644 lib/librte_acl/acl_run_neon.h
> > 
> > diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> > index 72ce83c..0b0c093 100644
> > --- a/app/test-acl/main.c
> > +++ b/app/test-acl/main.c
> > @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
> >  		.name = "avx2",
> >  		.alg = RTE_ACL_CLASSIFY_AVX2,
> >  	},
> > +	{
> > +		.name = "neon",
> > +		.alg = RTE_ACL_CLASSIFY_NEON,
> > +	},
> >  };
> >  
> >  static struct {
> > diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> > index 7a1cf8a..27f91d5 100644
> > --- a/lib/librte_acl/Makefile
> > +++ b/lib/librte_acl/Makefile
> > @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> > +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> > +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c
> 
> Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
> something like acl_run_neonv8.c...
> 

Yes, bit of armv8 specific, looks like vqtbl1q_u8 NEON instrinsics
defined only in armv8. I could rename to acl_run_neonv8.c but keeping
as acl_run_neon.c, may in future it can be extend to armv7 also.
I am open to any decision, let me know your views.

> > +else
> >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> > +endif
> >  
> >  CFLAGS_acl_run_sse.o += -msse4.1
> > +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized
> 
> From man gcc:
> 
> -flax-vector-conversions
>  Allow implicit conversions between vectors with differing numbers of elements and/or
>  incompatible element types.  This option should not be used for new code.
> 
> I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
> What is the purpose? Is it necessary?

Yes, the same tr hi value we can representing as unsigned and signed
based on it DFA or QRANGE .


> 
> Jan
> 
> >  
> >  #
> >  # If the compiler supports AVX2 instructions,
> > diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> > index eb4930c..09d6784 100644
> > --- a/lib/librte_acl/acl.h
> > +++ b/lib/librte_acl/acl.h
> > @@ -230,6 +230,10 @@ int
> >  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
> >  	uint32_t *results, uint32_t num, uint32_t categories);
> >  
> --snip--
> 
> -- 
>    Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
>    System Architect              Web:    www.RehiveTech.com
>    RehiveTech
>    Brno, Czech Republic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 14:18   ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jerin Jacob
  2015-11-02 14:18     ` [PATCH 3/3] arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL Jerin Jacob
  2015-11-02 15:39     ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jan Viktorin
@ 2015-11-02 16:54     ` Ananyev, Konstantin
  2015-11-03  4:30       ` Jerin Jacob
  2 siblings, 1 reply; 12+ messages in thread
From: Ananyev, Konstantin @ 2015-11-02 16:54 UTC (permalink / raw)
  To: Jerin Jacob, dev

Hi Jacob,

> diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> index d60219f..e2fdebd 100644
> --- a/lib/librte_acl/rte_acl.c
> +++ b/lib/librte_acl/rte_acl.c
> @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
>  	return -ENOTSUP;
>  }
> 
> +int __attribute__ ((weak))
> +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> +	__rte_unused const uint8_t **data,
> +	__rte_unused uint32_t *results,
> +	__rte_unused uint32_t num,
> +	__rte_unused uint32_t categories)
> +{
> +	return -ENOTSUP;
> +}
> +
> +int __attribute__ ((weak))
> +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> +	__rte_unused const uint8_t **data,
> +	__rte_unused uint32_t *results,
> +	__rte_unused uint32_t num,
> +	__rte_unused uint32_t categories)
> +{
> +	return -ENOTSUP;
> +}
> +
>  static const rte_acl_classify_t classify_fns[] = {
>  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
>  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
>  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
>  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
>  };
> 
>  /* by default, use always available scalar code path. */
> @@ -93,6 +114,9 @@ rte_acl_init(void)
>  {
>  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> 
> +#ifdef RTE_ARCH_ARM64
> +	alg =  RTE_ACL_CLASSIFY_NEON;
> +#else

On ARM, is there any specific cpu flag that you can use to determine is NEON
isa is supported or not?
It would be good to avoid extra conditional compilation here if possible.
Another question - did I get it right that NEON isa is supported on all
possible RTE_ARCH_ARM64 cpu models you plan to support?
Konstantin

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 16:19       ` Jerin Jacob
@ 2015-11-02 17:31         ` Jan Viktorin
  0 siblings, 0 replies; 12+ messages in thread
From: Jan Viktorin @ 2015-11-02 17:31 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev

On Mon, 2 Nov 2015 21:49:54 +0530
Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:

> On Mon, Nov 02, 2015 at 04:39:37PM +0100, Jan Viktorin wrote:
> > On Mon, 2 Nov 2015 19:48:40 +0530
> > Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> >   
> > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > ---
> > >  app/test-acl/main.c           |   4 +
> > >  lib/librte_acl/Makefile       |   5 +
> > >  lib/librte_acl/acl.h          |   4 +
> > >  lib/librte_acl/acl_run_neon.c |  46 +++++++
> > >  lib/librte_acl/acl_run_neon.h | 290 ++++++++++++++++++++++++++++++++++++++++++
> > >  lib/librte_acl/rte_acl.c      |  25 ++++
> > >  lib/librte_acl/rte_acl.h      |   1 +
> > >  7 files changed, 375 insertions(+)
> > >  create mode 100644 lib/librte_acl/acl_run_neon.c
> > >  create mode 100644 lib/librte_acl/acl_run_neon.h
> > > 
> > > diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> > > index 72ce83c..0b0c093 100644
> > > --- a/app/test-acl/main.c
> > > +++ b/app/test-acl/main.c
> > > @@ -101,6 +101,10 @@ static const struct acl_alg acl_alg[] = {
> > >  		.name = "avx2",
> > >  		.alg = RTE_ACL_CLASSIFY_AVX2,
> > >  	},
> > > +	{
> > > +		.name = "neon",
> > > +		.alg = RTE_ACL_CLASSIFY_NEON,
> > > +	},
> > >  };
> > >  
> > >  static struct {
> > > diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile
> > > index 7a1cf8a..27f91d5 100644
> > > --- a/lib/librte_acl/Makefile
> > > +++ b/lib/librte_acl/Makefile
> > > @@ -48,9 +48,14 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += rte_acl.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_bld.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_gen.c
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_scalar.c
> > > +ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
> > > +SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_neon.c  
> > 
> > Are the used NEON instrinsics for ACL ARMv8-specific? If so, the file should be named
> > something like acl_run_neonv8.c...
> >   
> 
> Yes, bit of armv8 specific, looks like vqtbl1q_u8 NEON instrinsics
> defined only in armv8. I could rename to acl_run_neonv8.c but keeping
> as acl_run_neon.c, may in future it can be extend to armv7 also.
> I am open to any decision, let me know your views.

OK, this sounds reasonable. Leave it as it is.

> 
> > > +else
> > >  SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c
> > > +endif
> > >  
> > >  CFLAGS_acl_run_sse.o += -msse4.1
> > > +CFLAGS_acl_run_neon.o += -flax-vector-conversions -Wno-maybe-uninitialized  
> > 
> > From man gcc:
> > 
> > -flax-vector-conversions
> >  Allow implicit conversions between vectors with differing numbers of elements and/or
> >  incompatible element types.  This option should not be used for new code.
> > 
> > I've already pointed to this in the Dave's ARMv8 patchset. They dropped it silently.
> > What is the purpose? Is it necessary?  
> 
> Yes, the same tr hi value we can representing as unsigned and signed
> based on it DFA or QRANGE .

I don't understand your answer. What is "tr hi"? What means DFA and
QRANGE here?

I just wanted to point to the note: "This option should not be used for
new code."

Jan

> 
> 
> > 
> > Jan
> >   
> > >  
> > >  #
> > >  # If the compiler supports AVX2 instructions,
> > > diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h
> > > index eb4930c..09d6784 100644
> > > --- a/lib/librte_acl/acl.h
> > > +++ b/lib/librte_acl/acl.h
> > > @@ -230,6 +230,10 @@ int
> > >  rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,
> > >  	uint32_t *results, uint32_t num, uint32_t categories);
> > >    
> > --snip--
> > 
> > -- 
> >    Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
> >    System Architect              Web:    www.RehiveTech.com
> >    RehiveTech
> >    Brno, Czech Republic  



-- 
   Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
   System Architect              Web:    www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-02 16:54     ` Ananyev, Konstantin
@ 2015-11-03  4:30       ` Jerin Jacob
  2015-11-03 10:23         ` Ananyev, Konstantin
  0 siblings, 1 reply; 12+ messages in thread
From: Jerin Jacob @ 2015-11-03  4:30 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Mon, Nov 02, 2015 at 04:54:24PM +0000, Ananyev, Konstantin wrote:
> Hi Jacob,
> 
> > diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> > index d60219f..e2fdebd 100644
> > --- a/lib/librte_acl/rte_acl.c
> > +++ b/lib/librte_acl/rte_acl.c
> > @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
> >  	return -ENOTSUP;
> >  }
> > 
> > +int __attribute__ ((weak))
> > +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> > +	__rte_unused const uint8_t **data,
> > +	__rte_unused uint32_t *results,
> > +	__rte_unused uint32_t num,
> > +	__rte_unused uint32_t categories)
> > +{
> > +	return -ENOTSUP;
> > +}
> > +
> > +int __attribute__ ((weak))
> > +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> > +	__rte_unused const uint8_t **data,
> > +	__rte_unused uint32_t *results,
> > +	__rte_unused uint32_t num,
> > +	__rte_unused uint32_t categories)
> > +{
> > +	return -ENOTSUP;
> > +}
> > +
> >  static const rte_acl_classify_t classify_fns[] = {
> >  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
> >  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> >  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
> >  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> > +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
> >  };
> > 
> >  /* by default, use always available scalar code path. */
> > @@ -93,6 +114,9 @@ rte_acl_init(void)
> >  {
> >  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> > 
> > +#ifdef RTE_ARCH_ARM64
> > +	alg =  RTE_ACL_CLASSIFY_NEON;
> > +#else

Hi Konstantin,
> 
> On ARM, is there any specific cpu flag that you can use to determine is NEON
> isa is supported or not?

Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.

> It would be good to avoid extra conditional compilation here if possible.
neon acl is verified/ported only on armv8. While adding the armv7 support the
check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7

> Another question - did I get it right that NEON isa is supported on all
> possible RTE_ARCH_ARM64 cpu models you plan to support?

Yes

> Konstantin
> 
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-03  4:30       ` Jerin Jacob
@ 2015-11-03 10:23         ` Ananyev, Konstantin
  2015-11-03 10:35           ` Jan Viktorin
  0 siblings, 1 reply; 12+ messages in thread
From: Ananyev, Konstantin @ 2015-11-03 10:23 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev


Hi Jacob,

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Tuesday, November 03, 2015 4:31 AM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; thomas.monjalon@6wind.com; Hunt, David; viktorin@rehivetech.com
> Subject: Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
> 
> On Mon, Nov 02, 2015 at 04:54:24PM +0000, Ananyev, Konstantin wrote:
> > Hi Jacob,
> >
> > > diff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c
> > > index d60219f..e2fdebd 100644
> > > --- a/lib/librte_acl/rte_acl.c
> > > +++ b/lib/librte_acl/rte_acl.c
> > > @@ -55,11 +55,32 @@ rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,
> > >  	return -ENOTSUP;
> > >  }
> > >
> > > +int __attribute__ ((weak))
> > > +rte_acl_classify_sse(__rte_unused const struct rte_acl_ctx *ctx,
> > > +	__rte_unused const uint8_t **data,
> > > +	__rte_unused uint32_t *results,
> > > +	__rte_unused uint32_t num,
> > > +	__rte_unused uint32_t categories)
> > > +{
> > > +	return -ENOTSUP;
> > > +}
> > > +
> > > +int __attribute__ ((weak))
> > > +rte_acl_classify_neon(__rte_unused const struct rte_acl_ctx *ctx,
> > > +	__rte_unused const uint8_t **data,
> > > +	__rte_unused uint32_t *results,
> > > +	__rte_unused uint32_t num,
> > > +	__rte_unused uint32_t categories)
> > > +{
> > > +	return -ENOTSUP;
> > > +}
> > > +
> > >  static const rte_acl_classify_t classify_fns[] = {
> > >  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
> > >  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> > >  	[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,
> > >  	[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,
> > > +	[RTE_ACL_CLASSIFY_NEON] = rte_acl_classify_neon,
> > >  };
> > >
> > >  /* by default, use always available scalar code path. */
> > > @@ -93,6 +114,9 @@ rte_acl_init(void)
> > >  {
> > >  	enum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;
> > >
> > > +#ifdef RTE_ARCH_ARM64
> > > +	alg =  RTE_ACL_CLASSIFY_NEON;
> > > +#else
> 
> Hi Konstantin,
> >
> > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > isa is supported or not?
> 
> Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> 
> > It would be good to avoid extra conditional compilation here if possible.
> neon acl is verified/ported only on armv8. While adding the armv7 support the
> check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7

Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
It is probably ok like that, but with all that conditional compilations it gets too messy.
Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
Konstantin

> 
> > Another question - did I get it right that NEON isa is supported on all
> > possible RTE_ARCH_ARM64 cpu models you plan to support?
> 
> Yes
> 
> > Konstantin
> >
> >

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-03 10:23         ` Ananyev, Konstantin
@ 2015-11-03 10:35           ` Jan Viktorin
  2015-11-03 13:20             ` Ananyev, Konstantin
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Viktorin @ 2015-11-03 10:35 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Tue, 3 Nov 2015 10:23:55 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> > Hi Konstantin,  
> > >
> > > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > > isa is supported or not?  
> > 
> > Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> >   
> > > It would be good to avoid extra conditional compilation here if possible.  
> > neon acl is verified/ported only on armv8. While adding the armv7 support the
> > check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7  
> 
> Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
> It is probably ok like that, but with all that conditional compilations it gets too messy.
> Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
> can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
> Konstantin

The ARMv7 provides "v7l" in the AT_PLATFORM and the ARMv8 gives
"aarach64". However, I am afraid, as these are two distinct platforms
(with distinct cpuflags), we cannot use the cpuflags easily for this.

Is there an API in DPDK which allows to ask globally: "what platform am
I running on?"?

Jan

> 
> >   
> > > Another question - did I get it right that NEON isa is supported on all
> > > possible RTE_ARCH_ARM64 cpu models you plan to support?  
> > 
> > Yes


-- 
  Jan Viktorin                E-mail: Viktorin@RehiveTech.com
  System Architect            Web:    www.RehiveTech.com
  RehiveTech
  Brno, Czech Republic

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
  2015-11-03 10:35           ` Jan Viktorin
@ 2015-11-03 13:20             ` Ananyev, Konstantin
  0 siblings, 0 replies; 12+ messages in thread
From: Ananyev, Konstantin @ 2015-11-03 13:20 UTC (permalink / raw)
  To: Jan Viktorin; +Cc: dev



> -----Original Message-----
> From: Jan Viktorin [mailto:viktorin@rehivetech.com]
> Sent: Tuesday, November 03, 2015 10:36 AM
> To: Ananyev, Konstantin
> Cc: Jerin Jacob; dev@dpdk.org; thomas.monjalon@6wind.com; Hunt, David
> Subject: Re: [PATCH 2/3] arm64: acl: add neon based acl implementation
> 
> On Tue, 3 Nov 2015 10:23:55 +0000
> "Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> 
> > > Hi Konstantin,
> > > >
> > > > On ARM, is there any specific cpu flag that you can use to determine is NEON
> > > > isa is supported or not?
> > >
> > > Yes, on armv7(RTE_CPUFLAG_NEON). On armv8-a NEON is mandatory.
> > >
> > > > It would be good to avoid extra conditional compilation here if possible.
> > > neon acl is verified/ported only on armv8. While adding the armv7 support the
> > > check can be extended for cpuflag based on RTE_CPUFLAG_NEON on armv7
> >
> > Ok, and is there a flag that allows to distinguish between armv7 and arm8 then at runtime?
> > It is probably ok like that, but with all that conditional compilations it gets too messy.
> > Another thing - if you can distinguish between armv7 and armv8 at runtime, then you probably
> > can set alg = RTE_ACL_CLASSIFY_DEFAULT for armv7 and alg = RTE_ACL_CLASSIFY_NEON for arm8?
> > Konstantin
> 
> The ARMv7 provides "v7l" in the AT_PLATFORM and the ARMv8 gives
> "aarach64". However, I am afraid, as these are two distinct platforms
> (with distinct cpuflags), we cannot use the cpuflags easily for this.
> 

Ok, if you think it is unavoidable - let's keep it like that for now.

> Is there an API in DPDK which allows to ask globally: "what platform am
> I running on?"?

Not that I am aware about.
Konstantin

> 
> Jan
> 
> >
> > >
> > > > Another question - did I get it right that NEON isa is supported on all
> > > > possible RTE_ARCH_ARM64 cpu models you plan to support?
> > >
> > > Yes
> 
> 
> --
>   Jan Viktorin                E-mail: Viktorin@RehiveTech.com
>   System Architect            Web:    www.RehiveTech.com
>   RehiveTech
>   Brno, Czech Republic

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2015-11-03 13:21 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-02 14:18 [PATCH 0/3] ARM64: NEON ACL implementation Jerin Jacob
2015-11-02 14:18 ` [PATCH 1/3] arm: ret_vector.h improvements Jerin Jacob
2015-11-02 14:18   ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jerin Jacob
2015-11-02 14:18     ` [PATCH 3/3] arm64: defconfig: enabled CONFIG_RTE_LIBRTE_ACL Jerin Jacob
2015-11-02 15:39     ` [PATCH 2/3] arm64: acl: add neon based acl implementation Jan Viktorin
2015-11-02 16:19       ` Jerin Jacob
2015-11-02 17:31         ` Jan Viktorin
2015-11-02 16:54     ` Ananyev, Konstantin
2015-11-03  4:30       ` Jerin Jacob
2015-11-03 10:23         ` Ananyev, Konstantin
2015-11-03 10:35           ` Jan Viktorin
2015-11-03 13:20             ` Ananyev, Konstantin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.