All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] Virtio NEON support for ARM
@ 2016-06-27 11:54 Jerin Jacob
  2016-06-27 11:54 ` [PATCH 1/4] virtio: Fix compile time dependency of use_simple_rxtx usage Jerin Jacob
                   ` (5 more replies)
  0 siblings, 6 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 11:54 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

This patchset includes,

1) General cleanup on compile time dependency of use_simple_rxtx with RTE_MACHINE_CPUFLAG_SSSE3
2) Added NEON support for optimized Rx handling

This patchset is based on dpdk-next-virtio/master at a1d8bd4911b28e32c35f16ab2ff3e22180d1f1d7

Jerin Jacob (4):
  virtio: Fix compile time dependency of use_simple_rxtx usage
  virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  virtio: move SSE based Rx implementation to separate file
  virtio: add neon support

 MAINTAINERS                                  |   1 +
 config/common_base                           |   1 +
 config/defconfig_arm-armv7a-linuxapp-gcc     |   1 +
 config/defconfig_ppc_64-power8-linuxapp-gcc  |   1 +
 config/defconfig_tile-tilegx-linuxapp-gcc    |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/virtio/Makefile                  |   3 -
 drivers/net/virtio/virtio_pci.h              |   1 +
 drivers/net/virtio/virtio_rxtx.c             |  32 ++--
 drivers/net/virtio/virtio_rxtx.h             |   3 +-
 drivers/net/virtio/virtio_rxtx_simple.c      | 168 +------------------
 drivers/net/virtio/virtio_rxtx_simple_neon.h | 238 +++++++++++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.h  | 225 +++++++++++++++++++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   1 +
 14 files changed, 490 insertions(+), 188 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h

-- 
2.5.5

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH 1/4] virtio: Fix compile time dependency of use_simple_rxtx usage
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
@ 2016-06-27 11:54 ` Jerin Jacob
  2016-06-27 11:54 ` [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR Jerin Jacob
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 11:54 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/Makefile             |  3 ---
 drivers/net/virtio/virtio_pci.h         |  1 +
 drivers/net/virtio/virtio_rxtx.c        | 28 +++++++++-------------------
 drivers/net/virtio/virtio_rxtx.h        |  3 +--
 drivers/net/virtio/virtio_rxtx_simple.c |  8 ++++++--
 drivers/net/virtio/virtio_user_ethdev.c |  1 +
 6 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 43de46c..114d40e 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif
 
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
 	uint8_t	    use_msix;
 	uint8_t     started;
 	uint8_t     modern;
+	uint8_t     use_simple_rxtx;
 	uint8_t     mac_addr[ETHER_ADDR_LEN];
 	uint32_t    notify_off_multiplier;
 	uint8_t     *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..63b53f7 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
 #define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
 	ETH_TXQ_FLAGS_NOOFFLOADS)
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
 static void
 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
 {
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 	 */
 	uint16_t i;
 	uint16_t desc_idx;
+	struct virtio_hw *hw = dev->data->dev_private;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		nbufs = 0;
 		error = ENOSPC;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			for (desc_idx = 0; desc_idx < vq->vq_nentries;
 			     desc_idx++) {
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 					VRING_DESC_F_WRITE;
 			}
 		}
-#endif
+
 		memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
 		for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
 		     desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			/******************************************
 			*         Enqueue allocated buffers        *
 			*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-			if (use_simple_rxtx)
+			if (hw->use_simple_rxtx)
 				error = virtqueue_enqueue_recv_refill_simple(vq, m);
 			else
-#endif
 				error = virtqueue_enqueue_recv_refill(vq, m);
+
 			if (error) {
 				rte_pktmbuf_free(m);
 				break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		struct virtqueue *vq = txvq->vq;
 
 		virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			uint16_t mid_idx  = vq->vq_nentries >> 1;
 
 			for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			     desc_idx++)
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
 		}
-#endif
+
 		VIRTQUEUE_DUMP(vq);
 	}
 }
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
 
 	dev->data->rx_queues[queue_idx] = rxvq;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 	virtio_rxq_vec_setup(rxvq);
-#endif
 
 	return 0;
 }
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 {
 	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-	struct virtio_hw *hw = dev->data->dev_private;
-#endif
 	struct virtnet_tx *txvq;
 	struct virtqueue *vq;
 	uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	}
 
 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
+	struct virtio_hw *hw = dev->data->dev_private;
 	/* Use simple rx/tx func if single segment and no offloads */
 	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
 	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
 		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
 		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
 		dev->rx_pkt_burst = virtio_recv_pkts_vec;
-		use_simple_rxtx = 1;
+		hw->use_simple_rxtx = 1;
 	}
 #endif
 
diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
 	const struct rte_memzone *mz;   /**< mem zone to populate RX ring. */
 };
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);
 
 int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	struct rte_mbuf *m);
-#endif
+
 #endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
 #include <string.h>
 #include <errno.h>
 
-#include <tmmintrin.h>
-
 #include <rte_cycles.h>
 #include <rte_memory.h>
 #include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
 	vq_update_avail_idx(vq);
 }
 
+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
 /* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
  *
  * This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return nb_pkts_received;
 }
 
+#endif
+
 #define VIRTIO_TX_FREE_THRESH 32
 #define VIRTIO_TX_MAX_FREE_BUF_SZ 32
 #define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 9216182..ce6ce91 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
 	hw->vtpci_ops = &virtio_user_ops;
 	hw->use_msix = 0;
 	hw->modern   = 0;
+	hw->use_simple_rxtx = 0;
 	hw->virtio_user_dev = dev;
 	data->dev_private = hw;
 	data->numa_node = SOCKET_ID_ANY;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
  2016-06-27 11:54 ` [PATCH 1/4] virtio: Fix compile time dependency of use_simple_rxtx usage Jerin Jacob
@ 2016-06-27 11:54 ` Jerin Jacob
  2016-06-27 14:19   ` Thomas Monjalon
  2016-06-27 11:54 ` [PATCH 3/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 11:54 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

like other PMD drivers, introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
for vector based handler selection in virtio

Enabled by default in common config and disabled for non X86
platforms

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 config/common_base                          | 1 +
 config/defconfig_arm-armv7a-linuxapp-gcc    | 1 +
 config/defconfig_arm64-armv8a-linuxapp-gcc  | 1 +
 config/defconfig_ppc_64-power8-linuxapp-gcc | 1 +
 config/defconfig_tile-tilegx-linuxapp-gcc   | 1 +
 drivers/net/virtio/virtio_rxtx.c            | 2 ++
 6 files changed, 7 insertions(+)

diff --git a/config/common_base b/config/common_base
index 3a04fba..f6ce168 100644
--- a/config/common_base
+++ b/config/common_base
@@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
 CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
 CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
 CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
 
 #
 # Compile burst-oriented VMXNET3 PMD driver
diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
index bde6acd..a249ad5 100644
--- a/config/defconfig_arm-armv7a-linuxapp-gcc
+++ b/config/defconfig_arm-armv7a-linuxapp-gcc
@@ -75,3 +75,4 @@ CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
 CONFIG_RTE_LIBRTE_PMD_XENVIRT=n
 CONFIG_RTE_LIBRTE_PMD_BNX2X=n
 CONFIG_RTE_LIBRTE_QEDE_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index a786562..95ed30e 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -48,5 +48,6 @@ CONFIG_RTE_IXGBE_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
 
 CONFIG_RTE_SCHED_VECTOR=n
diff --git a/config/defconfig_ppc_64-power8-linuxapp-gcc b/config/defconfig_ppc_64-power8-linuxapp-gcc
index bef8f49..1eca73a 100644
--- a/config/defconfig_ppc_64-power8-linuxapp-gcc
+++ b/config/defconfig_ppc_64-power8-linuxapp-gcc
@@ -51,6 +51,7 @@ CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
 CONFIG_RTE_LIBRTE_IXGBE_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
 CONFIG_RTE_LIBRTE_VIRTIO_PMD=y
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
 CONFIG_RTE_LIBRTE_PMD_BOND=n
 CONFIG_RTE_LIBRTE_ENIC_PMD=n
diff --git a/config/defconfig_tile-tilegx-linuxapp-gcc b/config/defconfig_tile-tilegx-linuxapp-gcc
index 5a50793..0d6fe1e 100644
--- a/config/defconfig_tile-tilegx-linuxapp-gcc
+++ b/config/defconfig_tile-tilegx-linuxapp-gcc
@@ -59,6 +59,7 @@ CONFIG_RTE_LIBRTE_IXGBE_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_VIRTIO_PMD=n
+CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_VMXNET3_PMD=n
 CONFIG_RTE_LIBRTE_ENIC_PMD=n
 
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 63b53f7..e9b42f3 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -499,6 +499,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
+#ifdef RTE_LIBRTE_VIRTIO_INC_VECTOR
 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
 	struct virtio_hw *hw = dev->data->dev_private;
 	/* Use simple rx/tx func if single segment and no offloads */
@@ -510,6 +511,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		hw->use_simple_rxtx = 1;
 	}
 #endif
+#endif
 
 	ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
 			nb_desc, socket_id, (void **)&txvq);
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 3/4] virtio: move SSE based Rx implementation to separate file
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
  2016-06-27 11:54 ` [PATCH 1/4] virtio: Fix compile time dependency of use_simple_rxtx usage Jerin Jacob
  2016-06-27 11:54 ` [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR Jerin Jacob
@ 2016-06-27 11:54 ` Jerin Jacob
  2016-06-28  6:17   ` Jianbo Liu
  2016-06-27 11:54 ` [PATCH 4/4] virtio: add neon support Jerin Jacob
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 11:54 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

split out SSE instruction based virtio simple rx
implementation to a separate file

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/virtio_rxtx_simple.c     | 166 +-------------------
 drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+), 165 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h

diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..ca87605 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -130,171 +130,7 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
 }
 
 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-	uint16_t nb_pkts)
-{
-	struct virtnet_rx *rxvq = rx_queue;
-	struct virtqueue *vq = rxvq->vq;
-	uint16_t nb_used;
-	uint16_t desc_idx;
-	struct vring_used_elem *rused;
-	struct rte_mbuf **sw_ring;
-	struct rte_mbuf **sw_ring_end;
-	uint16_t nb_pkts_received;
-	__m128i shuf_msk1, shuf_msk2, len_adjust;
-
-	shuf_msk1 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		5, 4,			/* dat len */
-		0xFF, 0xFF, 5, 4,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-
-	);
-
-	shuf_msk2 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		13, 12,			/* dat len */
-		0xFF, 0xFF, 13, 12,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-	);
-
-	/* Subtract the header length.
-	*  In which case do we need the header length in used->len ?
-	*/
-	len_adjust = _mm_set_epi16(
-		0, 0,
-		0,
-		(uint16_t)-vq->hw->vtnet_hdr_size,
-		0, (uint16_t)-vq->hw->vtnet_hdr_size,
-		0, 0);
-
-	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
-		return 0;
-
-	nb_used = VIRTQUEUE_NUSED(vq);
-
-	rte_compiler_barrier();
-
-	if (unlikely(nb_used == 0))
-		return 0;
-
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
-	nb_used = RTE_MIN(nb_used, nb_pkts);
-
-	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
-	rused = &vq->vq_ring.used->ring[desc_idx];
-	sw_ring  = &vq->sw_ring[desc_idx];
-	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
-	_mm_prefetch((const void *)rused, _MM_HINT_T0);
-
-	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
-		virtio_rxq_rearm_vec(rxvq);
-		if (unlikely(virtqueue_kick_prepare(vq)))
-			virtqueue_notify(vq);
-	}
-
-	for (nb_pkts_received = 0;
-		nb_pkts_received < nb_used;) {
-		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
-		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
-		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
-		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
-		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
-		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
-		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
-		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
-		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
-		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
-		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
-		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
-		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
-		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
-		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
-		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
-		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
-			pkt_mb[1]);
-		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
-			pkt_mb[0]);
-
-		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
-		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
-		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
-		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
-			pkt_mb[3]);
-		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
-			pkt_mb[2]);
-
-		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
-		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
-		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
-		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
-			pkt_mb[5]);
-		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
-			pkt_mb[4]);
-
-		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
-		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
-		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
-		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
-			pkt_mb[7]);
-		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
-			pkt_mb[6]);
-
-		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
-			if (sw_ring + nb_used <= sw_ring_end)
-				nb_pkts_received += nb_used;
-			else
-				nb_pkts_received += sw_ring_end - sw_ring;
-			break;
-		} else {
-			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
-				sw_ring_end)) {
-				nb_pkts_received += sw_ring_end - sw_ring;
-				break;
-			} else {
-				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
-				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
-				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
-				rused   += RTE_VIRTIO_DESC_PER_LOOP;
-				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
-			}
-		}
-	}
-
-	vq->vq_used_cons_idx += nb_pkts_received;
-	vq->vq_free_cnt += nb_pkts_received;
-	rxvq->stats.packets += nb_pkts_received;
-	return nb_pkts_received;
-}
-
+#include "virtio_rxtx_simple_sse.h"
 #endif
 
 #define VIRTIO_TX_FREE_THRESH 32
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.h b/drivers/net/virtio/virtio_rxtx_simple_sse.h
new file mode 100644
index 0000000..4e8728f
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.h
@@ -0,0 +1,225 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+	__m128i shuf_msk1, shuf_msk2, len_adjust;
+
+	shuf_msk1 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		5, 4,			/* dat len */
+		0xFF, 0xFF, 5, 4,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+
+	);
+
+	shuf_msk2 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		13, 12,			/* dat len */
+		0xFF, 0xFF, 13, 12,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+	);
+
+	/* Subtract the header length.
+	*  In which case do we need the header length in used->len ?
+	*/
+	len_adjust = _mm_set_epi16(
+		0, 0,
+		0,
+		(uint16_t)-vq->hw->vtnet_hdr_size,
+		0, (uint16_t)-vq->hw->vtnet_hdr_size,
+		0, 0);
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_compiler_barrier();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	_mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 4/4] virtio: add neon support
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
                   ` (2 preceding siblings ...)
  2016-06-27 11:54 ` [PATCH 3/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-06-27 11:54 ` Jerin Jacob
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
  2016-07-01 11:19 ` [PATCH v2 0/3] " Jerin Jacob
  5 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 11:54 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Added neon based Rx vector implementation for virtio.
Selected neon based virtio implementation for ARM64 as
default and updated the MAINTAINERS file.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 MAINTAINERS                                  |   1 +
 config/defconfig_arm64-armv8a-linuxapp-gcc   |   1 -
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/virtio/virtio_rxtx.c             |   2 +-
 drivers/net/virtio/virtio_rxtx_simple.c      |   2 +
 drivers/net/virtio/virtio_rxtx_simple_neon.h | 238 +++++++++++++++++++++++++++
 6 files changed, 244 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index f6c0d3d..2bb12aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -142,6 +142,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: drivers/net/virtio/virtio_rxtx_simple_neon.h
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index 95ed30e..a786562 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -48,6 +48,5 @@ CONFIG_RTE_IXGBE_INC_VECTOR=n
 CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
-CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=n
 
 CONFIG_RTE_SCHED_VECTOR=n
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 2694f50..3187a33 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -118,6 +118,8 @@ New Features
   * Root privilege is a must for sorting hugepages by physical address.
   * Can only be used with vhost user backend.
 
+* **Virtio NEON support for ARM.**
+
 Resolved Issues
 ---------------
 
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index e9b42f3..ca25db3 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -500,7 +500,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	}
 
 #ifdef RTE_LIBRTE_VIRTIO_INC_VECTOR
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+#if defined RTE_MACHINE_CPUFLAG_SSSE3 || defined RTE_MACHINE_CPUFLAG_NEON
 	struct virtio_hw *hw = dev->data->dev_private;
 	/* Use simple rx/tx func if single segment and no offloads */
 	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index ca87605..e5dc010 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -131,6 +131,8 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
 
 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
 #include "virtio_rxtx_simple_sse.h"
+#elif RTE_MACHINE_CPUFLAG_NEON
+#include "virtio_rxtx_simple_neon.h"
 #endif
 
 #define VIRTIO_TX_FREE_THRESH 32
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.h b/drivers/net/virtio/virtio_rxtx_simple_neon.h
new file mode 100644
index 0000000..41f347d
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.h
@@ -0,0 +1,238 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		4, 5, 0xFF, 0xFF,       /* pkt len */
+		4, 5,                   /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		12, 13, 0xFF, 0xFF,     /* pkt len */
+		12, 13,                 /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length.
+	 *  In which case do we need the header length in used->len ?
+	 */
+	uint16x8_t len_adjust = {
+		0, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size,
+		0,
+		0, 0
+	};
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_rmb();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	rte_prefetch_non_temporal(rused);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-27 11:54 ` [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR Jerin Jacob
@ 2016-06-27 14:19   ` Thomas Monjalon
  2016-06-27 14:48     ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Thomas Monjalon @ 2016-06-27 14:19 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

2016-06-27 17:24, Jerin Jacob:
> --- a/config/common_base
> +++ b/config/common_base
> @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
>  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
>  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
>  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y

I don't remember what means INC_VECTOR?
Why a config option is needed for vector implementations?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-27 14:19   ` Thomas Monjalon
@ 2016-06-27 14:48     ` Jerin Jacob
  2016-06-27 14:59       ` Thomas Monjalon
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-27 14:48 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> 2016-06-27 17:24, Jerin Jacob:
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> 
> I don't remember what means INC_VECTOR?
> Why a config option is needed for vector implementations?

I thought of adding additional configuration option(INC_VECTOR) _apart_ from
cpu flag based scheme in the patch because even though if a given platform
has cpu instruction support, in some platforms scalar version may
perform well wrt vector version(based on instruction latency, emulation required or not
etc). So a top level flag INC_VECTOR, can override the vector selection
for a given platform if required.

Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
followed the existing flags)
$ grep "INC_VECTOR" config/common_base
CONFIG_RTE_IXGBE_INC_VECTOR=y
CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y

Jerin

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-27 14:48     ` Jerin Jacob
@ 2016-06-27 14:59       ` Thomas Monjalon
  2016-06-29 11:18         ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Thomas Monjalon @ 2016-06-27 14:59 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

2016-06-27 20:18, Jerin Jacob:
> On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> > 2016-06-27 17:24, Jerin Jacob:
> > > --- a/config/common_base
> > > +++ b/config/common_base
> > > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> > 
> > I don't remember what means INC_VECTOR?
> > Why a config option is needed for vector implementations?
> 
> I thought of adding additional configuration option(INC_VECTOR) _apart_ from
> cpu flag based scheme in the patch because even though if a given platform
> has cpu instruction support, in some platforms scalar version may
> perform well wrt vector version(based on instruction latency, emulation required or not
> etc). So a top level flag INC_VECTOR, can override the vector selection
> for a given platform if required.

Isn't it a runtime driver option needed to disable vector virtio?

> Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
> followed the existing flags)
> $ grep "INC_VECTOR" config/common_base
> CONFIG_RTE_IXGBE_INC_VECTOR=y
> CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
> CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y

If the flag is really needed I would suggest VIRTIO_VECTOR.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 3/4] virtio: move SSE based Rx implementation to separate file
  2016-06-27 11:54 ` [PATCH 3/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-06-28  6:17   ` Jianbo Liu
  2016-06-29 11:27     ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Jianbo Liu @ 2016-06-28  6:17 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, Thomas Monjalon, Bruce Richardson, huawei.xie, yuanhan.liu

On 27 June 2016 at 19:54, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> split out SSE instruction based virtio simple rx
> implementation to a separate file
>
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  drivers/net/virtio/virtio_rxtx_simple.c     | 166 +-------------------
>  drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
>  2 files changed, 226 insertions(+), 165 deletions(-)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
>
I think it's better to move sse implementation to a C file,
as Bruce pointed out at
http://www.dpdk.org/ml/archives/dev/2016-April/037937.html

Jianbo

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-27 14:59       ` Thomas Monjalon
@ 2016-06-29 11:18         ` Jerin Jacob
  2016-06-29 11:25           ` Thomas Monjalon
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-29 11:18 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

On Mon, Jun 27, 2016 at 04:59:42PM +0200, Thomas Monjalon wrote:
> 2016-06-27 20:18, Jerin Jacob:
> > On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> > > 2016-06-27 17:24, Jerin Jacob:
> > > > --- a/config/common_base
> > > > +++ b/config/common_base
> > > > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > > > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> > > 
> > > I don't remember what means INC_VECTOR?
> > > Why a config option is needed for vector implementations?
> > 
> > I thought of adding additional configuration option(INC_VECTOR) _apart_ from
> > cpu flag based scheme in the patch because even though if a given platform
> > has cpu instruction support, in some platforms scalar version may
> > perform well wrt vector version(based on instruction latency, emulation required or not
> > etc). So a top level flag INC_VECTOR, can override the vector selection
> > for a given platform if required.
> 
> Isn't it a runtime driver option needed to disable vector virtio?
> 
> > Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
> > followed the existing flags)
> > $ grep "INC_VECTOR" config/common_base
> > CONFIG_RTE_IXGBE_INC_VECTOR=y
> > CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
> > CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
> 
> If the flag is really needed I would suggest VIRTIO_VECTOR.

OK I will change to VIRTIO_VECTOR

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-29 11:18         ` Jerin Jacob
@ 2016-06-29 11:25           ` Thomas Monjalon
  2016-06-29 11:40             ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Thomas Monjalon @ 2016-06-29 11:25 UTC (permalink / raw)
  To: Jerin Jacob; +Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

2016-06-29 16:48, Jerin Jacob:
> On Mon, Jun 27, 2016 at 04:59:42PM +0200, Thomas Monjalon wrote:
> > 2016-06-27 20:18, Jerin Jacob:
> > > On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> > > > 2016-06-27 17:24, Jerin Jacob:
> > > > > --- a/config/common_base
> > > > > +++ b/config/common_base
> > > > > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > > > > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> > > > 
> > > > I don't remember what means INC_VECTOR?
> > > > Why a config option is needed for vector implementations?
> > > 
> > > I thought of adding additional configuration option(INC_VECTOR) _apart_ from
> > > cpu flag based scheme in the patch because even though if a given platform
> > > has cpu instruction support, in some platforms scalar version may
> > > perform well wrt vector version(based on instruction latency, emulation required or not
> > > etc). So a top level flag INC_VECTOR, can override the vector selection
> > > for a given platform if required.
> > 
> > Isn't it a runtime driver option needed to disable vector virtio?
> > 
> > > Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
> > > followed the existing flags)
> > > $ grep "INC_VECTOR" config/common_base
> > > CONFIG_RTE_IXGBE_INC_VECTOR=y
> > > CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
> > > CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
> > 
> > If the flag is really needed I would suggest VIRTIO_VECTOR.
> 
> OK I will change to VIRTIO_VECTOR

I would prefer a runtime option.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 3/4] virtio: move SSE based Rx implementation to separate file
  2016-06-28  6:17   ` Jianbo Liu
@ 2016-06-29 11:27     ` Jerin Jacob
  2016-06-30  5:43       ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-29 11:27 UTC (permalink / raw)
  To: Jianbo Liu
  Cc: dev, Thomas Monjalon, Bruce Richardson, huawei.xie, yuanhan.liu

On Tue, Jun 28, 2016 at 02:17:41PM +0800, Jianbo Liu wrote:
> On 27 June 2016 at 19:54, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> > split out SSE instruction based virtio simple rx
> > implementation to a separate file
> >
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > ---
> >  drivers/net/virtio/virtio_rxtx_simple.c     | 166 +-------------------
> >  drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
> >  2 files changed, 226 insertions(+), 165 deletions(-)
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
> >
> I think it's better to move sse implementation to a C file,
> as Bruce pointed out at
> http://www.dpdk.org/ml/archives/dev/2016-April/037937.html

I can move to C file, That would call for further restructuring of the code
by Introducing a new file drivers/net/virtio/virtio_rxtx_simple.h and
moving all static inline functions of virtio_rxtx_simple.c so that
virtio_rxtx_simple_sse.c and virtio_rxtx_simple_neon.c can include it.

Huawei,Yuanhan,All,

Are you OK with above restructuring?

Jerin

> 
> Jianbo

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-29 11:25           ` Thomas Monjalon
@ 2016-06-29 11:40             ` Jerin Jacob
  2016-06-30  5:44               ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-06-29 11:40 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

On Wed, Jun 29, 2016 at 01:25:35PM +0200, Thomas Monjalon wrote:
> 2016-06-29 16:48, Jerin Jacob:
> > On Mon, Jun 27, 2016 at 04:59:42PM +0200, Thomas Monjalon wrote:
> > > 2016-06-27 20:18, Jerin Jacob:
> > > > On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> > > > > 2016-06-27 17:24, Jerin Jacob:
> > > > > > --- a/config/common_base
> > > > > > +++ b/config/common_base
> > > > > > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > > > > > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> > > > > 
> > > > > I don't remember what means INC_VECTOR?
> > > > > Why a config option is needed for vector implementations?
> > > > 
> > > > I thought of adding additional configuration option(INC_VECTOR) _apart_ from
> > > > cpu flag based scheme in the patch because even though if a given platform
> > > > has cpu instruction support, in some platforms scalar version may
> > > > perform well wrt vector version(based on instruction latency, emulation required or not
> > > > etc). So a top level flag INC_VECTOR, can override the vector selection
> > > > for a given platform if required.
> > > 
> > > Isn't it a runtime driver option needed to disable vector virtio?
> > > 
> > > > Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
> > > > followed the existing flags)
> > > > $ grep "INC_VECTOR" config/common_base
> > > > CONFIG_RTE_IXGBE_INC_VECTOR=y
> > > > CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
> > > > CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
> > > 
> > > If the flag is really needed I would suggest VIRTIO_VECTOR.
> > 
> > OK I will change to VIRTIO_VECTOR
> 
> I would prefer a runtime option.

OK

The platform I test their was NO need for additional VIRTIO_VECTOR
configuration as NEON versions outperforms than scalar version.

I thought of adding this option to override for any platform if it need
to accommodate such platform differences NEON vs scalar versions.

I will change completely to run-time detection based on cpuflags for IA and ARM.
Any objections?

Jerin

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 3/4] virtio: move SSE based Rx implementation to separate file
  2016-06-29 11:27     ` Jerin Jacob
@ 2016-06-30  5:43       ` Yuanhan Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-06-30  5:43 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Jianbo Liu, dev, Thomas Monjalon, Bruce Richardson, huawei.xie

On Wed, Jun 29, 2016 at 04:57:46PM +0530, Jerin Jacob wrote:
> On Tue, Jun 28, 2016 at 02:17:41PM +0800, Jianbo Liu wrote:
> > On 27 June 2016 at 19:54, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> > > split out SSE instruction based virtio simple rx
> > > implementation to a separate file
> > >
> > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > ---
> > >  drivers/net/virtio/virtio_rxtx_simple.c     | 166 +-------------------
> > >  drivers/net/virtio/virtio_rxtx_simple_sse.h | 225 ++++++++++++++++++++++++++++
> > >  2 files changed, 226 insertions(+), 165 deletions(-)
> > >  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.h
> > >
> > I think it's better to move sse implementation to a C file,
> > as Bruce pointed out at
> > http://www.dpdk.org/ml/archives/dev/2016-April/037937.html
> 
> I can move to C file, That would call for further restructuring of the code
> by Introducing a new file drivers/net/virtio/virtio_rxtx_simple.h and
> moving all static inline functions of virtio_rxtx_simple.c so that
> virtio_rxtx_simple_sse.c and virtio_rxtx_simple_neon.c can include it.
> 
> Huawei,Yuanhan,All,
> 
> Are you OK with above restructuring?

Yes, I think that's better.

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR
  2016-06-29 11:40             ` Jerin Jacob
@ 2016-06-30  5:44               ` Yuanhan Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-06-30  5:44 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Thomas Monjalon, dev, bruce.richardson, jianbo.liu, huawei.xie

On Wed, Jun 29, 2016 at 05:10:31PM +0530, Jerin Jacob wrote:
> On Wed, Jun 29, 2016 at 01:25:35PM +0200, Thomas Monjalon wrote:
> > 2016-06-29 16:48, Jerin Jacob:
> > > On Mon, Jun 27, 2016 at 04:59:42PM +0200, Thomas Monjalon wrote:
> > > > 2016-06-27 20:18, Jerin Jacob:
> > > > > On Mon, Jun 27, 2016 at 04:19:57PM +0200, Thomas Monjalon wrote:
> > > > > > 2016-06-27 17:24, Jerin Jacob:
> > > > > > > --- a/config/common_base
> > > > > > > +++ b/config/common_base
> > > > > > > @@ -267,6 +267,7 @@ CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_RX=n
> > > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_TX=n
> > > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DRIVER=n
> > > > > > >  CONFIG_RTE_LIBRTE_VIRTIO_DEBUG_DUMP=n
> > > > > > > +CONFIG_RTE_LIBRTE_VIRTIO_INC_VECTOR=y
> > > > > > 
> > > > > > I don't remember what means INC_VECTOR?
> > > > > > Why a config option is needed for vector implementations?
> > > > > 
> > > > > I thought of adding additional configuration option(INC_VECTOR) _apart_ from
> > > > > cpu flag based scheme in the patch because even though if a given platform
> > > > > has cpu instruction support, in some platforms scalar version may
> > > > > perform well wrt vector version(based on instruction latency, emulation required or not
> > > > > etc). So a top level flag INC_VECTOR, can override the vector selection
> > > > > for a given platform if required.
> > > > 
> > > > Isn't it a runtime driver option needed to disable vector virtio?
> > > > 
> > > > > Regarding INC_VECTOR(INC in vector configuration name, I have no idea, I
> > > > > followed the existing flags)
> > > > > $ grep "INC_VECTOR" config/common_base
> > > > > CONFIG_RTE_IXGBE_INC_VECTOR=y
> > > > > CONFIG_RTE_LIBRTE_I40E_INC_VECTOR=y
> > > > > CONFIG_RTE_LIBRTE_FM10K_INC_VECTOR=y
> > > > 
> > > > If the flag is really needed I would suggest VIRTIO_VECTOR.
> > > 
> > > OK I will change to VIRTIO_VECTOR
> > 
> > I would prefer a runtime option.

+1

> 
> OK
> 
> The platform I test their was NO need for additional VIRTIO_VECTOR
> configuration as NEON versions outperforms than scalar version.
> 
> I thought of adding this option to override for any platform if it need
> to accommodate such platform differences NEON vs scalar versions.
> 
> I will change completely to run-time detection based on cpuflags for IA and ARM.
> Any objections?

Nope from me.

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* From: Jerin Jacob <jerin.jacob@caviumnetworks.com>
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
                   ` (3 preceding siblings ...)
  2016-06-27 11:54 ` [PATCH 4/4] virtio: add neon support Jerin Jacob
@ 2016-07-01 11:16 ` Jerin Jacob
  2016-07-01 11:16   ` [PATCH v2 1/3] virtio: conditional compilation cleanup Jerin Jacob
                     ` (3 more replies)
  2016-07-01 11:19 ` [PATCH v2 0/3] " Jerin Jacob
  5 siblings, 4 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-01 11:16 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie, yuanhan.liu

This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master at aaaf0c005

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (3):
  virtio: conditional compilation cleanup
  virtio: move SSE based Rx implementation to separate file
  virtio: add neon support

 MAINTAINERS                                  |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/vision/Wakefully                  |   7 +-
 drivers/net/virtio/virtio_pci.h              |   1 +
 drivers/net/virtio/virtio_rxtx.c             |  62 +++---
 drivers/net/virtio/virtio_rxtx.h             |   3 +-
 drivers/net/virtio/virtio_rxtx_simple.c      | 269 ++-------------------------
 drivers/net/virtio/virtio_rxtx_simple.h      | 133 +++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.c  | 222 ++++++++++++++++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   1 +
 11 files changed, 646 insertions(+), 290 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

-- 
2.5.5

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
@ 2016-07-01 11:16   ` Jerin Jacob
  2016-07-04  7:36     ` Yuanhan Liu
  2016-07-01 11:16   ` [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file Jerin Jacob
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-01 11:16 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/Makefile             |  3 ---
 drivers/net/virtio/virtio_pci.h         |  1 +
 drivers/net/virtio/virtio_rxtx.c        | 28 +++++++++-------------------
 drivers/net/virtio/virtio_rxtx.h        |  3 +--
 drivers/net/virtio/virtio_rxtx_simple.c |  8 ++++++--
 drivers/net/virtio/virtio_user_ethdev.c |  1 +
 6 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 3020b68..b9b0d8d 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif
 
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
 	uint8_t	    use_msix;
 	uint8_t     started;
 	uint8_t     modern;
+	uint8_t     use_simple_rxtx;
 	uint8_t     mac_addr[ETHER_ADDR_LEN];
 	uint32_t    notify_off_multiplier;
 	uint8_t     *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..63b53f7 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
 #define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
 	ETH_TXQ_FLAGS_NOOFFLOADS)
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
 static void
 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
 {
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 	 */
 	uint16_t i;
 	uint16_t desc_idx;
+	struct virtio_hw *hw = dev->data->dev_private;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		nbufs = 0;
 		error = ENOSPC;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			for (desc_idx = 0; desc_idx < vq->vq_nentries;
 			     desc_idx++) {
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 					VRING_DESC_F_WRITE;
 			}
 		}
-#endif
+
 		memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
 		for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
 		     desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			/******************************************
 			*         Enqueue allocated buffers        *
 			*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-			if (use_simple_rxtx)
+			if (hw->use_simple_rxtx)
 				error = virtqueue_enqueue_recv_refill_simple(vq, m);
 			else
-#endif
 				error = virtqueue_enqueue_recv_refill(vq, m);
+
 			if (error) {
 				rte_pktmbuf_free(m);
 				break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		struct virtqueue *vq = txvq->vq;
 
 		virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			uint16_t mid_idx  = vq->vq_nentries >> 1;
 
 			for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			     desc_idx++)
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
 		}
-#endif
+
 		VIRTQUEUE_DUMP(vq);
 	}
 }
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
 
 	dev->data->rx_queues[queue_idx] = rxvq;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 	virtio_rxq_vec_setup(rxvq);
-#endif
 
 	return 0;
 }
@@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 {
 	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-	struct virtio_hw *hw = dev->data->dev_private;
-#endif
 	struct virtnet_tx *txvq;
 	struct virtqueue *vq;
 	uint16_t tx_free_thresh;
@@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 	}
 
 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
+	struct virtio_hw *hw = dev->data->dev_private;
 	/* Use simple rx/tx func if single segment and no offloads */
 	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
 	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
 		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
 		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
 		dev->rx_pkt_burst = virtio_recv_pkts_vec;
-		use_simple_rxtx = 1;
+		hw->use_simple_rxtx = 1;
 	}
 #endif
 
diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
 	const struct rte_memzone *mz;   /**< mem zone to populate RX ring. */
 };
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);
 
 int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	struct rte_mbuf *m);
-#endif
+
 #endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
 #include <string.h>
 #include <errno.h>
 
-#include <tmmintrin.h>
-
 #include <rte_cycles.h>
 #include <rte_memory.h>
 #include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
 	vq_update_avail_idx(vq);
 }
 
+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
 /* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
  *
  * This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return nb_pkts_received;
 }
 
+#endif
+
 #define VIRTIO_TX_FREE_THRESH 32
 #define VIRTIO_TX_MAX_FREE_BUF_SZ 32
 #define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 5ab2471..bef8130 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
 	hw->vtpci_ops = &virtio_user_ops;
 	hw->use_msix = 0;
 	hw->modern   = 0;
+	hw->use_simple_rxtx = 0;
 	hw->virtio_user_dev = dev;
 	data->dev_private = hw;
 	data->numa_node = SOCKET_ID_ANY;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
  2016-07-01 11:16   ` [PATCH v2 1/3] virtio: conditional compilation cleanup Jerin Jacob
@ 2016-07-01 11:16   ` Jerin Jacob
  2016-07-04  7:42     ` Yuanhan Liu
  2016-07-01 11:16   ` [PATCH v2 3/3] virtio: add neon support Jerin Jacob
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
  3 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-01 11:16 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

* Introduced cpuflag based run-time detection to
select the SSE based simple Rx handler
* Split out SSE instruction based virtio simple Rx
implementation to a separate file

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/Makefile                 |   4 +
 drivers/net/virtio/virtio_rxtx.c            |  35 ++--
 drivers/net/virtio/virtio_rxtx_simple.c     | 273 ++--------------------------
 drivers/net/virtio/virtio_rxtx_simple.h     | 133 ++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
 5 files changed, 394 insertions(+), 273 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index b9b0d8d..c4103b7 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -52,6 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
 
+ifeq ($(CONFIG_RTE_ARCH_X86),y)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+endif
+
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index 63b53f7..a4d4a57 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -50,6 +50,7 @@
 #include <rte_string_fns.h>
 #include <rte_errno.h>
 #include <rte_byteorder.h>
+#include <rte_cpuflags.h>
 
 #include "virtio_logs.h"
 #include "virtio_ethdev.h"
@@ -470,6 +471,28 @@ virtio_dev_rx_queue_release(void *rxq)
 	rte_memzone_free(mz);
 }
 
+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+			   const struct rte_eth_txconf *tx_conf)
+{
+	uint8_t use_simple_rxtx = 0;
+	struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+		use_simple_rxtx = 1;
+#endif
+	/* Use simple rx/tx func if single segment and no offloads */
+	if (use_simple_rxtx &&
+	   (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+		!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
+		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
+		dev->rx_pkt_burst = virtio_recv_pkts_vec;
+		hw->use_simple_rxtx = use_simple_rxtx;
+	}
+}
+
 /*
  * struct rte_eth_dev *dev: Used to update dev
  * uint16_t nb_desc: Defaults to values read from config space
@@ -499,17 +522,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-	struct virtio_hw *hw = dev->data->dev_private;
-	/* Use simple rx/tx func if single segment and no offloads */
-	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
-	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
-		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
-		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
-		dev->rx_pkt_burst = virtio_recv_pkts_vec;
-		hw->use_simple_rxtx = 1;
-	}
-#endif
+	virtio_update_rxtx_handler(dev, tx_conf);
 
 	ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
 			nb_desc, socket_id, (void **)&txvq);
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..485ddce 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -51,14 +51,7 @@
 #include <rte_errno.h>
 #include <rte_byteorder.h>
 
-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtqueue.h"
-#include "virtio_rxtx.h"
-
-#define RTE_VIRTIO_VPMD_RX_BURST 32
-#define RTE_VIRTIO_DESC_PER_LOOP 8
-#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+#include "virtio_rxtx_simple.h"
 
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
@@ -89,260 +82,6 @@ virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	return 0;
 }
 
-static inline void
-virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
-{
-	int i;
-	uint16_t desc_idx;
-	struct rte_mbuf **sw_ring;
-	struct vring_desc *start_dp;
-	int ret;
-	struct virtqueue *vq = rxvq->vq;
-
-	desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
-	sw_ring = &vq->sw_ring[desc_idx];
-	start_dp = &vq->vq_ring.desc[desc_idx];
-
-	ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
-		RTE_VIRTIO_VPMD_RX_REARM_THRESH);
-	if (unlikely(ret)) {
-		rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
-			RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-		return;
-	}
-
-	for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
-		uintptr_t p;
-
-		p = (uintptr_t)&sw_ring[i]->rearm_data;
-		*(uint64_t *)p = rxvq->mbuf_initializer;
-
-		start_dp[i].addr =
-			MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
-			vq->hw->vtnet_hdr_size;
-		start_dp[i].len = sw_ring[i]->buf_len -
-			RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
-	}
-
-	vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-	vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-	vq_update_avail_idx(vq);
-}
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-	uint16_t nb_pkts)
-{
-	struct virtnet_rx *rxvq = rx_queue;
-	struct virtqueue *vq = rxvq->vq;
-	uint16_t nb_used;
-	uint16_t desc_idx;
-	struct vring_used_elem *rused;
-	struct rte_mbuf **sw_ring;
-	struct rte_mbuf **sw_ring_end;
-	uint16_t nb_pkts_received;
-	__m128i shuf_msk1, shuf_msk2, len_adjust;
-
-	shuf_msk1 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		5, 4,			/* dat len */
-		0xFF, 0xFF, 5, 4,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-
-	);
-
-	shuf_msk2 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		13, 12,			/* dat len */
-		0xFF, 0xFF, 13, 12,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-	);
-
-	/* Subtract the header length.
-	*  In which case do we need the header length in used->len ?
-	*/
-	len_adjust = _mm_set_epi16(
-		0, 0,
-		0,
-		(uint16_t)-vq->hw->vtnet_hdr_size,
-		0, (uint16_t)-vq->hw->vtnet_hdr_size,
-		0, 0);
-
-	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
-		return 0;
-
-	nb_used = VIRTQUEUE_NUSED(vq);
-
-	rte_compiler_barrier();
-
-	if (unlikely(nb_used == 0))
-		return 0;
-
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
-	nb_used = RTE_MIN(nb_used, nb_pkts);
-
-	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
-	rused = &vq->vq_ring.used->ring[desc_idx];
-	sw_ring  = &vq->sw_ring[desc_idx];
-	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
-	_mm_prefetch((const void *)rused, _MM_HINT_T0);
-
-	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
-		virtio_rxq_rearm_vec(rxvq);
-		if (unlikely(virtqueue_kick_prepare(vq)))
-			virtqueue_notify(vq);
-	}
-
-	for (nb_pkts_received = 0;
-		nb_pkts_received < nb_used;) {
-		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
-		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
-		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
-		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
-		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
-		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
-		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
-		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
-		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
-		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
-		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
-		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
-		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
-		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
-		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
-		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
-		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
-			pkt_mb[1]);
-		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
-			pkt_mb[0]);
-
-		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
-		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
-		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
-		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
-			pkt_mb[3]);
-		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
-			pkt_mb[2]);
-
-		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
-		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
-		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
-		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
-			pkt_mb[5]);
-		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
-			pkt_mb[4]);
-
-		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
-		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
-		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
-		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
-			pkt_mb[7]);
-		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
-			pkt_mb[6]);
-
-		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
-			if (sw_ring + nb_used <= sw_ring_end)
-				nb_pkts_received += nb_used;
-			else
-				nb_pkts_received += sw_ring_end - sw_ring;
-			break;
-		} else {
-			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
-				sw_ring_end)) {
-				nb_pkts_received += sw_ring_end - sw_ring;
-				break;
-			} else {
-				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
-				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
-				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
-				rused   += RTE_VIRTIO_DESC_PER_LOOP;
-				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
-			}
-		}
-	}
-
-	vq->vq_used_cons_idx += nb_pkts_received;
-	vq->vq_free_cnt += nb_pkts_received;
-	rxvq->stats.packets += nb_pkts_received;
-	return nb_pkts_received;
-}
-
-#endif
-
-#define VIRTIO_TX_FREE_THRESH 32
-#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
-#define VIRTIO_TX_FREE_NR 32
-/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
-static inline void
-virtio_xmit_cleanup(struct virtqueue *vq)
-{
-	uint16_t i, desc_idx;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
-
-	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
-		   ((vq->vq_nentries >> 1) - 1));
-	m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-	m = __rte_pktmbuf_prefree_seg(m);
-	if (likely(m != NULL)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
-			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-			m = __rte_pktmbuf_prefree_seg(m);
-			if (likely(m != NULL)) {
-				if (likely(m->pool == free[0]->pool))
-					free[nb_free++] = m;
-				else {
-					rte_mempool_put_bulk(free[0]->pool,
-						(void **)free, nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
-			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-			m = __rte_pktmbuf_prefree_seg(m);
-			if (m != NULL)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
-	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
-}
-
 uint16_t
 virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_pkts)
@@ -423,3 +162,13 @@ virtio_rxq_vec_setup(struct virtnet_rx *rxq)
 
 	return 0;
 }
+
+/* Stub for linkage when arch specific implementation is not available */
+uint16_t __attribute__((weak))
+virtio_recv_pkts_vec(void *rx_queue __rte_unused,
+		     struct rte_mbuf **rx_pkts __rte_unused,
+		     uint16_t nb_pkts __rte_unused)
+{
+	rte_panic("Wrong weak function linked by linker\n");
+	return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
new file mode 100644
index 0000000..8cb43c0
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -0,0 +1,133 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_RXTX_SIMPLE_H_
+#define _VIRTIO_RXTX_SIMPLE_H_
+
+#include <stdint.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+static inline void
+virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
+{
+	int i;
+	uint16_t desc_idx;
+	struct rte_mbuf **sw_ring;
+	struct vring_desc *start_dp;
+	int ret;
+	struct virtqueue *vq = rxvq->vq;
+
+	desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
+	sw_ring = &vq->sw_ring[desc_idx];
+	start_dp = &vq->vq_ring.desc[desc_idx];
+
+	ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
+		RTE_VIRTIO_VPMD_RX_REARM_THRESH);
+	if (unlikely(ret)) {
+		rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+		return;
+	}
+
+	for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
+		uintptr_t p;
+
+		p = (uintptr_t)&sw_ring[i]->rearm_data;
+		*(uint64_t *)p = rxvq->mbuf_initializer;
+
+		start_dp[i].addr =
+			MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
+			vq->hw->vtnet_hdr_size;
+		start_dp[i].len = sw_ring[i]->buf_len -
+			RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
+	}
+
+	vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+	vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+	vq_update_avail_idx(vq);
+}
+
+#define VIRTIO_TX_FREE_THRESH 32
+#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
+#define VIRTIO_TX_FREE_NR 32
+/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
+static inline void
+virtio_xmit_cleanup(struct virtqueue *vq)
+{
+	uint16_t i, desc_idx;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+		   ((vq->vq_nentries >> 1) - 1));
+	m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+	m = __rte_pktmbuf_prefree_seg(m);
+	if (likely(m != NULL)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+			m = __rte_pktmbuf_prefree_seg(m);
+			if (likely(m != NULL)) {
+				if (likely(m->pool == free[0]->pool))
+					free[nb_free++] = m;
+				else {
+					rte_mempool_put_bulk(free[0]->pool,
+						(void **)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+			m = __rte_pktmbuf_prefree_seg(m);
+			if (m != NULL)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+}
+
+#endif /* _VIRTIO_RXTX_SIMPLE_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
new file mode 100644
index 0000000..39000e8
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -0,0 +1,222 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+	__m128i shuf_msk1, shuf_msk2, len_adjust;
+
+	shuf_msk1 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		5, 4,			/* dat len */
+		0xFF, 0xFF, 5, 4,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+
+	);
+
+	shuf_msk2 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		13, 12,			/* dat len */
+		0xFF, 0xFF, 13, 12,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+	);
+
+	/* Subtract the header length.
+	*  In which case do we need the header length in used->len ?
+	*/
+	len_adjust = _mm_set_epi16(
+		0, 0,
+		0,
+		(uint16_t)-vq->hw->vtnet_hdr_size,
+		0, (uint16_t)-vq->hw->vtnet_hdr_size,
+		0, 0);
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_compiler_barrier();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	_mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v2 3/3] virtio: add neon support
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
  2016-07-01 11:16   ` [PATCH v2 1/3] virtio: conditional compilation cleanup Jerin Jacob
  2016-07-01 11:16   ` [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-07-01 11:16   ` Jerin Jacob
  2016-07-04  7:53     ` Yuanhan Liu
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
  3 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-01 11:16 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 MAINTAINERS                                  |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/virtio/Makefile                  |   2 +
 drivers/net/virtio/virtio_rxtx.c             |   3 +
 drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
 5 files changed, 243 insertions(+)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 9e2a817..3a5add5 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -174,6 +174,8 @@ New Features
   section of the "Network Interface Controller Drivers" document.
 
 
+* **Virtio NEON support for ARM.**
+
 Resolved Issues
 ---------------
 
diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index c4103b7..97972a6 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -54,6 +54,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
 
 ifeq ($(CONFIG_RTE_ARCH_X86),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
 endif
 
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a4d4a57..19d1742 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -481,6 +481,9 @@ virtio_update_rxtx_handler(struct rte_eth_dev *dev,
 #if defined RTE_ARCH_X86
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
 		use_simple_rxtx = 1;
+#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+		use_simple_rxtx = 1;
 #endif
 	/* Use simple rx/tx func if single segment and no offloads */
 	if (use_simple_rxtx &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
new file mode 100644
index 0000000..793eefb
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -0,0 +1,235 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		4, 5, 0xFF, 0xFF,       /* pkt len */
+		4, 5,                   /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		12, 13, 0xFF, 0xFF,     /* pkt len */
+		12, 13,                 /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length.
+	 *  In which case do we need the header length in used->len ?
+	 */
+	uint16x8_t len_adjust = {
+		0, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size,
+		0,
+		0, 0
+	};
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_rmb();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	rte_prefetch_non_temporal(rused);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v2 0/3] Virtio NEON support for ARM
  2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
                   ` (4 preceding siblings ...)
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
@ 2016-07-01 11:19 ` Jerin Jacob
  5 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-01 11:19 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (3):
  virtio: conditional compilation cleanup
  virtio: move SSE based Rx implementation to separate file
  virtio: add neon support

 MAINTAINERS                                  |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/vision/Wakefully                  |   7 +-
 drivers/net/virtio/virtio_pci.h              |   1 +
 drivers/net/virtio/virtio_rxtx.c             |  62 +++---
 drivers/net/virtio/virtio_rxtx.h             |   3 +-
 drivers/net/virtio/virtio_rxtx_simple.c      | 269 ++-------------------------
 drivers/net/virtio/virtio_rxtx_simple.h      | 133 +++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.c  | 222 ++++++++++++++++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   1 +
 11 files changed, 646 insertions(+), 290 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

-- 
2.5.5

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-01 11:16   ` [PATCH v2 1/3] virtio: conditional compilation cleanup Jerin Jacob
@ 2016-07-04  7:36     ` Yuanhan Liu
  2016-07-04  8:36       ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04  7:36 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
>  {
>  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
>  
> -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> -	struct virtio_hw *hw = dev->data->dev_private;
> -#endif
>  	struct virtnet_tx *txvq;
>  	struct virtqueue *vq;
>  	uint16_t tx_free_thresh;
> @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
>  	}
>  
>  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> +	struct virtio_hw *hw = dev->data->dev_private;

I'd suggest to move above declaration to ...

>  	/* Use simple rx/tx func if single segment and no offloads */
>  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
>  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {

here: we should try to avoid declaring vars in the middle of a code block.

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file
  2016-07-01 11:16   ` [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-07-04  7:42     ` Yuanhan Liu
  2016-07-04  8:38       ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04  7:42 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Fri, Jul 01, 2016 at 04:46:37PM +0530, Jerin Jacob wrote:
> * Introduced cpuflag based run-time detection to
> select the SSE based simple Rx handler
> * Split out SSE instruction based virtio simple Rx
> implementation to a separate file

As your commit log says, it does two things, therefore, I'd suggest you
to do it in two patches, with each just does one thing as you mentioned.

> +static void
> +virtio_update_rxtx_handler(struct rte_eth_dev *dev,
> +			   const struct rte_eth_txconf *tx_conf)
> +{
> +	uint8_t use_simple_rxtx = 0;
> +	struct virtio_hw *hw = dev->data->dev_private;
> +
> +#if defined RTE_ARCH_X86
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
> +		use_simple_rxtx = 1;
> +#endif
> +	/* Use simple rx/tx func if single segment and no offloads */
> +	if (use_simple_rxtx &&
> +	   (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> +		!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {

The alignment here is not consistent, something like following is what
I'd suggest:

	if (use_simple_rxtx &&
	    (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
	    !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 3/3] virtio: add neon support
  2016-07-01 11:16   ` [PATCH v2 3/3] virtio: add neon support Jerin Jacob
@ 2016-07-04  7:53     ` Yuanhan Liu
  2016-07-04  8:55       ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04  7:53 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Fri, Jul 01, 2016 at 04:46:38PM +0530, Jerin Jacob wrote:
> Added neon based Rx vector implementation.
> Selection of the new handler based neon availability at runtime.
> Updated the release notes and MAINTAINERS file.
> 
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  MAINTAINERS                                  |   1 +
>  doc/guides/rel_notes/release_16_07.rst       |   2 +
>  drivers/net/virtio/Makefile                  |   2 +
>  drivers/net/virtio/virtio_rxtx.c             |   3 +
>  drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
>  5 files changed, 243 insertions(+)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a59191e..ab04cee 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
>  F: lib/librte_lpm/rte_lpm_neon.h
>  F: lib/librte_hash/rte*_arm64.h
>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> +F: drivers/net/virtio/virtio_rxtx_simple_neon.c
>  
>  EZchip TILE-Gx
>  M: Zhigang Lu <zlu@ezchip.com>
> diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
> index 9e2a817..3a5add5 100644
> --- a/doc/guides/rel_notes/release_16_07.rst
> +++ b/doc/guides/rel_notes/release_16_07.rst

This series basically looks good to me, but I don't think we can make it
for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.

	--yliu

> @@ -174,6 +174,8 @@ New Features
>    section of the "Network Interface Controller Drivers" document.
>  
>  
> +* **Virtio NEON support for ARM.**
> +
>  Resolved Issues
>  ---------------
>  

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04  7:36     ` Yuanhan Liu
@ 2016-07-04  8:36       ` Jerin Jacob
  2016-07-04  8:42         ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04  8:36 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> >  {
> >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> >  
> > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > -	struct virtio_hw *hw = dev->data->dev_private;
> > -#endif
> >  	struct virtnet_tx *txvq;
> >  	struct virtqueue *vq;
> >  	uint16_t tx_free_thresh;
> > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> >  	}
> >  
> >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > +	struct virtio_hw *hw = dev->data->dev_private;
> 
> I'd suggest to move above declaration to ...
> 
> >  	/* Use simple rx/tx func if single segment and no offloads */
> >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> 
> here: we should try to avoid declaring vars in the middle of a code block.

Next patch in this series, moving all rxtx handler selection code to
separate function(virtio_update_rxtx_handler()) where declaration comes
as first line in the function.i.e the comment is taken care of in the
series.

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file
  2016-07-04  7:42     ` Yuanhan Liu
@ 2016-07-04  8:38       ` Jerin Jacob
  0 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04  8:38 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 03:42:47PM +0800, Yuanhan Liu wrote:
> On Fri, Jul 01, 2016 at 04:46:37PM +0530, Jerin Jacob wrote:
> > * Introduced cpuflag based run-time detection to
> > select the SSE based simple Rx handler
> > * Split out SSE instruction based virtio simple Rx
> > implementation to a separate file
> 
> As your commit log says, it does two things, therefore, I'd suggest you
> to do it in two patches, with each just does one thing as you mentioned.

OK. Will fix it in next revision.

> 
> > +static void
> > +virtio_update_rxtx_handler(struct rte_eth_dev *dev,
> > +			   const struct rte_eth_txconf *tx_conf)
> > +{
> > +	uint8_t use_simple_rxtx = 0;
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +#if defined RTE_ARCH_X86
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
> > +		use_simple_rxtx = 1;
> > +#endif
> > +	/* Use simple rx/tx func if single segment and no offloads */
> > +	if (use_simple_rxtx &&
> > +	   (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > +		!vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> 
> The alignment here is not consistent, something like following is what
> I'd suggest:
> 
> 	if (use_simple_rxtx &&
> 	    (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> 	    !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {

OK. Will fix it in next revision.

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04  8:36       ` Jerin Jacob
@ 2016-07-04  8:42         ` Yuanhan Liu
  2016-07-04  9:07           ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04  8:42 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > >  {
> > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > >  
> > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > -#endif
> > >  	struct virtnet_tx *txvq;
> > >  	struct virtqueue *vq;
> > >  	uint16_t tx_free_thresh;
> > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > >  	}
> > >  
> > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > +	struct virtio_hw *hw = dev->data->dev_private;
> > 
> > I'd suggest to move above declaration to ...
> > 
> > >  	/* Use simple rx/tx func if single segment and no offloads */
> > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > 
> > here: we should try to avoid declaring vars in the middle of a code block.
> 
> Next patch in this series, moving all rxtx handler selection code to
> separate function(virtio_update_rxtx_handler()) where declaration comes
> as first line in the function.i.e the comment is taken care of in the
> series.

Yes, I saw that. But in principle, each patch is atomic: it's not a
good idea/practice to introduce issues in path A and then fix it in
path B.

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 3/3] virtio: add neon support
  2016-07-04  7:53     ` Yuanhan Liu
@ 2016-07-04  8:55       ` Jerin Jacob
  2016-07-04  9:02         ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04  8:55 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 03:53:22PM +0800, Yuanhan Liu wrote:
> On Fri, Jul 01, 2016 at 04:46:38PM +0530, Jerin Jacob wrote:
> > Added neon based Rx vector implementation.
> > Selection of the new handler based neon availability at runtime.
> > Updated the release notes and MAINTAINERS file.
> > 
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > ---
> >  MAINTAINERS                                  |   1 +
> >  doc/guides/rel_notes/release_16_07.rst       |   2 +
> >  drivers/net/virtio/Makefile                  |   2 +
> >  drivers/net/virtio/virtio_rxtx.c             |   3 +
> >  drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
> >  5 files changed, 243 insertions(+)
> >  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
> > 
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index a59191e..ab04cee 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
> >  F: lib/librte_lpm/rte_lpm_neon.h
> >  F: lib/librte_hash/rte*_arm64.h
> >  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> > +F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> >  
> >  EZchip TILE-Gx
> >  M: Zhigang Lu <zlu@ezchip.com>
> > diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
> > index 9e2a817..3a5add5 100644
> > --- a/doc/guides/rel_notes/release_16_07.rst
> > +++ b/doc/guides/rel_notes/release_16_07.rst
> 
> This series basically looks good to me, but I don't think we can make it
> for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.

OK. But I thought, Thomas hasn't pulled the changes from dpdk-next-virtio.

Even if didn't make it for v16.07, I would suggest you to consider taking
the changes to dpdk-next-virtio as this change involves file restructuring
(Will have issue with re-basing in future) without having any functional impact.


> 
> 	--yliu
> 
> > @@ -174,6 +174,8 @@ New Features
> >    section of the "Network Interface Controller Drivers" document.
> >  
> >  
> > +* **Virtio NEON support for ARM.**
> > +
> >  Resolved Issues
> >  ---------------
> >  

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 3/3] virtio: add neon support
  2016-07-04  8:55       ` Jerin Jacob
@ 2016-07-04  9:02         ` Yuanhan Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04  9:02 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 02:25:55PM +0530, Jerin Jacob wrote:
> On Mon, Jul 04, 2016 at 03:53:22PM +0800, Yuanhan Liu wrote:
> > On Fri, Jul 01, 2016 at 04:46:38PM +0530, Jerin Jacob wrote:
> > > Added neon based Rx vector implementation.
> > > Selection of the new handler based neon availability at runtime.
> > > Updated the release notes and MAINTAINERS file.
> > > 
> > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > > ---
> > >  MAINTAINERS                                  |   1 +
> > >  doc/guides/rel_notes/release_16_07.rst       |   2 +
> > >  drivers/net/virtio/Makefile                  |   2 +
> > >  drivers/net/virtio/virtio_rxtx.c             |   3 +
> > >  drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
> > >  5 files changed, 243 insertions(+)
> > >  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
> > > 
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index a59191e..ab04cee 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
> > >  F: lib/librte_lpm/rte_lpm_neon.h
> > >  F: lib/librte_hash/rte*_arm64.h
> > >  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
> > > +F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> > >  
> > >  EZchip TILE-Gx
> > >  M: Zhigang Lu <zlu@ezchip.com>
> > > diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
> > > index 9e2a817..3a5add5 100644
> > > --- a/doc/guides/rel_notes/release_16_07.rst
> > > +++ b/doc/guides/rel_notes/release_16_07.rst
> > 
> > This series basically looks good to me, but I don't think we can make it
> > for v16.07: you missed v1 deadline; it's also too late: rc1 was already out.
> 
> OK. But I thought, Thomas hasn't pulled the changes from dpdk-next-virtio.
> 
> Even if didn't make it for v16.07, I would suggest you to consider taking
> the changes to dpdk-next-virtio as this change involves file restructuring
> (Will have issue with re-basing in future) without having any functional impact.

Yes, that's my plan. I will do the merge ASAP, when

- v16.07 is out.

- your patches are ready.


	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04  8:42         ` Yuanhan Liu
@ 2016-07-04  9:07           ` Jerin Jacob
  2016-07-04 11:02             ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04  9:07 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 04:42:32PM +0800, Yuanhan Liu wrote:
> On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > >  {
> > > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > > >  
> > > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > > -#endif
> > > >  	struct virtnet_tx *txvq;
> > > >  	struct virtqueue *vq;
> > > >  	uint16_t tx_free_thresh;
> > > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > >  	}
> > > >  
> > > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > 
> > > I'd suggest to move above declaration to ...
> > > 
> > > >  	/* Use simple rx/tx func if single segment and no offloads */
> > > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > > 
> > > here: we should try to avoid declaring vars in the middle of a code block.
> > 
> > Next patch in this series, moving all rxtx handler selection code to
> > separate function(virtio_update_rxtx_handler()) where declaration comes
> > as first line in the function.i.e the comment is taken care of in the
> > series.
> 
> Yes, I saw that. But in principle, each patch is atomic: it's not a
> good idea/practice to introduce issues in path A and then fix it in
> path B.

In my view it was not an issue as I was removing all possible
conditional compilation flag. If I were to move the declaration to top
then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
flag I need to add around declaring the variable.

Hope this justifies the reason. If you are not convinced then let me know,
if will add the change in next revision.

Jerin

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04  9:07           ` Jerin Jacob
@ 2016-07-04 11:02             ` Yuanhan Liu
  2016-07-04 12:15               ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04 11:02 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 02:37:55PM +0530, Jerin Jacob wrote:
> On Mon, Jul 04, 2016 at 04:42:32PM +0800, Yuanhan Liu wrote:
> > On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> > > On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > > > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > >  {
> > > > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > > > >  
> > > > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > > > -#endif
> > > > >  	struct virtnet_tx *txvq;
> > > > >  	struct virtqueue *vq;
> > > > >  	uint16_t tx_free_thresh;
> > > > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > >  	}
> > > > >  
> > > > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > 
> > > > I'd suggest to move above declaration to ...
> > > > 
> > > > >  	/* Use simple rx/tx func if single segment and no offloads */
> > > > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > > > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > > > 
> > > > here: we should try to avoid declaring vars in the middle of a code block.
> > > 
> > > Next patch in this series, moving all rxtx handler selection code to
> > > separate function(virtio_update_rxtx_handler()) where declaration comes
> > > as first line in the function.i.e the comment is taken care of in the
> > > series.
> > 
> > Yes, I saw that. But in principle, each patch is atomic: it's not a
> > good idea/practice to introduce issues in path A and then fix it in
> > path B.
> 
> In my view it was not an issue as I was removing all possible
> conditional compilation flag. If I were to move the declaration to top
> then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
> flag I need to add around declaring the variable.

Nope, I was suggesting to move it inside the "if" block. So, this
is actually consistent with what you are trying to do. Besides, it
removes an declaration in the middle.

	--yliu

> Hope this justifies the reason. If you are not convinced then let me know,
> if will add the change in next revision.
> 
> Jerin
> 
> > 
> > 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04 11:02             ` Yuanhan Liu
@ 2016-07-04 12:15               ` Jerin Jacob
  2016-07-04 12:26                 ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04 12:15 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 07:02:25PM +0800, Yuanhan Liu wrote:
> On Mon, Jul 04, 2016 at 02:37:55PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 04, 2016 at 04:42:32PM +0800, Yuanhan Liu wrote:
> > > On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> > > > On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > > > > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > > > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > >  {
> > > > > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > > > > >  
> > > > > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > > > > -#endif
> > > > > >  	struct virtnet_tx *txvq;
> > > > > >  	struct virtqueue *vq;
> > > > > >  	uint16_t tx_free_thresh;
> > > > > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > >  	}
> > > > > >  
> > > > > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > > 
> > > > > I'd suggest to move above declaration to ...
> > > > > 
> > > > > >  	/* Use simple rx/tx func if single segment and no offloads */
> > > > > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > > > > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > > > > 
> > > > > here: we should try to avoid declaring vars in the middle of a code block.
> > > > 
> > > > Next patch in this series, moving all rxtx handler selection code to
> > > > separate function(virtio_update_rxtx_handler()) where declaration comes
> > > > as first line in the function.i.e the comment is taken care of in the
> > > > series.
> > > 
> > > Yes, I saw that. But in principle, each patch is atomic: it's not a
> > > good idea/practice to introduce issues in path A and then fix it in
> > > path B.
> > 
> > In my view it was not an issue as I was removing all possible
> > conditional compilation flag. If I were to move the declaration to top
> > then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
> > flag I need to add around declaring the variable.
> 
> Nope, I was suggesting to move it inside the "if" block. So, this
> is actually consistent with what you are trying to do. Besides, it
> removes an declaration in the middle.

Just to get the clarity on "moving inside the 'if' block"

Are you suggesting to have like below?

 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
+       struct virtio_hw *hw;
        /* Use simple rx/tx func if single segment and no offloads */
        if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
             !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
                PMD_INIT_LOG(INFO, "Using simple rx/tx path");
                dev->tx_pkt_burst = virtio_xmit_pkts_simple;
                dev->rx_pkt_burst = virtio_recv_pkts_vec;
-               use_simple_rxtx = 1;
+		hw = dev->data->dev_private;
+               hw->use_simple_rxtx = 1;
        }
 #endif


Instead of following scheme in existing patch,

 #ifdef RTE_MACHINE_CPUFLAG_SSSE3
+       struct virtio_hw *hw = dev->data->dev_private;
        /* Use simple rx/tx func if single segment and no offloads */
        if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
VIRTIO_SIMPLE_FLAGS &&
             !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
                PMD_INIT_LOG(INFO, "Using simple rx/tx path");
                dev->tx_pkt_burst = virtio_xmit_pkts_simple;
                dev->rx_pkt_burst = virtio_recv_pkts_vec;
-               use_simple_rxtx = 1;
+               hw->use_simple_rxtx = 1;
        }
 #endif


The former case will have issue as "hw" been used in "if" with vtpci_with_feature.

OR

if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
then it comes error on non x86 as unused "hw" variable.

If you meant something else then let me know?

> 
> 	--yliu
> 
> > Hope this justifies the reason. If you are not convinced then let me know,
> > if will add the change in next revision.
> > 
> > Jerin
> > 
> > > 
> > > 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04 12:15               ` Jerin Jacob
@ 2016-07-04 12:26                 ` Yuanhan Liu
  2016-07-04 12:50                   ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04 12:26 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 05:45:57PM +0530, Jerin Jacob wrote:
> On Mon, Jul 04, 2016 at 07:02:25PM +0800, Yuanhan Liu wrote:
> > On Mon, Jul 04, 2016 at 02:37:55PM +0530, Jerin Jacob wrote:
> > > On Mon, Jul 04, 2016 at 04:42:32PM +0800, Yuanhan Liu wrote:
> > > > On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> > > > > On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > > > > > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > > > > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > > >  {
> > > > > > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > > > > > >  
> > > > > > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > > > > > -#endif
> > > > > > >  	struct virtnet_tx *txvq;
> > > > > > >  	struct virtqueue *vq;
> > > > > > >  	uint16_t tx_free_thresh;
> > > > > > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > > >  	}
> > > > > > >  
> > > > > > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > > > 
> > > > > > I'd suggest to move above declaration to ...
> > > > > > 
> > > > > > >  	/* Use simple rx/tx func if single segment and no offloads */
> > > > > > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > > > > > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > > > > > 
> > > > > > here: we should try to avoid declaring vars in the middle of a code block.
> > > > > 
> > > > > Next patch in this series, moving all rxtx handler selection code to
> > > > > separate function(virtio_update_rxtx_handler()) where declaration comes
> > > > > as first line in the function.i.e the comment is taken care of in the
> > > > > series.
> > > > 
> > > > Yes, I saw that. But in principle, each patch is atomic: it's not a
> > > > good idea/practice to introduce issues in path A and then fix it in
> > > > path B.
> > > 
> > > In my view it was not an issue as I was removing all possible
> > > conditional compilation flag. If I were to move the declaration to top
> > > then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
> > > flag I need to add around declaring the variable.
> > 
> > Nope, I was suggesting to move it inside the "if" block. So, this
> > is actually consistent with what you are trying to do. Besides, it
> > removes an declaration in the middle.
> 
> Just to get the clarity on "moving inside the 'if' block"
> 
> Are you suggesting to have like below?
> 
>  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> +       struct virtio_hw *hw;
>         /* Use simple rx/tx func if single segment and no offloads */
>         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
> VIRTIO_SIMPLE_FLAGS &&
>              !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
>                 PMD_INIT_LOG(INFO, "Using simple rx/tx path");
>                 dev->tx_pkt_burst = virtio_xmit_pkts_simple;
>                 dev->rx_pkt_burst = virtio_recv_pkts_vec;
> -               use_simple_rxtx = 1;
> +		hw = dev->data->dev_private;
> +               hw->use_simple_rxtx = 1;
>         }
>  #endif
> 
> 
> Instead of following scheme in existing patch,
> 
>  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> +       struct virtio_hw *hw = dev->data->dev_private;
>         /* Use simple rx/tx func if single segment and no offloads */
>         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
> VIRTIO_SIMPLE_FLAGS &&
>              !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
>                 PMD_INIT_LOG(INFO, "Using simple rx/tx path");
>                 dev->tx_pkt_burst = virtio_xmit_pkts_simple;
>                 dev->rx_pkt_burst = virtio_recv_pkts_vec;
> -               use_simple_rxtx = 1;
> +               hw->use_simple_rxtx = 1;
>         }
>  #endif
> 
> 
> The former case will have issue as "hw" been used in "if" with vtpci_with_feature.

Oh, my bad. I overlooked it. Sorry for that!

> OR
> 
> if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
> then it comes error on non x86 as unused "hw" variable.
> 
> If you meant something else then let me know?

I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
us from offending a minor rule, while you can remove the ugly "#ifdef"
block in the next patch.

Works to you?

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04 12:26                 ` Yuanhan Liu
@ 2016-07-04 12:50                   ` Jerin Jacob
  2016-07-04 12:57                     ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-04 12:50 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 08:26:30PM +0800, Yuanhan Liu wrote:
> On Mon, Jul 04, 2016 at 05:45:57PM +0530, Jerin Jacob wrote:
> > On Mon, Jul 04, 2016 at 07:02:25PM +0800, Yuanhan Liu wrote:
> > > On Mon, Jul 04, 2016 at 02:37:55PM +0530, Jerin Jacob wrote:
> > > > On Mon, Jul 04, 2016 at 04:42:32PM +0800, Yuanhan Liu wrote:
> > > > > On Mon, Jul 04, 2016 at 02:06:27PM +0530, Jerin Jacob wrote:
> > > > > > On Mon, Jul 04, 2016 at 03:36:48PM +0800, Yuanhan Liu wrote:
> > > > > > > On Fri, Jul 01, 2016 at 04:46:36PM +0530, Jerin Jacob wrote:
> > > > > > > > @@ -494,9 +486,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > > > >  {
> > > > > > > >  	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
> > > > > > > >  
> > > > > > > > -#ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > > > -	struct virtio_hw *hw = dev->data->dev_private;
> > > > > > > > -#endif
> > > > > > > >  	struct virtnet_tx *txvq;
> > > > > > > >  	struct virtqueue *vq;
> > > > > > > >  	uint16_t tx_free_thresh;
> > > > > > > > @@ -511,13 +500,14 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
> > > > > > > >  	}
> > > > > > > >  
> > > > > > > >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > > > > > > > +	struct virtio_hw *hw = dev->data->dev_private;
> > > > > > > 
> > > > > > > I'd suggest to move above declaration to ...
> > > > > > > 
> > > > > > > >  	/* Use simple rx/tx func if single segment and no offloads */
> > > > > > > >  	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
> > > > > > > >  	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> > > > > > > 
> > > > > > > here: we should try to avoid declaring vars in the middle of a code block.
> > > > > > 
> > > > > > Next patch in this series, moving all rxtx handler selection code to
> > > > > > separate function(virtio_update_rxtx_handler()) where declaration comes
> > > > > > as first line in the function.i.e the comment is taken care of in the
> > > > > > series.
> > > > > 
> > > > > Yes, I saw that. But in principle, each patch is atomic: it's not a
> > > > > good idea/practice to introduce issues in path A and then fix it in
> > > > > path B.
> > > > 
> > > > In my view it was not an issue as I was removing all possible
> > > > conditional compilation flag. If I were to move the declaration to top
> > > > then another conditional compilation RTE_MACHINE_CPUFLAG_SSSE3
> > > > flag I need to add around declaring the variable.
> > > 
> > > Nope, I was suggesting to move it inside the "if" block. So, this
> > > is actually consistent with what you are trying to do. Besides, it
> > > removes an declaration in the middle.
> > 
> > Just to get the clarity on "moving inside the 'if' block"
> > 
> > Are you suggesting to have like below?
> > 
> >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > +       struct virtio_hw *hw;
> >         /* Use simple rx/tx func if single segment and no offloads */
> >         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
> > VIRTIO_SIMPLE_FLAGS &&
> >              !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> >                 PMD_INIT_LOG(INFO, "Using simple rx/tx path");
> >                 dev->tx_pkt_burst = virtio_xmit_pkts_simple;
> >                 dev->rx_pkt_burst = virtio_recv_pkts_vec;
> > -               use_simple_rxtx = 1;
> > +		hw = dev->data->dev_private;
> > +               hw->use_simple_rxtx = 1;
> >         }
> >  #endif
> > 
> > 
> > Instead of following scheme in existing patch,
> > 
> >  #ifdef RTE_MACHINE_CPUFLAG_SSSE3
> > +       struct virtio_hw *hw = dev->data->dev_private;
> >         /* Use simple rx/tx func if single segment and no offloads */
> >         if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) ==
> > VIRTIO_SIMPLE_FLAGS &&
> >              !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
> >                 PMD_INIT_LOG(INFO, "Using simple rx/tx path");
> >                 dev->tx_pkt_burst = virtio_xmit_pkts_simple;
> >                 dev->rx_pkt_burst = virtio_recv_pkts_vec;
> > -               use_simple_rxtx = 1;
> > +               hw->use_simple_rxtx = 1;
> >         }
> >  #endif
> > 
> > 
> > The former case will have issue as "hw" been used in "if" with vtpci_with_feature.
> 
> Oh, my bad. I overlooked it. Sorry for that!
> 
> > OR
> > 
> > if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
> > then it comes error on non x86 as unused "hw" variable.
> > 
> > If you meant something else then let me know?
> 
> I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
> us from offending a minor rule, while you can remove the ugly "#ifdef"
> block in the next patch.
> 
> Works to you?

OK. As you wish :-)

> 
> 	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v2 1/3] virtio: conditional compilation cleanup
  2016-07-04 12:50                   ` Jerin Jacob
@ 2016-07-04 12:57                     ` Yuanhan Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-07-04 12:57 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Mon, Jul 04, 2016 at 06:20:42PM +0530, Jerin Jacob wrote:
> > > The former case will have issue as "hw" been used in "if" with vtpci_with_feature.
> > 
> > Oh, my bad. I overlooked it. Sorry for that!
> > 
> > > OR
> > > 
> > > if you meant just floating "struct virtio_hw *hw" without RTE_MACHINE_CPUFLAG_SSSE3
> > > then it comes error on non x86 as unused "hw" variable.
> > > 
> > > If you meant something else then let me know?
> > 
> > I then prefer to keep the "#ifdef .. #endif" on top then. It will stop
> > us from offending a minor rule, while you can remove the ugly "#ifdef"
> > block in the next patch.
> > 
> > Works to you?
> 
> OK. As you wish :-)

Thank you!

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH v3 0/4] Virtio NEON support for ARM
  2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
                     ` (2 preceding siblings ...)
  2016-07-01 11:16   ` [PATCH v2 3/3] virtio: add neon support Jerin Jacob
@ 2016-07-05 12:49   ` Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 1/4] virtio: conditional compilation cleanup Jerin Jacob
                       ` (4 more replies)
  3 siblings, 5 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-05 12:49 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

This patch-set includes,

1) General cleanup of compile time dependency.
2) made vector handler section based on run-time cpuflags
2) Added NEON support for optimized Rx handling

This patch-set is based on dpdk-next-virtio/master

v3:
Address Yuanhan's review comments
http://dpdk.org/dev/patchwork/patch/14495/
http://dpdk.org/dev/patchwork/patch/14496/

v2:
- made vector handler selection based on run-time cpuflags (Suggested by Thomas)
- moved vector implementations to .c file instead of .h file(Suggested by Jianbo)

Jerin Jacob (4):
  virtio: conditional compilation cleanup
  virtio: move SSE based Rx implementation to separate file
  virtio: add cpuflag based vector handler selection
  virtio: add neon support

 MAINTAINERS                                  |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/virtio/Makefile                  |   7 +-
 drivers/net/virtio/virtio_pci.h              |   1 +
 drivers/net/virtio/virtio_rxtx.c             |  63 ++++---
 drivers/net/virtio/virtio_rxtx.h             |   3 +-
 drivers/net/virtio/virtio_rxtx_simple.c      | 269 ++-------------------------
 drivers/net/virtio/virtio_rxtx_simple.h      | 133 +++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.c  | 222 ++++++++++++++++++++++
 drivers/net/virtio/virtio_user_ethdev.c      |   1 +
 11 files changed, 646 insertions(+), 291 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

-- 
2.5.5

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH v3 1/4] virtio: conditional compilation cleanup
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
@ 2016-07-05 12:49     ` Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-05 12:49 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Removed unnecessary compile time dependency on "use_simple_rxtx".

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/Makefile             |  3 ---
 drivers/net/virtio/virtio_pci.h         |  1 +
 drivers/net/virtio/virtio_rxtx.c        | 24 ++++++++----------------
 drivers/net/virtio/virtio_rxtx.h        |  3 +--
 drivers/net/virtio/virtio_rxtx_simple.c |  8 ++++++--
 drivers/net/virtio/virtio_user_ethdev.c |  1 +
 6 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index 3020b68..b9b0d8d 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -50,10 +50,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtqueue.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_pci.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
-
-ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE3)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
-endif
 
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index dd7693f..b8295a7 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -253,6 +253,7 @@ struct virtio_hw {
 	uint8_t	    use_msix;
 	uint8_t     started;
 	uint8_t     modern;
+	uint8_t     use_simple_rxtx;
 	uint8_t     mac_addr[ETHER_ADDR_LEN];
 	uint32_t    notify_off_multiplier;
 	uint8_t     *isr;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index a27208e..e707954 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -67,10 +67,6 @@
 #define VIRTIO_SIMPLE_FLAGS ((uint32_t)ETH_TXQ_FLAGS_NOMULTSEGS | \
 	ETH_TXQ_FLAGS_NOOFFLOADS)
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-static int use_simple_rxtx;
-#endif
-
 static void
 vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
 {
@@ -333,6 +329,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 	 */
 	uint16_t i;
 	uint16_t desc_idx;
+	struct virtio_hw *hw = dev->data->dev_private;
 
 	PMD_INIT_FUNC_TRACE();
 
@@ -353,8 +350,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		nbufs = 0;
 		error = ENOSPC;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			for (desc_idx = 0; desc_idx < vq->vq_nentries;
 			     desc_idx++) {
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
@@ -362,7 +358,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 					VRING_DESC_F_WRITE;
 			}
 		}
-#endif
+
 		memset(&rxvq->fake_mbuf, 0, sizeof(rxvq->fake_mbuf));
 		for (desc_idx = 0; desc_idx < RTE_PMD_VIRTIO_RX_MAX_BURST;
 		     desc_idx++) {
@@ -378,12 +374,11 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			/******************************************
 			*         Enqueue allocated buffers        *
 			*******************************************/
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-			if (use_simple_rxtx)
+			if (hw->use_simple_rxtx)
 				error = virtqueue_enqueue_recv_refill_simple(vq, m);
 			else
-#endif
 				error = virtqueue_enqueue_recv_refill(vq, m);
+
 			if (error) {
 				rte_pktmbuf_free(m);
 				break;
@@ -404,8 +399,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 		struct virtqueue *vq = txvq->vq;
 
 		virtio_dev_vring_start(vq);
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-		if (use_simple_rxtx) {
+		if (hw->use_simple_rxtx) {
 			uint16_t mid_idx  = vq->vq_nentries >> 1;
 
 			for (desc_idx = 0; desc_idx < mid_idx; desc_idx++) {
@@ -426,7 +420,7 @@ virtio_dev_rxtx_start(struct rte_eth_dev *dev)
 			     desc_idx++)
 				vq->vq_ring.avail->ring[desc_idx] = desc_idx;
 		}
-#endif
+
 		VIRTQUEUE_DUMP(vq);
 	}
 }
@@ -456,9 +450,7 @@ virtio_dev_rx_queue_setup(struct rte_eth_dev *dev,
 
 	dev->data->rx_queues[queue_idx] = rxvq;
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 	virtio_rxq_vec_setup(rxvq);
-#endif
 
 	return 0;
 }
@@ -517,7 +509,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
 		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
 		dev->rx_pkt_burst = virtio_recv_pkts_vec;
-		use_simple_rxtx = 1;
+		hw->use_simple_rxtx = 1;
 	}
 #endif
 
diff --git a/drivers/net/virtio/virtio_rxtx.h b/drivers/net/virtio/virtio_rxtx.h
index 058b56a..28f82d6 100644
--- a/drivers/net/virtio/virtio_rxtx.h
+++ b/drivers/net/virtio/virtio_rxtx.h
@@ -86,10 +86,9 @@ struct virtnet_ctl {
 	const struct rte_memzone *mz;   /**< mem zone to populate RX ring. */
 };
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
 int virtio_rxq_vec_setup(struct virtnet_rx *rxvq);
 
 int virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	struct rte_mbuf *m);
-#endif
+
 #endif /* _VIRTIO_RXTX_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 242ad90..67430da 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -37,8 +37,6 @@
 #include <string.h>
 #include <errno.h>
 
-#include <tmmintrin.h>
-
 #include <rte_cycles.h>
 #include <rte_memory.h>
 #include <rte_memzone.h>
@@ -131,6 +129,10 @@ virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
 	vq_update_avail_idx(vq);
 }
 
+#ifdef RTE_MACHINE_CPUFLAG_SSSE3
+
+#include <tmmintrin.h>
+
 /* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
  *
  * This routine is for non-mergeable RX, one desc for each guest buffer.
@@ -293,6 +295,8 @@ virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return nb_pkts_received;
 }
 
+#endif
+
 #define VIRTIO_TX_FREE_THRESH 32
 #define VIRTIO_TX_MAX_FREE_BUF_SZ 32
 #define VIRTIO_TX_FREE_NR 32
diff --git a/drivers/net/virtio/virtio_user_ethdev.c b/drivers/net/virtio/virtio_user_ethdev.c
index 3d3c9da..3185a4c 100644
--- a/drivers/net/virtio/virtio_user_ethdev.c
+++ b/drivers/net/virtio/virtio_user_ethdev.c
@@ -303,6 +303,7 @@ virtio_user_eth_dev_alloc(const char *name)
 	hw->vtpci_ops = &virtio_user_ops;
 	hw->use_msix = 0;
 	hw->modern   = 0;
+	hw->use_simple_rxtx = 0;
 	hw->virtio_user_dev = dev;
 	data->dev_private = hw;
 	data->numa_node = SOCKET_ID_ANY;
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 1/4] virtio: conditional compilation cleanup Jerin Jacob
@ 2016-07-05 12:49     ` Jerin Jacob
  2016-08-18  6:52       ` Yuanhan Liu
  2016-07-05 12:49     ` [PATCH v3 3/4] virtio: add cpuflag based vector handler selection Jerin Jacob
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-05 12:49 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Split out SSE instruction based virtio simple Rx
implementation to a separate file

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/Makefile                 |   4 +
 drivers/net/virtio/virtio_rxtx_simple.c     | 273 ++--------------------------
 drivers/net/virtio/virtio_rxtx_simple.h     | 133 ++++++++++++++
 drivers/net/virtio/virtio_rxtx_simple_sse.c | 222 ++++++++++++++++++++++
 4 files changed, 370 insertions(+), 262 deletions(-)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c

diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index b9b0d8d..c4103b7 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -52,6 +52,10 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
 
+ifeq ($(CONFIG_RTE_ARCH_X86),y)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+endif
+
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 67430da..485ddce 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -51,14 +51,7 @@
 #include <rte_errno.h>
 #include <rte_byteorder.h>
 
-#include "virtio_logs.h"
-#include "virtio_ethdev.h"
-#include "virtqueue.h"
-#include "virtio_rxtx.h"
-
-#define RTE_VIRTIO_VPMD_RX_BURST 32
-#define RTE_VIRTIO_DESC_PER_LOOP 8
-#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+#include "virtio_rxtx_simple.h"
 
 #ifndef __INTEL_COMPILER
 #pragma GCC diagnostic ignored "-Wcast-qual"
@@ -89,260 +82,6 @@ virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	return 0;
 }
 
-static inline void
-virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
-{
-	int i;
-	uint16_t desc_idx;
-	struct rte_mbuf **sw_ring;
-	struct vring_desc *start_dp;
-	int ret;
-	struct virtqueue *vq = rxvq->vq;
-
-	desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
-	sw_ring = &vq->sw_ring[desc_idx];
-	start_dp = &vq->vq_ring.desc[desc_idx];
-
-	ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
-		RTE_VIRTIO_VPMD_RX_REARM_THRESH);
-	if (unlikely(ret)) {
-		rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
-			RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-		return;
-	}
-
-	for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
-		uintptr_t p;
-
-		p = (uintptr_t)&sw_ring[i]->rearm_data;
-		*(uint64_t *)p = rxvq->mbuf_initializer;
-
-		start_dp[i].addr =
-			MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
-			vq->hw->vtnet_hdr_size;
-		start_dp[i].len = sw_ring[i]->buf_len -
-			RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
-	}
-
-	vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-	vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
-	vq_update_avail_idx(vq);
-}
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-
-#include <tmmintrin.h>
-
-/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
- *
- * This routine is for non-mergeable RX, one desc for each guest buffer.
- * This routine is based on the RX ring layout optimization. Each entry in the
- * avail ring points to the desc with the same index in the desc ring and this
- * will never be changed in the driver.
- *
- * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
- */
-uint16_t
-virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-	uint16_t nb_pkts)
-{
-	struct virtnet_rx *rxvq = rx_queue;
-	struct virtqueue *vq = rxvq->vq;
-	uint16_t nb_used;
-	uint16_t desc_idx;
-	struct vring_used_elem *rused;
-	struct rte_mbuf **sw_ring;
-	struct rte_mbuf **sw_ring_end;
-	uint16_t nb_pkts_received;
-	__m128i shuf_msk1, shuf_msk2, len_adjust;
-
-	shuf_msk1 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		5, 4,			/* dat len */
-		0xFF, 0xFF, 5, 4,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-
-	);
-
-	shuf_msk2 = _mm_set_epi8(
-		0xFF, 0xFF, 0xFF, 0xFF,
-		0xFF, 0xFF,		/* vlan tci */
-		13, 12,			/* dat len */
-		0xFF, 0xFF, 13, 12,	/* pkt len */
-		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
-	);
-
-	/* Subtract the header length.
-	*  In which case do we need the header length in used->len ?
-	*/
-	len_adjust = _mm_set_epi16(
-		0, 0,
-		0,
-		(uint16_t)-vq->hw->vtnet_hdr_size,
-		0, (uint16_t)-vq->hw->vtnet_hdr_size,
-		0, 0);
-
-	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
-		return 0;
-
-	nb_used = VIRTQUEUE_NUSED(vq);
-
-	rte_compiler_barrier();
-
-	if (unlikely(nb_used == 0))
-		return 0;
-
-	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
-	nb_used = RTE_MIN(nb_used, nb_pkts);
-
-	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
-	rused = &vq->vq_ring.used->ring[desc_idx];
-	sw_ring  = &vq->sw_ring[desc_idx];
-	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
-
-	_mm_prefetch((const void *)rused, _MM_HINT_T0);
-
-	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
-		virtio_rxq_rearm_vec(rxvq);
-		if (unlikely(virtqueue_kick_prepare(vq)))
-			virtqueue_notify(vq);
-	}
-
-	for (nb_pkts_received = 0;
-		nb_pkts_received < nb_used;) {
-		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
-		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
-
-		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
-		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
-		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
-
-		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
-		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
-		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
-
-		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
-		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
-		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
-
-		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
-		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
-		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
-
-		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
-		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
-		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
-		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
-			pkt_mb[1]);
-		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
-			pkt_mb[0]);
-
-		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
-		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
-		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
-		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
-			pkt_mb[3]);
-		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
-			pkt_mb[2]);
-
-		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
-		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
-		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
-		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
-			pkt_mb[5]);
-		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
-			pkt_mb[4]);
-
-		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
-		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
-		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
-		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
-		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
-			pkt_mb[7]);
-		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
-			pkt_mb[6]);
-
-		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
-			if (sw_ring + nb_used <= sw_ring_end)
-				nb_pkts_received += nb_used;
-			else
-				nb_pkts_received += sw_ring_end - sw_ring;
-			break;
-		} else {
-			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
-				sw_ring_end)) {
-				nb_pkts_received += sw_ring_end - sw_ring;
-				break;
-			} else {
-				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
-
-				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
-				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
-				rused   += RTE_VIRTIO_DESC_PER_LOOP;
-				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
-			}
-		}
-	}
-
-	vq->vq_used_cons_idx += nb_pkts_received;
-	vq->vq_free_cnt += nb_pkts_received;
-	rxvq->stats.packets += nb_pkts_received;
-	return nb_pkts_received;
-}
-
-#endif
-
-#define VIRTIO_TX_FREE_THRESH 32
-#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
-#define VIRTIO_TX_FREE_NR 32
-/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
-static inline void
-virtio_xmit_cleanup(struct virtqueue *vq)
-{
-	uint16_t i, desc_idx;
-	int nb_free = 0;
-	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
-
-	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
-		   ((vq->vq_nentries >> 1) - 1));
-	m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-	m = __rte_pktmbuf_prefree_seg(m);
-	if (likely(m != NULL)) {
-		free[0] = m;
-		nb_free = 1;
-		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
-			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-			m = __rte_pktmbuf_prefree_seg(m);
-			if (likely(m != NULL)) {
-				if (likely(m->pool == free[0]->pool))
-					free[nb_free++] = m;
-				else {
-					rte_mempool_put_bulk(free[0]->pool,
-						(void **)free, nb_free);
-					free[0] = m;
-					nb_free = 1;
-				}
-			}
-		}
-		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
-	} else {
-		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
-			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
-			m = __rte_pktmbuf_prefree_seg(m);
-			if (m != NULL)
-				rte_mempool_put(m->pool, m);
-		}
-	}
-
-	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
-	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
-}
-
 uint16_t
 virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint16_t nb_pkts)
@@ -423,3 +162,13 @@ virtio_rxq_vec_setup(struct virtnet_rx *rxq)
 
 	return 0;
 }
+
+/* Stub for linkage when arch specific implementation is not available */
+uint16_t __attribute__((weak))
+virtio_recv_pkts_vec(void *rx_queue __rte_unused,
+		     struct rte_mbuf **rx_pkts __rte_unused,
+		     uint16_t nb_pkts __rte_unused)
+{
+	rte_panic("Wrong weak function linked by linker\n");
+	return 0;
+}
diff --git a/drivers/net/virtio/virtio_rxtx_simple.h b/drivers/net/virtio/virtio_rxtx_simple.h
new file mode 100644
index 0000000..8cb43c0
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple.h
@@ -0,0 +1,133 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VIRTIO_RXTX_SIMPLE_H_
+#define _VIRTIO_RXTX_SIMPLE_H_
+
+#include <stdint.h>
+
+#include "virtio_logs.h"
+#include "virtio_ethdev.h"
+#include "virtqueue.h"
+#include "virtio_rxtx.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+static inline void
+virtio_rxq_rearm_vec(struct virtnet_rx *rxvq)
+{
+	int i;
+	uint16_t desc_idx;
+	struct rte_mbuf **sw_ring;
+	struct vring_desc *start_dp;
+	int ret;
+	struct virtqueue *vq = rxvq->vq;
+
+	desc_idx = vq->vq_avail_idx & (vq->vq_nentries - 1);
+	sw_ring = &vq->sw_ring[desc_idx];
+	start_dp = &vq->vq_ring.desc[desc_idx];
+
+	ret = rte_mempool_get_bulk(rxvq->mpool, (void **)sw_ring,
+		RTE_VIRTIO_VPMD_RX_REARM_THRESH);
+	if (unlikely(ret)) {
+		rte_eth_devices[rxvq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+		return;
+	}
+
+	for (i = 0; i < RTE_VIRTIO_VPMD_RX_REARM_THRESH; i++) {
+		uintptr_t p;
+
+		p = (uintptr_t)&sw_ring[i]->rearm_data;
+		*(uint64_t *)p = rxvq->mbuf_initializer;
+
+		start_dp[i].addr =
+			MBUF_DATA_DMA_ADDR(sw_ring[i], vq->offset) -
+			vq->hw->vtnet_hdr_size;
+		start_dp[i].len = sw_ring[i]->buf_len -
+			RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
+	}
+
+	vq->vq_avail_idx += RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+	vq->vq_free_cnt -= RTE_VIRTIO_VPMD_RX_REARM_THRESH;
+	vq_update_avail_idx(vq);
+}
+
+#define VIRTIO_TX_FREE_THRESH 32
+#define VIRTIO_TX_MAX_FREE_BUF_SZ 32
+#define VIRTIO_TX_FREE_NR 32
+/* TODO: vq->tx_free_cnt could mean num of free slots so we could avoid shift */
+static inline void
+virtio_xmit_cleanup(struct virtqueue *vq)
+{
+	uint16_t i, desc_idx;
+	int nb_free = 0;
+	struct rte_mbuf *m, *free[VIRTIO_TX_MAX_FREE_BUF_SZ];
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx &
+		   ((vq->vq_nentries >> 1) - 1));
+	m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+	m = __rte_pktmbuf_prefree_seg(m);
+	if (likely(m != NULL)) {
+		free[0] = m;
+		nb_free = 1;
+		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+			m = __rte_pktmbuf_prefree_seg(m);
+			if (likely(m != NULL)) {
+				if (likely(m->pool == free[0]->pool))
+					free[nb_free++] = m;
+				else {
+					rte_mempool_put_bulk(free[0]->pool,
+						(void **)free, nb_free);
+					free[0] = m;
+					nb_free = 1;
+				}
+			}
+		}
+		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+	} else {
+		for (i = 1; i < VIRTIO_TX_FREE_NR; i++) {
+			m = (struct rte_mbuf *)vq->vq_descx[desc_idx++].cookie;
+			m = __rte_pktmbuf_prefree_seg(m);
+			if (m != NULL)
+				rte_mempool_put(m->pool, m);
+		}
+	}
+
+	vq->vq_used_cons_idx += VIRTIO_TX_FREE_NR;
+	vq->vq_free_cnt += (VIRTIO_TX_FREE_NR << 1);
+}
+
+#endif /* _VIRTIO_RXTX_SIMPLE_H_ */
diff --git a/drivers/net/virtio/virtio_rxtx_simple_sse.c b/drivers/net/virtio/virtio_rxtx_simple_sse.c
new file mode 100644
index 0000000..39000e8
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_sse.c
@@ -0,0 +1,222 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <tmmintrin.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+	__m128i shuf_msk1, shuf_msk2, len_adjust;
+
+	shuf_msk1 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		5, 4,			/* dat len */
+		0xFF, 0xFF, 5, 4,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+
+	);
+
+	shuf_msk2 = _mm_set_epi8(
+		0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF,		/* vlan tci */
+		13, 12,			/* dat len */
+		0xFF, 0xFF, 13, 12,	/* pkt len */
+		0xFF, 0xFF, 0xFF, 0xFF	/* packet type */
+	);
+
+	/* Subtract the header length.
+	*  In which case do we need the header length in used->len ?
+	*/
+	len_adjust = _mm_set_epi16(
+		0, 0,
+		0,
+		(uint16_t)-vq->hw->vtnet_hdr_size,
+		0, (uint16_t)-vq->hw->vtnet_hdr_size,
+		0, 0);
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_compiler_barrier();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	_mm_prefetch((const void *)rused, _MM_HINT_T0);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		__m128i desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		__m128i pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = _mm_loadu_si128((__m128i *)(sw_ring + 0));
+		desc[0] = _mm_loadu_si128((__m128i *)(rused + 0));
+		_mm_storeu_si128((__m128i *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = _mm_loadu_si128((__m128i *)(sw_ring + 2));
+		desc[1] = _mm_loadu_si128((__m128i *)(rused + 2));
+		_mm_storeu_si128((__m128i *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = _mm_loadu_si128((__m128i *)(sw_ring + 4));
+		desc[2] = _mm_loadu_si128((__m128i *)(rused + 4));
+		_mm_storeu_si128((__m128i *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = _mm_loadu_si128((__m128i *)(sw_ring + 6));
+		desc[3] = _mm_loadu_si128((__m128i *)(rused + 6));
+		_mm_storeu_si128((__m128i *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = _mm_shuffle_epi8(desc[0], shuf_msk2);
+		pkt_mb[0] = _mm_shuffle_epi8(desc[0], shuf_msk1);
+		pkt_mb[1] = _mm_add_epi16(pkt_mb[1], len_adjust);
+		pkt_mb[0] = _mm_add_epi16(pkt_mb[0], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		_mm_storeu_si128((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = _mm_shuffle_epi8(desc[1], shuf_msk2);
+		pkt_mb[2] = _mm_shuffle_epi8(desc[1], shuf_msk1);
+		pkt_mb[3] = _mm_add_epi16(pkt_mb[3], len_adjust);
+		pkt_mb[2] = _mm_add_epi16(pkt_mb[2], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		_mm_storeu_si128((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = _mm_shuffle_epi8(desc[2], shuf_msk2);
+		pkt_mb[4] = _mm_shuffle_epi8(desc[2], shuf_msk1);
+		pkt_mb[5] = _mm_add_epi16(pkt_mb[5], len_adjust);
+		pkt_mb[4] = _mm_add_epi16(pkt_mb[4], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		_mm_storeu_si128((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = _mm_shuffle_epi8(desc[3], shuf_msk2);
+		pkt_mb[6] = _mm_shuffle_epi8(desc[3], shuf_msk1);
+		pkt_mb[7] = _mm_add_epi16(pkt_mb[7], len_adjust);
+		pkt_mb[6] = _mm_add_epi16(pkt_mb[6], len_adjust);
+		_mm_storeu_si128((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		_mm_storeu_si128((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v3 3/4] virtio: add cpuflag based vector handler selection
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 1/4] virtio: conditional compilation cleanup Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-07-05 12:49     ` Jerin Jacob
  2016-07-05 12:49     ` [PATCH v3 4/4] virtio: add neon support Jerin Jacob
  2016-09-28  0:37     ` [PATCH v3 0/4] Virtio NEON support for ARM Yuanhan Liu
  4 siblings, 0 replies; 44+ messages in thread
From: Jerin Jacob @ 2016-07-05 12:49 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Introduced cpuflag based run-time detection to select the
SSE based simple Rx handler

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 drivers/net/virtio/virtio_rxtx.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index e707954..adc3457 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -50,6 +50,7 @@
 #include <rte_string_fns.h>
 #include <rte_errno.h>
 #include <rte_byteorder.h>
+#include <rte_cpuflags.h>
 
 #include "virtio_logs.h"
 #include "virtio_ethdev.h"
@@ -470,6 +471,28 @@ virtio_dev_rx_queue_release(void *rxq)
 	rte_memzone_free(mz);
 }
 
+static void
+virtio_update_rxtx_handler(struct rte_eth_dev *dev,
+			   const struct rte_eth_txconf *tx_conf)
+{
+	uint8_t use_simple_rxtx = 0;
+	struct virtio_hw *hw = dev->data->dev_private;
+
+#if defined RTE_ARCH_X86
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
+		use_simple_rxtx = 1;
+#endif
+	/* Use simple rx/tx func if single segment and no offloads */
+	if (use_simple_rxtx &&
+	    (tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
+	    !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
+		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
+		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
+		dev->rx_pkt_burst = virtio_recv_pkts_vec;
+		hw->use_simple_rxtx = use_simple_rxtx;
+	}
+}
+
 /*
  * struct rte_eth_dev *dev: Used to update dev
  * uint16_t nb_desc: Defaults to values read from config space
@@ -485,10 +508,6 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 			const struct rte_eth_txconf *tx_conf)
 {
 	uint8_t vtpci_queue_idx = 2 * queue_idx + VTNET_SQ_TQ_QUEUE_IDX;
-
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-	struct virtio_hw *hw = dev->data->dev_private;
-#endif
 	struct virtnet_tx *txvq;
 	struct virtqueue *vq;
 	uint16_t tx_free_thresh;
@@ -502,16 +521,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		return -EINVAL;
 	}
 
-#ifdef RTE_MACHINE_CPUFLAG_SSSE3
-	/* Use simple rx/tx func if single segment and no offloads */
-	if ((tx_conf->txq_flags & VIRTIO_SIMPLE_FLAGS) == VIRTIO_SIMPLE_FLAGS &&
-	     !vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {
-		PMD_INIT_LOG(INFO, "Using simple rx/tx path");
-		dev->tx_pkt_burst = virtio_xmit_pkts_simple;
-		dev->rx_pkt_burst = virtio_recv_pkts_vec;
-		hw->use_simple_rxtx = 1;
-	}
-#endif
+	virtio_update_rxtx_handler(dev, tx_conf);
 
 	ret = virtio_dev_queue_setup(dev, VTNET_TQ, queue_idx, vtpci_queue_idx,
 			nb_desc, socket_id, (void **)&txvq);
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH v3 4/4] virtio: add neon support
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
                       ` (2 preceding siblings ...)
  2016-07-05 12:49     ` [PATCH v3 3/4] virtio: add cpuflag based vector handler selection Jerin Jacob
@ 2016-07-05 12:49     ` Jerin Jacob
  2016-07-06  3:11       ` Jianbo Liu
  2016-09-28  0:37     ` [PATCH v3 0/4] Virtio NEON support for ARM Yuanhan Liu
  4 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-07-05 12:49 UTC (permalink / raw)
  To: dev
  Cc: thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie,
	yuanhan.liu, Jerin Jacob

Added neon based Rx vector implementation.
Selection of the new handler based neon availability at runtime.
Updated the release notes and MAINTAINERS file.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 MAINTAINERS                                  |   1 +
 doc/guides/rel_notes/release_16_07.rst       |   2 +
 drivers/net/virtio/Makefile                  |   2 +
 drivers/net/virtio/virtio_rxtx.c             |   3 +
 drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
 5 files changed, 243 insertions(+)
 create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a59191e..ab04cee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -143,6 +143,7 @@ F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
+F: drivers/net/virtio/virtio_rxtx_simple_neon.c
 
 EZchip TILE-Gx
 M: Zhigang Lu <zlu@ezchip.com>
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 569f562..57f3d28 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -180,6 +180,8 @@ New Features
   section of the "Network Interface Controller Drivers" document.
 
 
+* **Virtio NEON support for ARM.**
+
 Resolved Issues
 ---------------
 
diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
index c4103b7..97972a6 100644
--- a/drivers/net/virtio/Makefile
+++ b/drivers/net/virtio/Makefile
@@ -54,6 +54,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c
 
 ifeq ($(CONFIG_RTE_ARCH_X86),y)
 SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_sse.c
+else ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple_neon.c
 endif
 
 ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index adc3457..8f6cad8 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -481,6 +481,9 @@ virtio_update_rxtx_handler(struct rte_eth_dev *dev,
 #if defined RTE_ARCH_X86
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE3))
 		use_simple_rxtx = 1;
+#elif defined RTE_ARCH_ARM64 || defined CONFIG_RTE_ARCH_ARM
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON))
+		use_simple_rxtx = 1;
 #endif
 	/* Use simple rx/tx func if single segment and no offloads */
 	if (use_simple_rxtx &&
diff --git a/drivers/net/virtio/virtio_rxtx_simple_neon.c b/drivers/net/virtio/virtio_rxtx_simple_neon.c
new file mode 100644
index 0000000..793eefb
--- /dev/null
+++ b/drivers/net/virtio/virtio_rxtx_simple_neon.c
@@ -0,0 +1,235 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <rte_byteorder.h>
+#include <rte_branch_prediction.h>
+#include <rte_cycles.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_memzone.h>
+#include <rte_mempool.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_prefetch.h>
+#include <rte_string_fns.h>
+#include <rte_vect.h>
+
+#include "virtio_rxtx_simple.h"
+
+#define RTE_VIRTIO_VPMD_RX_BURST 32
+#define RTE_VIRTIO_DESC_PER_LOOP 8
+#define RTE_VIRTIO_VPMD_RX_REARM_THRESH RTE_VIRTIO_VPMD_RX_BURST
+
+/* virtio vPMD receive routine, only accept(nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP)
+ *
+ * This routine is for non-mergeable RX, one desc for each guest buffer.
+ * This routine is based on the RX ring layout optimization. Each entry in the
+ * avail ring points to the desc with the same index in the desc ring and this
+ * will never be changed in the driver.
+ *
+ * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
+ */
+uint16_t
+virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+	uint16_t nb_pkts)
+{
+	struct virtnet_rx *rxvq = rx_queue;
+	struct virtqueue *vq = rxvq->vq;
+	uint16_t nb_used;
+	uint16_t desc_idx;
+	struct vring_used_elem *rused;
+	struct rte_mbuf **sw_ring;
+	struct rte_mbuf **sw_ring_end;
+	uint16_t nb_pkts_received;
+
+	uint8x16_t shuf_msk1 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		4, 5, 0xFF, 0xFF,       /* pkt len */
+		4, 5,                   /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	uint8x16_t shuf_msk2 = {
+		0xFF, 0xFF, 0xFF, 0xFF, /* packet type */
+		12, 13, 0xFF, 0xFF,     /* pkt len */
+		12, 13,                 /* dat len */
+		0xFF, 0xFF,             /* vlan tci */
+		0xFF, 0xFF, 0xFF, 0xFF
+	};
+
+	/* Subtract the header length.
+	 *  In which case do we need the header length in used->len ?
+	 */
+	uint16x8_t len_adjust = {
+		0, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size, 0,
+		(uint16_t)vq->hw->vtnet_hdr_size,
+		0,
+		0, 0
+	};
+
+	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
+		return 0;
+
+	nb_used = VIRTQUEUE_NUSED(vq);
+
+	rte_rmb();
+
+	if (unlikely(nb_used == 0))
+		return 0;
+
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
+	nb_used = RTE_MIN(nb_used, nb_pkts);
+
+	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+	rused = &vq->vq_ring.used->ring[desc_idx];
+	sw_ring  = &vq->sw_ring[desc_idx];
+	sw_ring_end = &vq->sw_ring[vq->vq_nentries];
+
+	rte_prefetch_non_temporal(rused);
+
+	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
+		virtio_rxq_rearm_vec(rxvq);
+		if (unlikely(virtqueue_kick_prepare(vq)))
+			virtqueue_notify(vq);
+	}
+
+	for (nb_pkts_received = 0;
+		nb_pkts_received < nb_used;) {
+		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
+		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];
+
+		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
+		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
+		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);
+
+		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
+		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
+		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);
+
+		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
+		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
+		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);
+
+		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
+		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
+		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);
+
+		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
+		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
+		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
+		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
+			pkt_mb[1]);
+		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
+			pkt_mb[0]);
+
+		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
+		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
+		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
+		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
+			pkt_mb[3]);
+		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
+			pkt_mb[2]);
+
+		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
+		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
+		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
+		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
+			pkt_mb[5]);
+		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
+			pkt_mb[4]);
+
+		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
+		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
+				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
+		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
+		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
+				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
+		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
+			pkt_mb[7]);
+		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
+			pkt_mb[6]);
+
+		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
+			if (sw_ring + nb_used <= sw_ring_end)
+				nb_pkts_received += nb_used;
+			else
+				nb_pkts_received += sw_ring_end - sw_ring;
+			break;
+		} else {
+			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
+				sw_ring_end)) {
+				nb_pkts_received += sw_ring_end - sw_ring;
+				break;
+			} else {
+				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;
+
+				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
+				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
+				rused   += RTE_VIRTIO_DESC_PER_LOOP;
+				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
+			}
+		}
+	}
+
+	vq->vq_used_cons_idx += nb_pkts_received;
+	vq->vq_free_cnt += nb_pkts_received;
+	rxvq->stats.packets += nb_pkts_received;
+	return nb_pkts_received;
+}
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH v3 4/4] virtio: add neon support
  2016-07-05 12:49     ` [PATCH v3 4/4] virtio: add neon support Jerin Jacob
@ 2016-07-06  3:11       ` Jianbo Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Jianbo Liu @ 2016-07-06  3:11 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, Thomas Monjalon, Bruce Richardson, huawei.xie, yuanhan.liu

On 5 July 2016 at 20:49, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> Added neon based Rx vector implementation.
> Selection of the new handler based neon availability at runtime.
> Updated the release notes and MAINTAINERS file.
>
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  MAINTAINERS                                  |   1 +
>  doc/guides/rel_notes/release_16_07.rst       |   2 +
>  drivers/net/virtio/Makefile                  |   2 +
>  drivers/net/virtio/virtio_rxtx.c             |   3 +
>  drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++++++
>  5 files changed, 243 insertions(+)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
>

Acked-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file
  2016-07-05 12:49     ` [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
@ 2016-08-18  6:52       ` Yuanhan Liu
  2016-08-19  3:24         ` Jerin Jacob
  0 siblings, 1 reply; 44+ messages in thread
From: Yuanhan Liu @ 2016-08-18  6:52 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Tue, Jul 05, 2016 at 06:19:24PM +0530, Jerin Jacob wrote:
> Split out SSE instruction based virtio simple Rx
> implementation to a separate file
> 
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Hi,

I was about to apply this set. I then did some build test and found a
weird issue: it breaks the build with clang (ubuntu 16.04).

    drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
            _mm_prefetch((const void *)rused, _MM_HINT_T0);
            ^
    /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
    #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
                                                         ^
    1 error generated.

Weird enough I don't see this issue before this commit: the error
line is exactly the same before and after this commit.

Another note is that _mm_prefetch() is actually with different prototype
for gcc and clang. For gcc, we have:

    _mm_prefetch (const void *__P, enum _mm_hint __I)

Any thoughts?

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file
  2016-08-18  6:52       ` Yuanhan Liu
@ 2016-08-19  3:24         ` Jerin Jacob
  2016-08-23  7:43           ` Yuanhan Liu
  0 siblings, 1 reply; 44+ messages in thread
From: Jerin Jacob @ 2016-08-19  3:24 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Thu, Aug 18, 2016 at 02:52:31PM +0800, Yuanhan Liu wrote:
> On Tue, Jul 05, 2016 at 06:19:24PM +0530, Jerin Jacob wrote:
> > Split out SSE instruction based virtio simple Rx
> > implementation to a separate file
> > 
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> 
> Hi,
> 
> I was about to apply this set. I then did some build test and found a
> weird issue: it breaks the build with clang (ubuntu 16.04).
> 
>     drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
>             _mm_prefetch((const void *)rused, _MM_HINT_T0);
>             ^
>     /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
>     #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
>                                                          ^
>     1 error generated.
> 
> Weird enough I don't see this issue before this commit: the error
> line is exactly the same before and after this commit.

Yes, I looked at the pre processed output as well, it comes as same before and
after this commit.

> 
> Another note is that _mm_prefetch() is actually with different prototype
> for gcc and clang. For gcc, we have:
> 
>     _mm_prefetch (const void *__P, enum _mm_hint __I)
> 
> Any thoughts?

How about replacing "_mm_prefetch((const void *)rused, _MM_HINT_T0)"
with "rte_prefetch0(rused)" to have same prototype and fix the issue
with clang?

> 
> 	--yliu
> 
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file
  2016-08-19  3:24         ` Jerin Jacob
@ 2016-08-23  7:43           ` Yuanhan Liu
  0 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-08-23  7:43 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Fri, Aug 19, 2016 at 08:54:00AM +0530, Jerin Jacob wrote:
> On Thu, Aug 18, 2016 at 02:52:31PM +0800, Yuanhan Liu wrote:
> > On Tue, Jul 05, 2016 at 06:19:24PM +0530, Jerin Jacob wrote:
> > > Split out SSE instruction based virtio simple Rx
> > > implementation to a separate file
> > > 
> > > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > 
> > Hi,
> > 
> > I was about to apply this set. I then did some build test and found a
> > weird issue: it breaks the build with clang (ubuntu 16.04).
> > 
> >     drivers/net/virtio/virtio_rxtx_simple_sse.c:130:2: error: cast from 'const void *' to 'void *' drops const qualifier [-Werror,-Wcast-qual]
> >             _mm_prefetch((const void *)rused, _MM_HINT_T0);
> >             ^
> >     /usr/lib/llvm-3.8/bin/../lib/clang/3.8.0/include/xmmintrin.h:684:58: note: expanded from macro '_mm_prefetch'
> >     #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
> >                                                          ^
> >     1 error generated.
> > 
> > Weird enough I don't see this issue before this commit: the error
> > line is exactly the same before and after this commit.
> 
> Yes, I looked at the pre processed output as well, it comes as same before and
> after this commit.
> 
> > 
> > Another note is that _mm_prefetch() is actually with different prototype
> > for gcc and clang. For gcc, we have:
> > 
> >     _mm_prefetch (const void *__P, enum _mm_hint __I)
> > 
> > Any thoughts?
> 
> How about replacing "_mm_prefetch((const void *)rused, _MM_HINT_T0)"
> with "rte_prefetch0(rused)" to have same prototype and fix the issue
> with clang?

Yes, it should work. BTW, I have just sent out a patch for that. I will
apply yours when that patch has been applied.

	--yliu

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH v3 0/4] Virtio NEON support for ARM
  2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
                       ` (3 preceding siblings ...)
  2016-07-05 12:49     ` [PATCH v3 4/4] virtio: add neon support Jerin Jacob
@ 2016-09-28  0:37     ` Yuanhan Liu
  4 siblings, 0 replies; 44+ messages in thread
From: Yuanhan Liu @ 2016-09-28  0:37 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: dev, thomas.monjalon, bruce.richardson, jianbo.liu, huawei.xie

On Tue, Jul 05, 2016 at 06:19:22PM +0530, Jerin Jacob wrote:
> This patch-set includes,
> 
> 1) General cleanup of compile time dependency.
> 2) made vector handler section based on run-time cpuflags
> 2) Added NEON support for optimized Rx handling
> 
> This patch-set is based on dpdk-next-virtio/master

Now the weird build issue is gone, this series is applied to
dpdk-next-virtio.

I addressed some conflict issues; you might want to have a
simple check to make sure I don't mess something up. Or even
better, if you could do a test.

Thanks.

	--yliu

> 
> v3:
> Address Yuanhan's review comments
> http://dpdk.org/dev/patchwork/patch/14495/
> http://dpdk.org/dev/patchwork/patch/14496/
> 
> v2:
> - made vector handler selection based on run-time cpuflags (Suggested by Thomas)
> - moved vector implementations to .c file instead of .h file(Suggested by Jianbo)
> 
> Jerin Jacob (4):
>   virtio: conditional compilation cleanup
>   virtio: move SSE based Rx implementation to separate file
>   virtio: add cpuflag based vector handler selection
>   virtio: add neon support
> 
>  MAINTAINERS                                  |   1 +
>  doc/guides/rel_notes/release_16_07.rst       |   2 +
>  drivers/net/virtio/Makefile                  |   7 +-
>  drivers/net/virtio/virtio_pci.h              |   1 +
>  drivers/net/virtio/virtio_rxtx.c             |  63 ++++---
>  drivers/net/virtio/virtio_rxtx.h             |   3 +-
>  drivers/net/virtio/virtio_rxtx_simple.c      | 269 ++-------------------------
>  drivers/net/virtio/virtio_rxtx_simple.h      | 133 +++++++++++++
>  drivers/net/virtio/virtio_rxtx_simple_neon.c | 235 +++++++++++++++++++++++
>  drivers/net/virtio/virtio_rxtx_simple_sse.c  | 222 ++++++++++++++++++++++
>  drivers/net/virtio/virtio_user_ethdev.c      |   1 +
>  11 files changed, 646 insertions(+), 291 deletions(-)
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple.h
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_neon.c
>  create mode 100644 drivers/net/virtio/virtio_rxtx_simple_sse.c
> 
> -- 
> 2.5.5

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2016-09-28  0:36 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-27 11:54 [PATCH 0/4] Virtio NEON support for ARM Jerin Jacob
2016-06-27 11:54 ` [PATCH 1/4] virtio: Fix compile time dependency of use_simple_rxtx usage Jerin Jacob
2016-06-27 11:54 ` [PATCH 2/4] virtio: introduce RTE_LIBRTE_VIRTIO_INC_VECTOR Jerin Jacob
2016-06-27 14:19   ` Thomas Monjalon
2016-06-27 14:48     ` Jerin Jacob
2016-06-27 14:59       ` Thomas Monjalon
2016-06-29 11:18         ` Jerin Jacob
2016-06-29 11:25           ` Thomas Monjalon
2016-06-29 11:40             ` Jerin Jacob
2016-06-30  5:44               ` Yuanhan Liu
2016-06-27 11:54 ` [PATCH 3/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
2016-06-28  6:17   ` Jianbo Liu
2016-06-29 11:27     ` Jerin Jacob
2016-06-30  5:43       ` Yuanhan Liu
2016-06-27 11:54 ` [PATCH 4/4] virtio: add neon support Jerin Jacob
2016-07-01 11:16 ` From: Jerin Jacob <jerin.jacob@caviumnetworks.com> Jerin Jacob
2016-07-01 11:16   ` [PATCH v2 1/3] virtio: conditional compilation cleanup Jerin Jacob
2016-07-04  7:36     ` Yuanhan Liu
2016-07-04  8:36       ` Jerin Jacob
2016-07-04  8:42         ` Yuanhan Liu
2016-07-04  9:07           ` Jerin Jacob
2016-07-04 11:02             ` Yuanhan Liu
2016-07-04 12:15               ` Jerin Jacob
2016-07-04 12:26                 ` Yuanhan Liu
2016-07-04 12:50                   ` Jerin Jacob
2016-07-04 12:57                     ` Yuanhan Liu
2016-07-01 11:16   ` [PATCH v2 2/3] virtio: move SSE based Rx implementation to separate file Jerin Jacob
2016-07-04  7:42     ` Yuanhan Liu
2016-07-04  8:38       ` Jerin Jacob
2016-07-01 11:16   ` [PATCH v2 3/3] virtio: add neon support Jerin Jacob
2016-07-04  7:53     ` Yuanhan Liu
2016-07-04  8:55       ` Jerin Jacob
2016-07-04  9:02         ` Yuanhan Liu
2016-07-05 12:49   ` [PATCH v3 0/4] Virtio NEON support for ARM Jerin Jacob
2016-07-05 12:49     ` [PATCH v3 1/4] virtio: conditional compilation cleanup Jerin Jacob
2016-07-05 12:49     ` [PATCH v3 2/4] virtio: move SSE based Rx implementation to separate file Jerin Jacob
2016-08-18  6:52       ` Yuanhan Liu
2016-08-19  3:24         ` Jerin Jacob
2016-08-23  7:43           ` Yuanhan Liu
2016-07-05 12:49     ` [PATCH v3 3/4] virtio: add cpuflag based vector handler selection Jerin Jacob
2016-07-05 12:49     ` [PATCH v3 4/4] virtio: add neon support Jerin Jacob
2016-07-06  3:11       ` Jianbo Liu
2016-09-28  0:37     ` [PATCH v3 0/4] Virtio NEON support for ARM Yuanhan Liu
2016-07-01 11:19 ` [PATCH v2 0/3] " Jerin Jacob

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.