All of lore.kernel.org
 help / color / mirror / Atom feed
From: Qi Zhang <qi.z.zhang@intel.com>
To: dev@dpdk.org
Cc: magnus.karlsson@intel.com, bjorn.topel@intel.com,
	Qi Zhang <qi.z.zhang@intel.com>
Subject: [RFC v2 1/7] net/af_xdp: new PMD driver
Date: Thu,  8 Mar 2018 21:52:43 +0800	[thread overview]
Message-ID: <20180308135249.28187-2-qi.z.zhang@intel.com> (raw)
In-Reply-To: <20180308135249.28187-1-qi.z.zhang@intel.com>

This is the vanilla version.
Packet data is copied between the af_xdp memory buffer and the mbuf mempool.
Indexes of the memory buffers are simply managed by a FIFO ring.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 config/common_base                            |   5 +
 config/common_linuxapp                        |   1 +
 drivers/net/Makefile                          |   1 +
 drivers/net/af_xdp/Makefile                   |  26 +
 drivers/net/af_xdp/meson.build                |   7 +
 drivers/net/af_xdp/rte_eth_af_xdp.c           | 760 ++++++++++++++++++++++++++
 drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
 drivers/net/af_xdp/xdpsock_queue.h            |  66 +++
 mk/rte.app.mk                                 |   1 +
 9 files changed, 871 insertions(+)
 create mode 100644 drivers/net/af_xdp/Makefile
 create mode 100644 drivers/net/af_xdp/meson.build
 create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
 create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
 create mode 100644 drivers/net/af_xdp/xdpsock_queue.h

diff --git a/config/common_base b/config/common_base
index ad03cf433..84b7b3b7e 100644
--- a/config/common_base
+++ b/config/common_base
@@ -368,6 +368,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
 
 #
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
+#
 # Compile link bonding PMD library
 #
 CONFIG_RTE_LIBRTE_PMD_BOND=y
diff --git a/config/common_linuxapp b/config/common_linuxapp
index ff98f2355..3b10695b6 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -16,6 +16,7 @@ CONFIG_RTE_LIBRTE_VHOST=y
 CONFIG_RTE_LIBRTE_VHOST_NUMA=y
 CONFIG_RTE_LIBRTE_PMD_VHOST=y
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=y
 CONFIG_RTE_LIBRTE_PMD_TAP=y
 CONFIG_RTE_LIBRTE_AVP_PMD=y
 CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=y
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index e1127326b..409234ac3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
 endif
 
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
 DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
 DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
 DIRS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += avp
diff --git a/drivers/net/af_xdp/Makefile b/drivers/net/af_xdp/Makefile
new file mode 100644
index 000000000..990073655
--- /dev/null
+++ b/drivers/net/af_xdp/Makefile
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+CFLAGS += -O3 -I/opt/af_xdp/linux_headers/include
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build
new file mode 100644
index 000000000..4b6652685
--- /dev/null
+++ b/drivers/net/af_xdp/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+if host_machine.system() != 'linux'
+	build = false
+endif
+sources = files('rte_eth_af_xdp.c')
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
new file mode 100644
index 000000000..5c7c53aeb
--- /dev/null
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -0,0 +1,760 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include "xdpsock_queue.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG		"iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG	"queue"
+#define ETH_AF_XDP_RING_SIZE_ARG	"ringsz"
+
+#define ETH_AF_XDP_FRAME_SIZE		2048
+#define ETH_AF_XDP_NUM_BUFFERS		131072
+#define ETH_AF_XDP_DATA_HEADROOM	0
+#define ETH_AF_XDP_DFLT_RING_SIZE	1024
+#define ETH_AF_XDP_DFLT_QUEUE_IDX	0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE	32
+#define ETH_AF_XDP_TX_BATCH_SIZE	32
+
+/* Frame memory area that is registered with the AF_XDP socket. */
+struct xdp_umem {
+	/* start of the page-aligned frame area */
+	char *buffer;
+	/* total size of the area in bytes (nframes * frame_size) */
+	size_t size;
+	/* size of one frame in bytes */
+	unsigned int frame_size;
+	/* shift that converts a frame index into a byte offset in buffer */
+	unsigned int frame_size_log2;
+	/* number of frames held in buffer */
+	unsigned int nframes;
+	/* socket fd the area was registered on */
+	int mr_fd;
+};
+
+/* Per-port private data; also used as the RX and TX queue handle. */
+struct pmd_internals {
+	/* AF_XDP socket fd */
+	int sfd;
+	/* index and name of the backing kernel network interface */
+	int if_index;
+	char if_name[IFNAMSIZ];
+	/* hardware address read from the interface */
+	struct ether_addr eth_addr;
+	/* RX and TX descriptor rings mapped from the socket */
+	struct xdp_queue rx;
+	struct xdp_queue tx;
+	/* frame memory registered with the socket */
+	struct xdp_umem *umem;
+	/* mempool that received packets are copied into */
+	struct rte_mempool *mb_pool;
+
+	/* RX statistics */
+	unsigned long rx_pkts;
+	unsigned long rx_bytes;
+	unsigned long rx_dropped;
+
+	/* TX statistics */
+	unsigned long tx_pkts;
+	unsigned long err_pkts;
+	unsigned long tx_bytes;
+
+	uint16_t port_id;
+	/* interface queue the socket is bound to */
+	uint16_t queue_idx;
+	/* number of descriptors requested for each of the RX and TX rings */
+	int ring_size;
+	/* FIFO holding the indexes of the umem frames that are currently free */
+	struct rte_ring *buf_ring;
+};
+
+/* NULL-terminated list of the devargs keys accepted by this driver. */
+static const char * const valid_arguments[] = {
+	ETH_AF_XDP_IFACE_ARG,
+	ETH_AF_XDP_QUEUE_IDX_ARG,
+	ETH_AF_XDP_RING_SIZE_ARG,
+	NULL
+};
+
+/*
+ * Link template copied into every new port; the link starts DOWN and is
+ * switched by eth_dev_start()/eth_dev_stop().
+ */
+static struct rte_eth_link pmd_link = {
+	.link_speed = ETH_SPEED_NUM_10G,
+	.link_duplex = ETH_LINK_FULL_DUPLEX,
+	.link_status = ETH_LINK_DOWN,
+	.link_autoneg = ETH_LINK_AUTONEG
+};
+
+/*
+ * Return the address of the byte at 'offset' inside umem frame 'index'.
+ */
+static void *get_pkt_data(struct pmd_internals *internals,
+			  uint32_t index,
+			  uint32_t offset)
+{
+	struct xdp_umem *umem = internals->umem;
+	char *frame = umem->buffer + (index << umem->frame_size_log2);
+
+	return (void *)(frame + offset);
+}
+
+/*
+ * RX burst handler.
+ *
+ * Tops up the RX descriptor ring with free frame indexes taken from
+ * buf_ring, dequeues up to nb_pkts (capped at ETH_AF_XDP_RX_BATCH_SIZE)
+ * completed descriptors, copies each frame into a newly allocated mbuf and
+ * hands the frame indexes back to buf_ring.
+ *
+ * Returns the number of mbufs stored in bufs. A frame for which no mbuf
+ * could be allocated is counted in rx_dropped and its index is recycled.
+ */
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pmd_internals *internals = queue;
+	struct xdp_queue *rxq = &internals->rx;
+	struct rte_mbuf *mbuf;
+	unsigned long dropped = 0;
+	unsigned long rx_bytes = 0;
+	uint16_t count = 0;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+		  nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+	struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
+	void *indexes[ETH_AF_XDP_RX_BATCH_SIZE];
+	int rcvd, i;
+
+	/* fill rx ring */
+	if (rxq->num_free >= ETH_AF_XDP_RX_BATCH_SIZE) {
+		int n = rte_ring_dequeue_bulk(internals->buf_ring,
+					      indexes,
+					      ETH_AF_XDP_RX_BATCH_SIZE,
+					      NULL);
+		/*
+		 * Zero every field except idx, as fill_rx_desc() does, so
+		 * that no stack garbage is written into the shared ring.
+		 */
+		for (i = 0; i < n; i++)
+			descs[i] = (struct xdp_desc){
+				.idx = (uint32_t)((long int)indexes[i]),
+			};
+		xq_enq(rxq, descs, n);
+	}
+
+	/* read data */
+	rcvd = xq_deq(rxq, descs, nb_pkts);
+	if (rcvd == 0)
+		return 0;
+
+	for (i = 0; i < rcvd; i++) {
+		char *pkt;
+		uint32_t idx = descs[i].idx;
+
+		mbuf = rte_pktmbuf_alloc(internals->mb_pool);
+		if (mbuf) {
+			/* only touch the mbuf once the allocation succeeded */
+			rte_pktmbuf_pkt_len(mbuf) =
+				rte_pktmbuf_data_len(mbuf) =
+				descs[i].len;
+			pkt = get_pkt_data(internals, idx, descs[i].offset);
+			memcpy(rte_pktmbuf_mtod(mbuf, void *),
+			       pkt, descs[i].len);
+			rx_bytes += descs[i].len;
+			bufs[count++] = mbuf;
+		} else {
+			dropped++;
+		}
+		/* the frame is free again whether or not it was copied */
+		indexes[i] = (void *)((long int)idx);
+	}
+
+	rte_ring_enqueue_bulk(internals->buf_ring, indexes, rcvd, NULL);
+
+	internals->rx_pkts += (rcvd - dropped);
+	internals->rx_bytes += rx_bytes;
+	internals->rx_dropped += dropped;
+
+	return count;
+}
+
+/*
+ * Issue a zero-length, non-blocking sendto() on the socket (presumably
+ * this prompts the kernel to process the TX ring - confirm against the
+ * AF_XDP kernel headers in use).
+ *
+ * The call is retried only while it fails with EAGAIN. Success, ENOBUFS
+ * and every other error end the loop, so a persistent socket error can no
+ * longer spin forever in the datapath.
+ */
+static void kick_tx(int fd)
+{
+	int ret;
+
+	do {
+		ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	} while (ret < 0 && errno == EAGAIN);
+}
+
+/*
+ * TX burst handler.
+ *
+ * Reclaims completed TX descriptors when the ring runs low, takes one free
+ * umem frame per packet from buf_ring, copies every packet that fits into a
+ * frame and posts it on the TX ring. Packets larger than a frame are
+ * counted in err_pkts and their unused frames go back to buf_ring.
+ *
+ * Returns the number of mbufs consumed; every consumed mbuf is freed,
+ * whether it was sent or dropped.
+ *
+ * NOTE(review): pkt_len bytes are copied from the first segment only, so
+ * this assumes single-segment mbufs - confirm.
+ */
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct pmd_internals *internals = queue;
+	struct xdp_queue *txq = &internals->tx;
+	struct rte_mbuf *mbuf;
+	struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
+	void *indexes[ETH_AF_XDP_TX_BATCH_SIZE];
+	uint16_t i, valid;
+	unsigned long tx_bytes = 0;
+
+	nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+		  nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+	/* recycle frames of descriptors the kernel has finished with */
+	if (txq->num_free < ETH_AF_XDP_TX_BATCH_SIZE * 2) {
+		int n = xq_deq(txq, descs, ETH_AF_XDP_TX_BATCH_SIZE);
+
+		for (i = 0; i < n; i++)
+			indexes[i] = (void *)((long int)descs[i].idx);
+		rte_ring_enqueue_bulk(internals->buf_ring, indexes, n, NULL);
+	}
+
+	nb_pkts = nb_pkts > txq->num_free ? txq->num_free : nb_pkts;
+	nb_pkts = rte_ring_dequeue_bulk(internals->buf_ring, indexes,
+					nb_pkts, NULL);
+
+	valid = 0;
+	for (i = 0; i < nb_pkts; i++) {
+		char *pkt;
+		unsigned int buf_len =
+			internals->umem->frame_size - ETH_AF_XDP_DATA_HEADROOM;
+		mbuf = bufs[i];
+		if (mbuf->pkt_len <= buf_len) {
+			descs[valid].idx = (uint32_t)((long int)indexes[valid]);
+			descs[valid].offset = ETH_AF_XDP_DATA_HEADROOM;
+			descs[valid].flags = 0;
+			descs[valid].len = mbuf->pkt_len;
+			/*
+			 * Use the descriptor that was just filled in: 'valid'
+			 * lags behind 'i' once a packet has been skipped.
+			 */
+			pkt = get_pkt_data(internals, descs[valid].idx,
+					   descs[valid].offset);
+			memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+			       descs[valid].len);
+			valid++;
+			tx_bytes += mbuf->pkt_len;
+		}
+		/* packet will be consumed anyway */
+		rte_pktmbuf_free(mbuf);
+	}
+
+	xq_enq(txq, descs, valid);
+	kick_tx(internals->sfd);
+
+	/* give back the frames reserved for packets that were dropped */
+	if (valid < nb_pkts)
+		rte_ring_enqueue_bulk(internals->buf_ring, &indexes[valid],
+				      nb_pkts - valid, NULL);
+
+	internals->err_pkts += (nb_pkts - valid);
+	internals->tx_pkts += valid;
+	internals->tx_bytes += tx_bytes;
+
+	return nb_pkts;
+}
+
+/*
+ * Post one free umem frame for every free slot of the RX descriptor ring.
+ * Stops early when buf_ring has no free frame left, instead of posting a
+ * stale index again.
+ */
+static void
+fill_rx_desc(struct pmd_internals *internals)
+{
+	int num_free = internals->rx.num_free;
+	void *p = NULL;
+	int i;
+
+	for (i = 0; i < num_free; i++) {
+		struct xdp_desc desc = {};
+
+		/* rte_ring_dequeue() returns non-zero when the ring is empty */
+		if (rte_ring_dequeue(internals->buf_ring, &p) != 0)
+			break;
+		desc.idx = (uint32_t)((long int)p);
+		xq_enq(&internals->rx, &desc, 1);
+	}
+}
+
+/* Start the port: report the link as up and pre-fill the RX ring. */
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	struct rte_eth_dev_data *data = dev->data;
+
+	data->dev_link.link_status = ETH_LINK_UP;
+	fill_rx_desc(data->dev_private);
+
+	return 0;
+}
+
+/*
+ * This function gets called when the current port gets stopped.
+ * It only reports the link as down; the socket and rings are left untouched.
+ */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+	dev->data->dev_link.link_status = ETH_LINK_DOWN;
+}
+
+/* Device configure callback: nothing is configured here, always returns 0. */
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+	return 0;
+}
+
+/*
+ * Report the static capabilities of the port: one RX queue, one TX queue
+ * and one MAC address, on top of the backing interface index.
+ */
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+	const struct pmd_internals *priv = dev->data->dev_private;
+
+	dev_info->if_index = priv->if_index;
+
+	dev_info->max_mac_addrs = 1;
+	dev_info->max_rx_queues = 1;
+	dev_info->max_tx_queues = 1;
+
+	dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+	dev_info->min_rx_bufsize = 0;
+}
+
+/*
+ * Copy the software counters into 'stats'. The port has a single queue, so
+ * the per-queue slot 0 mirrors the port totals. Always returns 0.
+ */
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	const struct pmd_internals *priv = dev->data->dev_private;
+
+	/* receive side */
+	stats->q_ipackets[0] = priv->rx_pkts;
+	stats->ipackets = stats->q_ipackets[0];
+	stats->q_ibytes[0] = priv->rx_bytes;
+	stats->ibytes = stats->q_ibytes[0];
+	stats->imissed = priv->rx_dropped;
+
+	/* transmit side */
+	stats->q_opackets[0] = priv->tx_pkts;
+	stats->opackets = stats->q_opackets[0];
+	stats->q_errors[0] = priv->err_pkts;
+	stats->oerrors = stats->q_errors[0];
+	stats->q_obytes[0] = priv->tx_bytes;
+	stats->obytes = stats->q_obytes[0];
+
+	return 0;
+}
+
+/* Clear all RX and TX software counters of the port. */
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+
+	/* receive side */
+	priv->rx_pkts = 0;
+	priv->rx_bytes = 0;
+	priv->rx_dropped = 0;
+
+	/* transmit side */
+	priv->tx_pkts = 0;
+	priv->tx_bytes = 0;
+	priv->err_pkts = 0;
+}
+
+/*
+ * Close callback: intentionally empty. The port resources are released in
+ * rte_pmd_af_xdp_remove().
+ */
+static void
+eth_dev_close(struct rte_eth_dev *dev __rte_unused)
+{
+}
+
+/*
+ * Queue release callback shared by RX and TX: intentionally empty, since
+ * the queue handle is the port private data itself.
+ */
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+/*
+ * Link update callback: nothing is queried here, the link status is only
+ * changed by eth_dev_start()/eth_dev_stop(). Always returns 0.
+ */
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+		int wait_to_complete __rte_unused)
+{
+	return 0;
+}
+
+/*
+ * Allocate a page-aligned area of 'nbuffers' frames and register it on
+ * socket 'sfd' with XDP_MEM_REG.
+ *
+ * Returns a heap-allocated descriptor of the area, or NULL when an
+ * allocation or the registration fails (nothing is leaked in that case).
+ * Both the descriptor and its buffer come from the C allocator, so they
+ * must be released with free(), not rte_free().
+ */
+static struct xdp_umem *xsk_alloc_and_mem_reg_buffers(int sfd, size_t nbuffers)
+{
+	struct xdp_mr_req req = { .frame_size = ETH_AF_XDP_FRAME_SIZE,
+				  .data_headroom = ETH_AF_XDP_DATA_HEADROOM };
+	struct xdp_umem *umem;
+	void *bufs;
+	int ret;
+
+	ret = posix_memalign((void **)&bufs, getpagesize(),
+			     nbuffers * req.frame_size);
+	if (ret)
+		return NULL;
+
+	umem = calloc(1, sizeof(*umem));
+	if (!umem)
+		goto err_bufs;
+
+	req.addr = (unsigned long)bufs;
+	req.len = nbuffers * req.frame_size;
+	/*
+	 * RTE_ASSERT() is compiled out unless asserts are enabled, so the
+	 * result has to be checked explicitly.
+	 */
+	ret = setsockopt(sfd, SOL_XDP, XDP_MEM_REG, &req, sizeof(req));
+	if (ret != 0)
+		goto err_umem;
+
+	umem->frame_size = ETH_AF_XDP_FRAME_SIZE;
+	/* log2(ETH_AF_XDP_FRAME_SIZE); keep in sync with that constant */
+	umem->frame_size_log2 = 11;
+	umem->buffer = bufs;
+	umem->size = nbuffers * req.frame_size;
+	umem->nframes = nbuffers;
+	umem->mr_fd = sfd;
+
+	return umem;
+
+err_umem:
+	free(umem);
+err_bufs:
+	free(bufs);
+	return NULL;
+}
+
+/*
+ * Set up everything the datapath needs on the AF_XDP socket:
+ *  - the FIFO of free frame indexes (buf_ring),
+ *  - the frame memory registered with the socket (umem),
+ *  - the RX and TX descriptor rings, mapped into this process,
+ *  - the binding of the socket to the interface queue.
+ *
+ * Returns 0 on success and -1 on failure. On failure everything created
+ * here is released again and the pointers in 'internals' are reset.
+ */
+static int
+xdp_configure(struct pmd_internals *internals)
+{
+	struct sockaddr_xdp sxdp = { .sxdp_family = PF_XDP };
+	struct xdp_ring_req req = {};
+	char ring_name[0x100];
+	size_t ring_bytes;
+	long int i;
+
+	/* ring_mask below is desc_nr - 1: only valid for powers of two */
+	if (internals->ring_size <= 0 ||
+	    (internals->ring_size & (internals->ring_size - 1)) != 0)
+		return -1;
+
+	snprintf(ring_name, 0x100, "%s_%s_%d", "af_xdp_ring",
+		 internals->if_name, internals->queue_idx);
+	internals->buf_ring = rte_ring_create(ring_name,
+					      ETH_AF_XDP_NUM_BUFFERS,
+					      SOCKET_ID_ANY,
+					      0x0);
+	if (!internals->buf_ring)
+		return -1;
+
+	/*
+	 * NOTE(review): without RING_F_EXACT_SZ an rte_ring holds count - 1
+	 * entries, so the last index is presumably never stored - confirm.
+	 */
+	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+		rte_ring_enqueue(internals->buf_ring, (void *)i);
+
+	internals->umem = xsk_alloc_and_mem_reg_buffers(internals->sfd,
+							ETH_AF_XDP_NUM_BUFFERS);
+	if (!internals->umem)
+		goto err_ring;
+
+	req.mr_fd = internals->umem->mr_fd;
+	req.desc_nr = internals->ring_size;
+
+	/*
+	 * RTE_ASSERT() is compiled out unless asserts are enabled, so every
+	 * system call result is checked explicitly.
+	 */
+	if (setsockopt(internals->sfd, SOL_XDP, XDP_RX_RING,
+		       &req, sizeof(req)) != 0)
+		goto err_umem;
+
+	if (setsockopt(internals->sfd, SOL_XDP, XDP_TX_RING,
+		       &req, sizeof(req)) != 0)
+		goto err_umem;
+
+	ring_bytes = req.desc_nr * sizeof(struct xdp_desc);
+
+	internals->rx.ring = mmap(0, ring_bytes,
+				  PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+				  internals->sfd,
+				  XDP_PGOFF_RX_RING);
+	if (internals->rx.ring == MAP_FAILED) {
+		internals->rx.ring = NULL;
+		goto err_umem;
+	}
+
+	internals->rx.num_free = req.desc_nr;
+	internals->rx.ring_mask = req.desc_nr - 1;
+
+	internals->tx.ring = mmap(0, ring_bytes,
+				  PROT_READ | PROT_WRITE,
+				  MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+				  internals->sfd,
+				  XDP_PGOFF_TX_RING);
+	if (internals->tx.ring == MAP_FAILED) {
+		internals->tx.ring = NULL;
+		goto err_rx_map;
+	}
+
+	internals->tx.num_free = req.desc_nr;
+	internals->tx.ring_mask = req.desc_nr - 1;
+
+	sxdp.sxdp_ifindex = internals->if_index;
+	sxdp.sxdp_queue_id = internals->queue_idx;
+
+	if (bind(internals->sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0)
+		goto err_tx_map;
+
+	return 0;
+
+err_tx_map:
+	munmap(internals->tx.ring, ring_bytes);
+	internals->tx.ring = NULL;
+err_rx_map:
+	munmap(internals->rx.ring, ring_bytes);
+	internals->rx.ring = NULL;
+err_umem:
+	/* the umem and its buffer come from calloc()/posix_memalign() */
+	free(internals->umem->buffer);
+	free(internals->umem);
+	internals->umem = NULL;
+err_ring:
+	rte_ring_free(internals->buf_ring);
+	internals->buf_ring = NULL;
+	return -1;
+}
+
+/*
+ * RX queue setup callback (queue 0 only).
+ *
+ * Remembers the mempool, configures the AF_XDP socket and checks that one
+ * umem frame fits into the data room of an mbuf from that pool.
+ *
+ * Returns 0 on success, -EINVAL when the socket cannot be configured and
+ * -ENOMEM when the mbufs are too small.
+ */
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internals *internals = dev->data->dev_private;
+	unsigned int buf_size, data_size;
+
+	RTE_ASSERT(rx_queue_id == 0);
+	internals->mb_pool = mb_pool;
+
+	/* umem is dereferenced below, so a failed setup must stop here */
+	if (xdp_configure(internals) != 0 || internals->umem == NULL) {
+		RTE_LOG(ERR, PMD,
+			"%s: failed to configure AF_XDP socket\n",
+			dev->device->name);
+		return -EINVAL;
+	}
+
+	/* Now get the space available for data in the mbuf */
+	buf_size = rte_pktmbuf_data_room_size(internals->mb_pool) -
+		RTE_PKTMBUF_HEADROOM;
+	data_size = internals->umem->frame_size;
+
+	if (data_size > buf_size) {
+		RTE_LOG(ERR, PMD,
+			"%s: %d bytes will not fit in mbuf (%d bytes)\n",
+			dev->device->name, data_size, buf_size);
+		return -ENOMEM;
+	}
+
+	dev->data->rx_queues[rx_queue_id] = internals;
+	return 0;
+}
+
+/*
+ * TX queue setup callback (queue 0 only): the port private data doubles as
+ * the TX queue handle. Always returns 0.
+ */
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+		   uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc __rte_unused,
+		   unsigned int socket_id __rte_unused,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct rte_eth_dev_data *data = dev->data;
+
+	RTE_ASSERT(tx_queue_id == 0);
+	data->tx_queues[tx_queue_id] = data->dev_private;
+
+	return 0;
+}
+
+/*
+ * Set the MTU of the backing kernel interface with SIOCSIFMTU.
+ * Returns 0 on success and -EINVAL when the helper socket cannot be opened
+ * or the ioctl fails.
+ */
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	struct ifreq ifr = { .ifr_mtu = mtu };
+	int fd, rc;
+
+	fd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (fd < 0)
+		return -EINVAL;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", priv->if_name);
+	rc = ioctl(fd, SIOCSIFMTU, &ifr);
+	close(fd);
+
+	return rc < 0 ? -EINVAL : 0;
+}
+
+/*
+ * Read the flags of interface 'if_name', keep only the bits set in 'mask',
+ * set the bits in 'flags' and write the result back. Failures are ignored.
+ */
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+	struct ifreq ifr;
+	int fd = socket(PF_INET, SOCK_DGRAM, 0);
+
+	if (fd < 0)
+		return;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(fd, SIOCGIFFLAGS, &ifr) >= 0) {
+		ifr.ifr_flags &= mask;
+		ifr.ifr_flags |= flags;
+		/* best effort: there is nothing to undo if this fails */
+		ioctl(fd, SIOCSIFFLAGS, &ifr);
+	}
+
+	close(fd);
+}
+
+/* Set IFF_PROMISC on the backing interface, leaving other flags as is. */
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	eth_dev_change_flags(ifname, IFF_PROMISC, ~0);
+}
+
+/* Clear IFF_PROMISC on the backing interface, leaving other flags as is. */
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+	struct pmd_internals *priv = dev->data->dev_private;
+	char *ifname = priv->if_name;
+
+	eth_dev_change_flags(ifname, 0, ~IFF_PROMISC);
+}
+
+/*
+ * ethdev callbacks implemented by this driver. RX and TX queues share one
+ * release callback.
+ */
+static const struct eth_dev_ops ops = {
+	.dev_start = eth_dev_start,
+	.dev_stop = eth_dev_stop,
+	.dev_close = eth_dev_close,
+	.dev_configure = eth_dev_configure,
+	.dev_infos_get = eth_dev_info,
+	.mtu_set = eth_dev_mtu_set,
+	.promiscuous_enable = eth_dev_promiscuous_enable,
+	.promiscuous_disable = eth_dev_promiscuous_disable,
+	.rx_queue_setup = eth_rx_queue_setup,
+	.tx_queue_setup = eth_tx_queue_setup,
+	.rx_queue_release = eth_queue_release,
+	.tx_queue_release = eth_queue_release,
+	.link_update = eth_link_update,
+	.stats_get = eth_stats_get,
+	.stats_reset = eth_stats_reset,
+};
+
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+/*
+ * Walk the parsed devargs and store the recognised values.
+ *
+ * Outputs are only written for keys that are present, so callers must
+ * preset them with defaults. *if_name points into 'kvlist' and is only
+ * valid until the list is freed. Numeric values are converted with atoi(),
+ * so malformed numbers silently become 0.
+ */
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+		 char **if_name,
+		 int *queue_idx,
+		 int *ring_size)
+{
+	struct rte_kvargs_pair *pair = NULL;
+	unsigned int k_idx;
+
+	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
+		pair = &kvlist->pairs[k_idx];
+		/* match the whole key, not just a substring of it */
+		if (strcmp(pair->key, ETH_AF_XDP_IFACE_ARG) == 0)
+			*if_name = pair->value;
+		else if (strcmp(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG) == 0)
+			*queue_idx = atoi(pair->value);
+		else if (strcmp(pair->key, ETH_AF_XDP_RING_SIZE_ARG) == 0)
+			*ring_size = atoi(pair->value);
+	}
+}
+
+/*
+ * Look up the interface index and hardware address of 'if_name'.
+ *
+ * Returns 0 on success. Returns -1 when the name is NULL or does not fit
+ * into an ifreq, when the helper socket cannot be opened, or when one of
+ * the ioctls fails; the outputs are then not guaranteed to be set.
+ */
+static int
+get_iface_info(const char *if_name,
+	       struct ether_addr *eth_addr,
+	       int *if_index)
+{
+	struct ifreq ifr;
+	int sock;
+
+	/* ifr_name holds IFNAMSIZ bytes including the terminator */
+	if (!if_name || strlen(if_name) >= IFNAMSIZ)
+		return -1;
+
+	sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (sock < 0)
+		return -1;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+	if (ioctl(sock, SIOCGIFINDEX, &ifr))
+		goto error;
+	*if_index = ifr.ifr_ifindex;
+
+	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+		goto error;
+
+	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
+
+	close(sock);
+	return 0;
+
+error:
+	close(sock);
+	return -1;
+}
+
+/*
+ * Create the ethdev port for vdev 'dev' on top of kernel interface
+ * 'if_name': allocate the private data, open the AF_XDP socket, read the
+ * interface index and MAC address and register the port with its
+ * callbacks.
+ *
+ * Returns 0 on success and -1 on failure; on failure everything acquired
+ * up to that point is released.
+ */
+static int
+init_internals(struct rte_vdev_device *dev,
+	       const char *if_name,
+	       int queue_idx,
+	       int ring_size)
+{
+	const char *name = rte_vdev_device_name(dev);
+	struct rte_eth_dev *eth_dev = NULL;
+	struct rte_eth_dev_data *data = NULL;
+	const unsigned int numa_node = dev->device.numa_node;
+	struct pmd_internals *internals = NULL;
+	int ret;
+
+	/* the name is copied into a fixed IFNAMSIZ buffer below */
+	if (!if_name || strlen(if_name) >= IFNAMSIZ)
+		return -1;
+
+	/*
+	 * Must be sizeof(*data): the whole ethdev data structure is copied
+	 * into this allocation further down.
+	 */
+	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+	if (!data)
+		return -1;
+
+	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+	if (!internals)
+		goto error_1;
+
+	internals->queue_idx = queue_idx;
+	internals->ring_size = ring_size;
+	snprintf(internals->if_name, sizeof(internals->if_name), "%s",
+		 if_name);
+	internals->sfd = socket(PF_XDP, SOCK_RAW, 0);
+	if (internals->sfd < 0)
+		goto error_2;
+
+	ret = get_iface_info(if_name, &internals->eth_addr,
+			     &internals->if_index);
+	if (ret)
+		goto error_3;
+
+	eth_dev = rte_eth_vdev_allocate(dev, 0);
+	if (!eth_dev)
+		goto error_3;
+
+	/* replace the shared ethdev data by a private, NUMA-local copy */
+	rte_memcpy(data, eth_dev->data, sizeof(*data));
+	internals->port_id = eth_dev->data->port_id;
+	data->dev_private = internals;
+	data->nb_rx_queues = 1;
+	data->nb_tx_queues = 1;
+	data->dev_link = pmd_link;
+	data->mac_addrs = &internals->eth_addr;
+
+	eth_dev->data = data;
+	eth_dev->dev_ops = &ops;
+
+	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+	eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+	return 0;
+
+error_3:
+	close(internals->sfd);
+
+error_2:
+	rte_free(internals);
+
+error_1:
+	rte_free(data);
+	return -1;
+}
+
+/*
+ * vdev probe callback: parse the devargs and create the port.
+ *
+ * The "iface" argument is mandatory; "queue" and "ringsz" fall back to
+ * ETH_AF_XDP_DFLT_QUEUE_IDX and ETH_AF_XDP_DFLT_RING_SIZE.
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+	struct rte_kvargs *kvlist;
+	char *if_name = NULL;
+	int ring_size = ETH_AF_XDP_DFLT_RING_SIZE;
+	int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+	int ret = -1;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_af_xdp for %s\n",
+		rte_vdev_device_name(dev));
+
+	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+	if (!kvlist) {
+		RTE_LOG(ERR, PMD,
+			"Invalid kvargs\n");
+		return -1;
+	}
+
+	if (dev->device.numa_node == SOCKET_ID_ANY)
+		dev->device.numa_node = rte_socket_id();
+
+	parse_parameters(kvlist, &if_name, &queue_idx, &ring_size);
+
+	/* without an interface name there is nothing to bind to */
+	if (!if_name)
+		RTE_LOG(ERR, PMD,
+			"Missing mandatory argument: "
+			ETH_AF_XDP_IFACE_ARG "\n");
+	else
+		ret = init_internals(dev, if_name, queue_idx, ring_size);
+
+	/* if_name points into kvlist, so free the list last */
+	rte_kvargs_free(kvlist);
+
+	return ret;
+}
+
+/*
+ * vdev remove callback: release everything owned by the port.
+ *
+ * Returns 0 on success and -1 when 'dev' is NULL or no port exists for it.
+ *
+ * NOTE(review): the RX/TX ring mappings created in xdp_configure() are not
+ * unmapped here.
+ */
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internals *internals;
+
+	RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+		rte_socket_id());
+
+	if (!dev)
+		return -1;
+
+	/* find the ethdev entry */
+	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+	if (!eth_dev)
+		return -1;
+
+	internals = eth_dev->data->dev_private;
+
+	/* close the socket while 'internals' is still valid */
+	close(internals->sfd);
+	rte_ring_free(internals->buf_ring);
+
+	/*
+	 * The umem and its buffer were obtained with calloc() and
+	 * posix_memalign(), so they are released with free(). The umem is
+	 * NULL when the RX queue was never set up.
+	 */
+	if (internals->umem) {
+		free(internals->umem->buffer);
+		free(internals->umem);
+	}
+
+	rte_free(internals);
+	rte_free(eth_dev->data);
+
+	rte_eth_dev_release_port(eth_dev);
+
+	return 0;
+}
+
+/* vdev driver entry points, registered below under the name net_af_xdp. */
+static struct rte_vdev_driver pmd_af_xdp_drv = {
+	.probe = rte_pmd_af_xdp_probe,
+	.remove = rte_pmd_af_xdp_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+			      "iface=<string> "
+			      "queue=<int> "
+			      "ringsz=<int> ");
diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
new file mode 100644
index 000000000..ef3539840
--- /dev/null
+++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+	local: *;
+};
diff --git a/drivers/net/af_xdp/xdpsock_queue.h b/drivers/net/af_xdp/xdpsock_queue.h
new file mode 100644
index 000000000..c5d0cb56a
--- /dev/null
+++ b/drivers/net/af_xdp/xdpsock_queue.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#ifndef __XDPSOCK_QUEUE_H
+#define __XDPSOCK_QUEUE_H
+
+/*
+ * Post 'ndescs' descriptors on ring 'q', starting at q->avail_idx.
+ *
+ * Returns 0 on success, or -ENOSPC (posting nothing) when the ring has
+ * fewer than 'ndescs' free slots.
+ */
+static inline int xq_enq(struct xdp_queue *q,
+			 const struct xdp_desc *descs,
+			 unsigned int ndescs)
+{
+	unsigned int avail_idx = q->avail_idx;
+	unsigned int i;
+	int j;
+
+	if (q->num_free < ndescs)
+		return -ENOSPC;
+
+	q->num_free -= ndescs;
+
+	/* first pass: fill in everything except the flags */
+	for (i = 0; i < ndescs; i++) {
+		unsigned int idx = avail_idx++ & q->ring_mask;
+
+		q->ring[idx].idx	= descs[i].idx;
+		q->ring[idx].len	= descs[i].len;
+		q->ring[idx].offset	= descs[i].offset;
+		q->ring[idx].error	= 0;
+	}
+	/* make the descriptor contents visible before the flags are set */
+	rte_smp_wmb();
+
+	/*
+	 * Second pass: set XDP_DESC_KERNEL on each slot, from the last slot
+	 * back to the first (presumably so that the first descriptor of the
+	 * batch is handed over last - confirm against the kernel side).
+	 */
+	for (j = ndescs - 1; j >= 0; j--) {
+		unsigned int idx = (q->avail_idx + j) & q->ring_mask;
+
+		q->ring[idx].flags = descs[j].flags | XDP_DESC_KERNEL;
+	}
+	q->avail_idx += ndescs;
+
+	return 0;
+}
+
+/*
+ * Collect up to 'ndescs' descriptors from ring 'q' that no longer carry
+ * XDP_DESC_KERNEL, starting at q->last_used_idx.
+ *
+ * The descriptors are copied into 'descs', their slots are added back to
+ * q->num_free and q->last_used_idx is advanced past them.
+ * Returns the number of descriptors copied (possibly 0).
+ */
+static inline int xq_deq(struct xdp_queue *q,
+			 struct xdp_desc *descs,
+			 int ndescs)
+{
+	unsigned int idx, last_used_idx = q->last_used_idx;
+	int i, entries = 0;
+
+	/* count the leading slots that do not have XDP_DESC_KERNEL set */
+	for (i = 0; i < ndescs; i++) {
+		idx = (last_used_idx++) & q->ring_mask;
+		if (q->ring[idx].flags & XDP_DESC_KERNEL)
+			break;
+		entries++;
+	}
+	q->num_free += entries;
+
+	/* read the descriptor contents only after the flags were checked */
+	rte_smp_rmb();
+
+	for (i = 0; i < entries; i++) {
+		idx = q->last_used_idx++ & q->ring_mask;
+		descs[i] = q->ring[idx];
+	}
+
+	return entries;
+}
+
+#endif /* __XDPSOCK_QUEUE_H */
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3eb41d176..bc26e1457 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -120,6 +120,7 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
 _LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_STACK)  += -lrte_mempool_stack
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
 _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf
 _LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD)        += -lrte_pmd_avp
-- 
2.13.6

  reply	other threads:[~2018-03-08 13:52 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-08 13:52 [RFC v2 0/7] PMD driver for AF_XDP Qi Zhang
2018-03-08 13:52 ` Qi Zhang [this message]
2018-03-08 13:52 ` [RFC v2 2/7] lib/mbuf: enable parse flags when create mempool Qi Zhang
2018-03-08 13:52 ` [RFC v2 3/7] lib/mempool: allow page size aligned mempool Qi Zhang
2018-03-08 13:52 ` [RFC v2 4/7] net/af_xdp: use mbuf mempool for buffer management Qi Zhang
2018-03-08 13:52 ` [RFC v2 5/7] net/af_xdp: enable share mempool Qi Zhang
2018-03-08 13:52 ` [RFC v2 6/7] net/af_xdp: load BPF file Qi Zhang
2018-03-08 14:20   ` Zhang, Qi Z
2018-03-08 23:15   ` Stephen Hemminger
2018-05-09  7:02     ` Björn Töpel
2018-03-08 13:52 ` [RFC v2 7/7] app/testpmd: enable parameter for mempool flags Qi Zhang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180308135249.28187-2-qi.z.zhang@intel.com \
    --to=qi.z.zhang@intel.com \
    --cc=bjorn.topel@intel.com \
    --cc=dev@dpdk.org \
    --cc=magnus.karlsson@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.