All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Wiles, Keith" <keith.wiles@intel.com>
To: Ophir Munk <ophirmu@mellanox.com>
Cc: DPDK <dev@dpdk.org>, Pascal Mazon <pascal.mazon@6wind.com>,
	"Thomas Monjalon" <thomas@monjalon.net>,
	Olga Shern <olgas@mellanox.com>
Subject: Re: [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload)
Date: Wed, 13 Jun 2018 16:04:22 +0000	[thread overview]
Message-ID: <37D23262-6A5D-4931-A874-1733643C7F95@intel.com> (raw)
In-Reply-To: <1528821108-12405-3-git-send-email-ophirmu@mellanox.com>



> On Jun 12, 2018, at 11:31 AM, Ophir Munk <ophirmu@mellanox.com> wrote:
> 
> This commit implements TCP segmentation offload in TAP.
> librte_gso library is used to segment large TCP payloads (e.g. packets
> of 64K bytes size) into smaller MTU size buffers.
> By supporting TSO offload capability in software a TAP device can be used
> as a failsafe sub device and be paired with another PCI device which
> supports TSO capability in HW.
> 
> For more details on librte_gso implementation please refer to dpdk
> documentation.
> The number of newly generated TCP TSO segments is limited to 64.
> 
> Reviewed-by: Raslan Darawsheh <rasland@mellanox.com>
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> ---
> drivers/net/tap/Makefile      |   2 +-
> drivers/net/tap/rte_eth_tap.c | 159 +++++++++++++++++++++++++++++++++++-------
> drivers/net/tap/rte_eth_tap.h |   3 +
> mk/rte.app.mk                 |   4 +-
> 4 files changed, 138 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
> index ccc5c5f..3243365 100644
> --- a/drivers/net/tap/Makefile
> +++ b/drivers/net/tap/Makefile
> @@ -24,7 +24,7 @@ CFLAGS += -I.
> CFLAGS += $(WERROR_FLAGS)
> LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
> LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
> -LDLIBS += -lrte_bus_vdev
> +LDLIBS += -lrte_bus_vdev -lrte_gso
> 
> CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
> 
> diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
> index c19f053..62b931f 100644
> --- a/drivers/net/tap/rte_eth_tap.c
> +++ b/drivers/net/tap/rte_eth_tap.c
> @@ -17,6 +17,7 @@
> #include <rte_ip.h>
> #include <rte_string_fns.h>
> 
> +#include <assert.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/socket.h>
> @@ -55,6 +56,9 @@
> #define ETH_TAP_CMP_MAC_FMT     "0123456789ABCDEFabcdef"
> #define ETH_TAP_MAC_ARG_FMT     ETH_TAP_MAC_FIXED "|" ETH_TAP_USR_MAC_FMT
> 
> +#define TAP_GSO_MBUFS_NUM	128
> +#define TAP_GSO_MBUF_SEG_SIZE	128
> +
> static struct rte_vdev_driver pmd_tap_drv;
> static struct rte_vdev_driver pmd_tun_drv;
> 
> @@ -412,7 +416,8 @@ tap_tx_offload_get_queue_capa(void)
> 	return DEV_TX_OFFLOAD_MULTI_SEGS |
> 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
> 	       DEV_TX_OFFLOAD_UDP_CKSUM |
> -	       DEV_TX_OFFLOAD_TCP_CKSUM;
> +	       DEV_TX_OFFLOAD_TCP_CKSUM |
> +	       DEV_TX_OFFLOAD_TCP_TSO;
> }
> 
> /* Finalize l4 checksum calculation */
> @@ -479,23 +484,15 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
> 	}
> }
> 
> -/* Callback to handle sending packets from the tap interface
> - */
> -static uint16_t
> -pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +static inline void
> +tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
> +			struct rte_mbuf **pmbufs, uint16_t l234_hlen,
> +			uint16_t *num_packets, unsigned long *num_tx_bytes)
> {
> -	struct tx_queue *txq = queue;
> -	uint16_t num_tx = 0;
> -	unsigned long num_tx_bytes = 0;
> -	uint32_t max_size;
> 	int i;
> 
> -	if (unlikely(nb_pkts == 0))
> -		return 0;
> -
> -	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> -	for (i = 0; i < nb_pkts; i++) {
> -		struct rte_mbuf *mbuf = bufs[num_tx];
> +	for (i = 0; i < num_mbufs; i++) {
> +		struct rte_mbuf *mbuf = pmbufs[i];
> 		struct iovec iovecs[mbuf->nb_segs + 2];
> 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
> 		struct rte_mbuf *seg = mbuf;
> @@ -503,8 +500,7 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> 		int proto;
> 		int n;
> 		int j;
> -		int k; /* first index in iovecs for copying segments */
> -		uint16_t l234_hlen; /* length of layers 2,3,4 headers */
> +		int k; /* current index in iovecs for copying segments */
> 		uint16_t seg_len; /* length of first segment */
> 		uint16_t nb_segs;
> 		uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
> @@ -512,10 +508,6 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> 		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
> 		uint16_t is_cksum = 0; /* in case cksum should be offloaded */
> 
> -		/* stats.errs will be incremented */
> -		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
> -			break;
> -
> 		l4_cksum = NULL;
> 		if (txq->type == ETH_TUNTAP_TYPE_TUN) {
> 			/*
> @@ -554,9 +546,8 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> 			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
> 			if (seg_len < l234_hlen)
> 				break;
> -
> -			/* To change checksums, work on a
> -			 * copy of l2, l3 l4 headers.

Adding back in the blank line above would be nice for readability.
> +			/* To change checksums, work on a * copy of l2, l3
> +			 * headers + l4 pseudo header
> 			 */
> 			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
> 					l234_hlen);
> @@ -598,13 +589,80 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> 		n = writev(txq->fd, iovecs, j);
> 		if (n <= 0)
> 			break;
> +		(*num_packets)++;
> +		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
> +	}
> +}
> +
> +/* Callback to handle sending packets from the tap interface
> + */
> +static uint16_t
> +pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
> +{
> +	struct tx_queue *txq = queue;
> +	uint16_t num_tx = 0;
> +	uint16_t num_packets = 0;
> +	unsigned long num_tx_bytes = 0;
> +	uint16_t tso_segsz = 0;
> +	uint16_t hdrs_len;
> +	uint32_t max_size;
> +	int i;
> +	uint64_t tso;
> +	int ret;
> 
> +	if (unlikely(nb_pkts == 0))
> +		return 0;
> +
> +	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> +	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
> +	for (i = 0; i < nb_pkts; i++) {
> +		struct rte_mbuf *mbuf_in = bufs[num_tx];
> +		struct rte_mbuf **mbuf;
> +		uint16_t num_mbufs;
> +
> +		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
> +		if (tso) {
> +			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;

Missing blank line here, this one is not optional.
> +			assert(gso_ctx != NULL);

Blank line would be nice.
> +			/* TCP segmentation implies TCP checksum offload */
> +			mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM;

Blank line would be nice.
> +			/* gso size is calculated without ETHER_CRC_LEN */
> +			hdrs_len = mbuf_in->l2_len + mbuf_in->l3_len +
> +					mbuf_in->l4_len;
> +			tso_segsz = mbuf_in->tso_segsz + hdrs_len;
> +			if (unlikely(tso_segsz == hdrs_len) ||
> +				tso_segsz > *txq->mtu) {
> +				txq->stats.errs++;
> +				break;
> +			}
> +			gso_ctx->gso_size = tso_segsz;
> +			ret = rte_gso_segment(mbuf_in, /* packet to segment */
> +				gso_ctx, /* gso control block */
> +				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
> +				RTE_DIM(gso_mbufs)); /* max tso mbufs */
> +
> +			/* ret contains the number of new created mbufs */
> +			if (ret < 0)
> +				break;
> +
> +			mbuf = gso_mbufs;
> +			num_mbufs = ret;
> +		} else {
> +			/* stats.errs will be incremented */
> +			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
> +				break;
> +
> +			mbuf = &mbuf_in;
> +			num_mbufs = 1;
> +		}
> +
> +		tap_write_mbufs(txq, num_mbufs, mbuf, hdrs_len,
> +				&num_packets, &num_tx_bytes);
> 		num_tx++;
> -		num_tx_bytes += mbuf->pkt_len;
> -		rte_pktmbuf_free(mbuf);
> +		rte_pktmbuf_free(mbuf_in);
> 	}
> 
> -	txq->stats.opackets += num_tx;
> +	txq->stats.opackets += num_packets;
> 	txq->stats.errs += nb_pkts - num_tx;
> 	txq->stats.obytes += num_tx_bytes;
> 
> @@ -1066,31 +1124,73 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
> }
> 
> static int
> +tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
> +{
> +	uint32_t gso_types;
> +	char pool_name[64];
> +
> +	/*
> +	 * Create private mbuf pool with TAP_GSO_MBUF_SEG_SIZE bytes
> +	 * size per mbuf use this pool for both direct and indirect mbufs
> +	 */
> +
> +	struct rte_mempool *mp;      /* Mempool for GSO packets */
> +	/* initialize GSO context */
> +	gso_types = DEV_TX_OFFLOAD_TCP_TSO;
> +	snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
> +	mp = rte_mempool_lookup((const char *)pool_name);
> +	if (!mp) {
> +		mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
> +			0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
> +			SOCKET_ID_ANY);

You have set up the mempool with no cache size, which means you have to take a lock for each allocation. This could be changed to have a small cache per lcore, say 8, but the total number of mbufs needs to be large enough to not allow starvation for an lcore.  total_mbufs = (max_num_ports * cache_size) + some_extra mbufs;

> +		if (!mp) {
> +			struct pmd_internals *pmd = dev->data->dev_private;
> +			RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
> +				pmd->name, dev->device->name);
> +			return -1;
> +		}
> +	}
> +
> +	gso_ctx->direct_pool = mp;
> +	gso_ctx->indirect_pool = mp;
> +	gso_ctx->gso_types = gso_types;
> +	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
> +	gso_ctx->flag = 0;
> +
> +	return 0;
> +}
> +
> +static int
> tap_setup_queue(struct rte_eth_dev *dev,
> 		struct pmd_internals *internals,
> 		uint16_t qid,
> 		int is_rx)
> {
> +	int ret;
> 	int *fd;
> 	int *other_fd;
> 	const char *dir;
> 	struct pmd_internals *pmd = dev->data->dev_private;
> 	struct rx_queue *rx = &internals->rxq[qid];
> 	struct tx_queue *tx = &internals->txq[qid];
> +	struct rte_gso_ctx *gso_ctx;
> 
> 	if (is_rx) {
> 		fd = &rx->fd;
> 		other_fd = &tx->fd;
> 		dir = "rx";
> +		gso_ctx = NULL;
> 	} else {
> 		fd = &tx->fd;
> 		other_fd = &rx->fd;
> 		dir = "tx";
> +		gso_ctx = &tx->gso_ctx;
> 	}
> 	if (*fd != -1) {
> 		/* fd for this queue already exists */
> 		TAP_LOG(DEBUG, "%s: fd %d for %s queue qid %d exists",
> 			pmd->name, *fd, dir, qid);
> +		gso_ctx = NULL;
> 	} else if (*other_fd != -1) {
> 		/* Only other_fd exists. dup it */
> 		*fd = dup(*other_fd);
> @@ -1115,6 +1215,11 @@ tap_setup_queue(struct rte_eth_dev *dev,
> 
> 	tx->mtu = &dev->data->mtu;
> 	rx->rxmode = &dev->data->dev_conf.rxmode;
> +	if (gso_ctx) {
> +		ret = tap_gso_ctx_setup(gso_ctx, dev);
> +		if (ret)
> +			return -1;
> +	}
> 
> 	tx->type = pmd->type;
> 
> diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
> index 7b21d0d..44e2773 100644
> --- a/drivers/net/tap/rte_eth_tap.h
> +++ b/drivers/net/tap/rte_eth_tap.h
> @@ -15,6 +15,7 @@
> 
> #include <rte_ethdev_driver.h>
> #include <rte_ether.h>
> +#include <rte_gso.h>
> #include "tap_log.h"
> 
> #ifdef IFF_MULTI_QUEUE
> @@ -22,6 +23,7 @@
> #else
> #define RTE_PMD_TAP_MAX_QUEUES	1
> #endif
> +#define MAX_GSO_MBUFS 64
> 
> enum rte_tuntap_type {
> 	ETH_TUNTAP_TYPE_UNKNOWN,
> @@ -59,6 +61,7 @@ struct tx_queue {
> 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
> 	uint16_t csum:1;                /* Enable checksum offloading */
> 	struct pkt_stats stats;         /* Stats for this TX queue */
> +	struct rte_gso_ctx gso_ctx;     /* GSO context */
> };
> 
> struct pmd_internals {
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 1e32c83..e2ee879 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -38,8 +38,6 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
> _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
> _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
> _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> -_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
> _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
> # librte_acl needs --whole-archive because of weak functions
> @@ -61,6 +59,8 @@ endif
> _LDLIBS-y += --whole-archive
> 
> _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
> _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
> _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
> _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost
> -- 
> 2.7.4
> 

Regards,
Keith

  parent reply	other threads:[~2018-06-13 16:04 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-09 21:10 [RFC 0/2] TAP TSO Implementation Ophir Munk
2018-03-09 21:10 ` [RFC 1/2] net/tap: calculate checksum for multi segs packets Ophir Munk
2018-04-09 22:33   ` [PATCH v1 0/2] TAP TSO Ophir Munk
2018-04-09 22:33     ` [PATCH v1 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-04-09 22:33     ` [PATCH v1 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-04-22 11:30       ` [PATCH v2 0/2] TAP TSO Ophir Munk
2018-04-22 11:30         ` [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54           ` [PATCH v3 0/2] TAP TSO Ophir Munk
2018-05-07 21:54             ` [PATCH v3 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-05-07 21:54             ` [PATCH v3 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-05-31 13:52           ` [PATCH v2 1/2] net/tap: calculate checksums of multi segs packets Ferruh Yigit
2018-05-31 13:54             ` Ferruh Yigit
2018-04-22 11:30         ` [PATCH v2 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 16:31   ` [PATCH v4 0/2] TAP TSO Ophir Munk
2018-06-12 16:31     ` [PATCH v4 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-12 17:17       ` Wiles, Keith
2018-06-12 16:31     ` [PATCH v4 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-06-12 17:22       ` Wiles, Keith
2018-06-13 16:04       ` Wiles, Keith [this message]
2018-06-14  7:59         ` Ophir Munk
2018-06-14 12:58           ` Wiles, Keith
2018-06-23 23:17       ` [PATCH v5 0/2] TAP TSO Ophir Munk
2018-06-23 23:17         ` [PATCH v5 1/2] net/tap: calculate checksums of multi segs packets Ophir Munk
2018-06-24 13:45           ` Wiles, Keith
2018-06-27 13:11             ` Ferruh Yigit
2018-06-23 23:17         ` [PATCH v5 2/2] net/tap: support TSO (TCP Segment Offload) Ophir Munk
2018-03-09 21:10 ` [RFC 2/2] net/tap: implement TAP TSO Ophir Munk
2018-04-09 16:38 ` [RFC 0/2] TAP TSO Implementation Ferruh Yigit
2018-04-09 22:37   ` Ophir Munk
2018-04-10 14:30     ` Ferruh Yigit
2018-04-10 15:31       ` Ophir Munk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=37D23262-6A5D-4931-A874-1733643C7F95@intel.com \
    --to=keith.wiles@intel.com \
    --cc=dev@dpdk.org \
    --cc=olgas@mellanox.com \
    --cc=ophirmu@mellanox.com \
    --cc=pascal.mazon@6wind.com \
    --cc=thomas@monjalon.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.