All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next V2 0/3] XDP support for tap
@ 2017-08-11 11:41 Jason Wang
  2017-08-11 11:41 ` [PATCH net-next V2 1/3] tap: use build_skb() for small packet Jason Wang
                   ` (3 more replies)
  0 siblings, 4 replies; 22+ messages in thread
From: Jason Wang @ 2017-08-11 11:41 UTC (permalink / raw)
  To: davem, mst, netdev, linux-kernel; +Cc: kubakici, Jason Wang

Hi all:

This series tries to implement XDP support for tap. Two paths were
implemented:

- fast path: small & non-gso packets. For performance reasons we do it
  at page level and use build_skb() to create the skb if necessary.
- slow path: big or gso packets. We don't want to lose the capability
  compared to generic XDP, so we export some generic XDP helpers and
  do it after the skb has been created.

xdp1 shows about 41% improvement, xdp_redirect shows about 60%
improvement.

Changes from V1:
- fix the race between xdp set and free
- don't hold extra refcount
- add XDP_REDIRECT support

Please review.

Jason Wang (3):
  tap: use build_skb() for small packet
  net: export some generic xdp helpers
  tap: XDP support

 drivers/net/tun.c         | 247 ++++++++++++++++++++++++++++++++++++++++++----
 include/linux/netdevice.h |   2 +
 net/core/dev.c            |  14 +--
 3 files changed, 236 insertions(+), 27 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-11 11:41 [PATCH net-next V2 0/3] XDP support for tap Jason Wang
@ 2017-08-11 11:41 ` Jason Wang
  2017-08-16  3:45   ` Eric Dumazet
  2017-08-11 11:41 ` [PATCH net-next V2 2/3] net: export some generic xdp helpers Jason Wang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-11 11:41 UTC (permalink / raw)
  To: davem, mst, netdev, linux-kernel; +Cc: kubakici, Jason Wang

In the past we used tun_alloc_skb(), which calls sock_alloc_send_pskb(),
to allocate the skb. This socket based method is not suitable for
high-speed userspace like virtualization, which usually:

- ignores sk_sndbuf (INT_MAX) and expects to receive the packet as fast
  as possible
- doesn't want to be blocked at sendmsg()

To eliminate the above overheads, this patch tries to use build_skb()
for small packet. We will do this only when the following conditions
are all met:

- TAP instead of TUN
- sk_sndbuf is INT_MAX
- caller doesn't want to be blocked
- zerocopy is not used
- packet size is small enough to use build_skb()

Pktgen from guest to host shows ~11% improvement for rx pps of tap:

Before: ~1.70Mpps
After : ~1.88Mpps

What's more important, this makes it possible to implement XDP for tap
before creating skbs.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 112 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 91 insertions(+), 21 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d21510d..9736df4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -105,6 +105,8 @@ do {								\
 } while (0)
 #endif
 
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
 /* TUN device flags */
 
 /* IFF_ATTACH_QUEUE is never stored in device flags,
@@ -170,6 +172,7 @@ struct tun_file {
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
+	struct page_frag alloc_frag;
 };
 
 struct tun_flow_entry {
@@ -571,6 +574,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			skb_array_cleanup(&tfile->tx_array);
+		if (tfile->alloc_frag.page)
+			put_page(tfile->alloc_frag.page);
 		sock_put(&tfile->sk);
 	}
 }
@@ -1190,6 +1195,61 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
 	}
 }
 
+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+			      int len, int noblock, bool zerocopy)
+{
+	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		return false;
+
+	if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+		return false;
+
+	if (!noblock)
+		return false;
+
+	if (zerocopy)
+		return false;
+
+	if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+				     struct iov_iter *from,
+				     int len)
+{
+	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct sk_buff *skb;
+	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	char *buf;
+	size_t copied;
+
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + TUN_RX_PAD,
+				     len, from);
+	if (copied != len)
+		return ERR_PTR(-EFAULT);
+
+	skb = build_skb(buf, buflen);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	skb_reserve(skb, TUN_RX_PAD);
+	skb_put(skb, len);
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	return skb;
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, struct iov_iter *from,
@@ -1263,30 +1323,38 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (!zerocopy) {
-		copylen = len;
-		if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
-			linear = good_linear;
-		else
-			linear = tun16_to_cpu(tun, gso.hdr_len);
-	}
-
-	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
-	if (IS_ERR(skb)) {
-		if (PTR_ERR(skb) != -EAGAIN)
+	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+		skb = tun_build_skb(tfile, from, len);
+		if (IS_ERR(skb)) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
-		return PTR_ERR(skb);
-	}
+			return PTR_ERR(skb);
+		}
+	} else {
+		if (!zerocopy) {
+			copylen = len;
+			if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+				linear = good_linear;
+			else
+				linear = tun16_to_cpu(tun, gso.hdr_len);
+		}
 
-	if (zerocopy)
-		err = zerocopy_sg_from_iter(skb, from);
-	else
-		err = skb_copy_datagram_from_iter(skb, 0, from, len);
+		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (IS_ERR(skb)) {
+			if (PTR_ERR(skb) != -EAGAIN)
+				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			return PTR_ERR(skb);
+		}
 
-	if (err) {
-		this_cpu_inc(tun->pcpu_stats->rx_dropped);
-		kfree_skb(skb);
-		return -EFAULT;
+		if (zerocopy)
+			err = zerocopy_sg_from_iter(skb, from);
+		else
+			err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+		if (err) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			kfree_skb(skb);
+			return -EFAULT;
+		}
 	}
 
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
@@ -2377,6 +2445,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_sndbuf = INT_MAX;
 
+	tfile->alloc_frag.page = NULL;
+
 	file->private_data = tfile;
 	INIT_LIST_HEAD(&tfile->next);
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH net-next V2 2/3] net: export some generic xdp helpers
  2017-08-11 11:41 [PATCH net-next V2 0/3] XDP support for tap Jason Wang
  2017-08-11 11:41 ` [PATCH net-next V2 1/3] tap: use build_skb() for small packet Jason Wang
@ 2017-08-11 11:41 ` Jason Wang
  2017-08-11 11:41 ` [PATCH net-next V2 3/3] tap: XDP support Jason Wang
  2017-08-14  2:56 ` [PATCH net-next V2 0/3] XDP support for tap David Miller
  3 siblings, 0 replies; 22+ messages in thread
From: Jason Wang @ 2017-08-11 11:41 UTC (permalink / raw)
  To: davem, mst, netdev, linux-kernel; +Cc: kubakici, Jason Wang

This patch exports some generic XDP helpers to drivers. This lets a
driver do XDP for a specific skb. This is useful for the case when the
packet is hard to process at page level directly (e.g. a jumbo/GSO
frame).

With this patch, there's no need for a driver to forbid setting an XDP
program when the configuration is not suitable. Instead, it can defer
XDP for packets that are hard to process directly until after the skb
is created.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 14 ++++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1d238d5..0f1c4cb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 }
 
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3f69f6e..1a6657a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3920,7 +3920,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 /* When doing generic XDP we have to bypass the qdisc layer and the
  * network taps in order to match in-driver-XDP behavior.
  */
-static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 {
 	struct net_device *dev = skb->dev;
 	struct netdev_queue *txq;
@@ -3941,13 +3941,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
 		kfree_skb(skb);
 	}
 }
+EXPORT_SYMBOL_GPL(generic_xdp_tx);
 
 static struct static_key generic_xdp_needed __read_mostly;
 
-static int do_xdp_generic(struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 {
-	struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-
 	if (xdp_prog) {
 		u32 act = netif_receive_generic_xdp(skb, xdp_prog);
 		int err;
@@ -3972,6 +3971,7 @@ static int do_xdp_generic(struct sk_buff *skb)
 	kfree_skb(skb);
 	return XDP_DROP;
 }
+EXPORT_SYMBOL_GPL(do_xdp_generic);
 
 static int netif_rx_internal(struct sk_buff *skb)
 {
@@ -3982,7 +3982,8 @@ static int netif_rx_internal(struct sk_buff *skb)
 	trace_netif_rx(skb);
 
 	if (static_key_false(&generic_xdp_needed)) {
-		int ret = do_xdp_generic(skb);
+		int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+					 skb);
 
 		/* Consider XDP consuming the packet a success from
 		 * the netdev point of view we do not want to count
@@ -4503,7 +4504,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	rcu_read_lock();
 
 	if (static_key_false(&generic_xdp_needed)) {
-		int ret = do_xdp_generic(skb);
+		int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+					 skb);
 
 		if (ret != XDP_PASS) {
 			rcu_read_unlock();
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH net-next V2 3/3] tap: XDP support
  2017-08-11 11:41 [PATCH net-next V2 0/3] XDP support for tap Jason Wang
  2017-08-11 11:41 ` [PATCH net-next V2 1/3] tap: use build_skb() for small packet Jason Wang
  2017-08-11 11:41 ` [PATCH net-next V2 2/3] net: export some generic xdp helpers Jason Wang
@ 2017-08-11 11:41 ` Jason Wang
  2017-08-11 23:12   ` Jakub Kicinski
  2017-08-14  8:43   ` Daniel Borkmann
  2017-08-14  2:56 ` [PATCH net-next V2 0/3] XDP support for tap David Miller
  3 siblings, 2 replies; 22+ messages in thread
From: Jason Wang @ 2017-08-11 11:41 UTC (permalink / raw)
  To: davem, mst, netdev, linux-kernel; +Cc: kubakici, Jason Wang

This patch tries to implement XDP for tun. The implementation was
split into two parts:

- fast path: small and non-gso packets. We try to do XDP at page level
  before build_skb(). For XDP_TX, since creating/destroying queues is
  completely under the control of userspace, it was implemented
  through a generic XDP helper after the skb has been built. This could
  be optimized in the future.
- slow path: big or gso packets. We try to do it after the skb was
  created, through generic XDP helpers.

Tests were done through pktgen with small packets.

xdp1 test shows ~41.1% improvement:

Before: ~1.7Mpps
After:  ~2.3Mpps

xdp_redirect to ixgbe shows ~60% improvement:

Before: ~0.8Mpps
After:  ~1.38Mpps

Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 142 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9736df4..5892284 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,8 @@
 #include <linux/seq_file.h>
 #include <linux/uio.h>
 #include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 #include <linux/uaccess.h>
 
@@ -105,7 +107,8 @@ do {								\
 } while (0)
 #endif
 
-#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)
 
 /* TUN device flags */
 
@@ -224,6 +227,7 @@ struct tun_struct {
 	u32 flow_count;
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
+	struct bpf_prog __rcu *xdp_prog;
 };
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
 static void tun_detach_all(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
 	struct tun_file *tfile, *tmp;
 	int i, n = tun->numqueues;
 
@@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
 	}
 	BUG_ON(tun->numdisabled != 0);
 
+	if (xdp_prog)
+		bpf_prog_put(xdp_prog);
+
 	if (tun->flags & IFF_PERSIST)
 		module_put(THIS_MODULE);
 }
@@ -1008,6 +1016,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->tx_dropped = tx_dropped;
 }
 
+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+		       struct netlink_ext_ack *extack)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+
+	old_prog = rtnl_dereference(tun->xdp_prog);
+	rcu_assign_pointer(tun->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	const struct bpf_prog *xdp_prog;
+
+	xdp_prog = rtnl_dereference(tun->xdp_prog);
+	if (xdp_prog)
+		return xdp_prog->aux->id;
+
+	return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return tun_xdp_set(dev, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = tun_xdp_query(dev);
+		xdp->prog_attached = !!xdp->prog_id;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -1038,6 +1086,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= tun_set_headroom,
 	.ndo_get_stats64	= tun_net_get_stats64,
+	.ndo_xdp		= tun_xdp,
 };
 
 static void tun_flow_init(struct tun_struct *tun)
@@ -1217,16 +1266,22 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 	return true;
 }
 
-static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+				     struct tun_file *tfile,
 				     struct iov_iter *from,
-				     int len)
+				     struct virtio_net_hdr *hdr,
+				     int len, int *generic_xdp)
 {
 	struct page_frag *alloc_frag = &tfile->alloc_frag;
 	struct sk_buff *skb;
+	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
 		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	unsigned int delta = 0;
 	char *buf;
 	size_t copied;
+	bool xdp_xmit = false;
+	int err;
 
 	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
 		return ERR_PTR(-ENOMEM);
@@ -1238,16 +1293,77 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile,
 	if (copied != len)
 		return ERR_PTR(-EFAULT);
 
+	if (hdr->gso_type)
+		*generic_xdp = 1;
+	else
+		*generic_xdp = 0;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	if (xdp_prog && !*generic_xdp) {
+		struct xdp_buff xdp;
+		void *orig_data;
+		u32 act;
+
+		xdp.data_hard_start = buf;
+		xdp.data = buf + TUN_RX_PAD;
+		xdp.data_end = xdp.data + len;
+		orig_data = xdp.data;
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+		switch (act) {
+		case XDP_REDIRECT:
+			get_page(alloc_frag->page);
+			alloc_frag->offset += buflen;
+			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+			if (err)
+				goto err_redirect;
+			return NULL;
+		case XDP_TX:
+			xdp_xmit = true;
+			/* fall through */
+		case XDP_PASS:
+			delta = orig_data - xdp.data;
+			break;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+			/* fall through */
+		case XDP_ABORTED:
+			trace_xdp_exception(tun->dev, xdp_prog, act);
+			/* fall through */
+		case XDP_DROP:
+			goto err_xdp;
+		}
+	}
+
 	skb = build_skb(buf, buflen);
-	if (!skb)
+	if (!skb) {
+		rcu_read_unlock();
 		return ERR_PTR(-ENOMEM);
+	}
 
-	skb_reserve(skb, TUN_RX_PAD);
-	skb_put(skb, len);
+	skb_reserve(skb, TUN_RX_PAD - delta);
+	skb_put(skb, len + delta);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
+	if (xdp_xmit) {
+		skb->dev = tun->dev;
+		generic_xdp_tx(skb, xdp_prog);
+		rcu_read_lock();
+		return NULL;
+	}
+
+	rcu_read_unlock();
+
 	return skb;
+
+err_redirect:
+	put_page(alloc_frag->page);
+err_xdp:
+	rcu_read_unlock();
+	this_cpu_inc(tun->pcpu_stats->rx_dropped);
+	return NULL;
 }
 
 /* Get packet from user space buffer */
@@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	bool zerocopy = false;
 	int err;
 	u32 rxhash;
+	int generic_xdp = 1;
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	}
 
 	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
-		skb = tun_build_skb(tfile, from, len);
+		skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
 		if (IS_ERR(skb)) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			return PTR_ERR(skb);
 		}
+		if (!skb)
+			return total_len;
 	} else {
 		if (!zerocopy) {
 			copylen = len;
@@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb, 0);
 
+	if (generic_xdp) {
+		struct bpf_prog *xdp_prog;
+		int ret;
+
+		rcu_read_lock();
+		xdp_prog = rcu_dereference(tun->xdp_prog);
+		if (xdp_prog) {
+			ret = do_xdp_generic(xdp_prog, skb);
+			if (ret != XDP_PASS) {
+				rcu_read_unlock();
+				return total_len;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	rxhash = __skb_get_hash_symmetric(skb);
 #ifndef CONFIG_4KSTACKS
 	tun_rx_batched(tun, tfile, skb, more);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-11 11:41 ` [PATCH net-next V2 3/3] tap: XDP support Jason Wang
@ 2017-08-11 23:12   ` Jakub Kicinski
  2017-08-12  2:48     ` Jason Wang
  2017-08-14  8:43   ` Daniel Borkmann
  1 sibling, 1 reply; 22+ messages in thread
From: Jakub Kicinski @ 2017-08-11 23:12 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, mst, netdev, linux-kernel, Daniel Borkmann

On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> This patch tries to implement XDP for tun. The implementation was
> split into two parts:
> 
> - fast path: small and no gso packet. We try to do XDP at page level
>   before build_skb(). For XDP_TX, since creating/destroying queues
>   were completely under control of userspace, it was implemented
>   through generic XDP helper after skb has been built. This could be
>   optimized in the future.
> - slow path: big or gso packet. We try to do it after skb was created
>   through generic XDP helpers.
> 
> Test were done through pktgen with small packets.
> 
> xdp1 test shows ~41.1% improvement:
> 
> Before: ~1.7Mpps
> After:  ~2.3Mpps
> 
> xdp_redirect to ixgbe shows ~60% improvement:
> 
> Before: ~0.8Mpps
> After:  ~1.38Mpps
> 
> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Looks OK to me now :)

Out of curiosity, you say the build_skb() is for "small packets", and it
seems you are always reserving the 256B regardless of XDP being
installed.  Does this have no performance impact on non-XDP case?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-11 23:12   ` Jakub Kicinski
@ 2017-08-12  2:48     ` Jason Wang
  2017-08-14 16:01       ` Michael S. Tsirkin
  0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-12  2:48 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: davem, mst, netdev, linux-kernel, Daniel Borkmann



On 2017年08月12日 07:12, Jakub Kicinski wrote:
> On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
>> This patch tries to implement XDP for tun. The implementation was
>> split into two parts:
>>
>> - fast path: small and no gso packet. We try to do XDP at page level
>>    before build_skb(). For XDP_TX, since creating/destroying queues
>>    were completely under control of userspace, it was implemented
>>    through generic XDP helper after skb has been built. This could be
>>    optimized in the future.
>> - slow path: big or gso packet. We try to do it after skb was created
>>    through generic XDP helpers.
>>
>> Test were done through pktgen with small packets.
>>
>> xdp1 test shows ~41.1% improvement:
>>
>> Before: ~1.7Mpps
>> After:  ~2.3Mpps
>>
>> xdp_redirect to ixgbe shows ~60% improvement:
>>
>> Before: ~0.8Mpps
>> After:  ~1.38Mpps
>>
>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Looks OK to me now :)
>
> Out of curiosity, you say the build_skb() is for "small packets", and it
> seems you are always reserving the 256B regardless of XDP being
> installed.  Does this have no performance impact on non-XDP case?

I ran a test; only a less than 1% difference was noticed, which I think can be ignored.

Thanks

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 0/3] XDP support for tap
  2017-08-11 11:41 [PATCH net-next V2 0/3] XDP support for tap Jason Wang
                   ` (2 preceding siblings ...)
  2017-08-11 11:41 ` [PATCH net-next V2 3/3] tap: XDP support Jason Wang
@ 2017-08-14  2:56 ` David Miller
  3 siblings, 0 replies; 22+ messages in thread
From: David Miller @ 2017-08-14  2:56 UTC (permalink / raw)
  To: jasowang; +Cc: mst, netdev, linux-kernel, kubakici

From: Jason Wang <jasowang@redhat.com>
Date: Fri, 11 Aug 2017 19:41:15 +0800

> Hi all:
> 
> This series tries to implement XDP support for tap. Two path were
> implemented:
> 
> - fast path: small & non-gso packet, For performance reason we do it
>   at page level and use build_skb() to create skb if necessary.
> - slow path: big or gso packet, we don't want to lose the capability
>   compared to generic XDP, so we export some generic xdp helpers and
>   do it after skb was created.
> 
> xdp1 shows about 41% improvement, xdp_redirect shows about 60%
> improvement.
> 
> Changes from V1:
> - fix the race between xdp set and free
> - don't hold extra refcount
> - add XDP_REDIRECT support
> 
> Please review.

Series applied, thanks Jason.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-11 11:41 ` [PATCH net-next V2 3/3] tap: XDP support Jason Wang
  2017-08-11 23:12   ` Jakub Kicinski
@ 2017-08-14  8:43   ` Daniel Borkmann
  2017-08-15  4:55     ` Jason Wang
  1 sibling, 1 reply; 22+ messages in thread
From: Daniel Borkmann @ 2017-08-14  8:43 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, mst, netdev, linux-kernel, kubakici

On 08/11/2017 01:41 PM, Jason Wang wrote:
> This patch tries to implement XDP for tun. The implementation was
> split into two parts:
[...]
> @@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>   	skb_reset_network_header(skb);
>   	skb_probe_transport_header(skb, 0);
>
> +	if (generic_xdp) {
> +		struct bpf_prog *xdp_prog;
> +		int ret;
> +
> +		rcu_read_lock();
> +		xdp_prog = rcu_dereference(tun->xdp_prog);

The name generic_xdp is a bit confusing in this context given this
is 'native' XDP, perhaps above if (generic_xdp) should have a comment
explaining semantics for tun and how it relates to actual generic xdp
that sits at dev->xdp_prog, and gets run from netif_rx_ni(). Or just
name the bool xdp_handle_gso with a comment that we let the generic
XDP infrastructure deal with non-linear skbs instead of having to
re-implement the do_xdp_generic() internals, plus a statement that
the actual generic XDP comes a bit later in the path. That would at
least make it more obvious to read, imho.

> +		if (xdp_prog) {
> +			ret = do_xdp_generic(xdp_prog, skb);
> +			if (ret != XDP_PASS) {
> +				rcu_read_unlock();
> +				return total_len;
> +			}
> +		}
> +		rcu_read_unlock();
> +	}
> +
>   	rxhash = __skb_get_hash_symmetric(skb);
>   #ifndef CONFIG_4KSTACKS
>   	tun_rx_batched(tun, tfile, skb, more);
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-12  2:48     ` Jason Wang
@ 2017-08-14 16:01       ` Michael S. Tsirkin
  2017-08-15  5:02         ` Jason Wang
  0 siblings, 1 reply; 22+ messages in thread
From: Michael S. Tsirkin @ 2017-08-14 16:01 UTC (permalink / raw)
  To: Jason Wang; +Cc: Jakub Kicinski, davem, netdev, linux-kernel, Daniel Borkmann

On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
> 
> 
> On 2017年08月12日 07:12, Jakub Kicinski wrote:
> > On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> > > This patch tries to implement XDP for tun. The implementation was
> > > split into two parts:
> > > 
> > > - fast path: small and no gso packet. We try to do XDP at page level
> > >    before build_skb(). For XDP_TX, since creating/destroying queues
> > >    were completely under control of userspace, it was implemented
> > >    through generic XDP helper after skb has been built. This could be
> > >    optimized in the future.
> > > - slow path: big or gso packet. We try to do it after skb was created
> > >    through generic XDP helpers.
> > > 
> > > Test were done through pktgen with small packets.
> > > 
> > > xdp1 test shows ~41.1% improvement:
> > > 
> > > Before: ~1.7Mpps
> > > After:  ~2.3Mpps
> > > 
> > > xdp_redirect to ixgbe shows ~60% improvement:
> > > 
> > > Before: ~0.8Mpps
> > > After:  ~1.38Mpps
> > > 
> > > Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > Looks OK to me now :)
> > 
> > Out of curiosity, you say the build_skb() is for "small packets", and it
> > seems you are always reserving the 256B regardless of XDP being
> > installed.  Does this have no performance impact on non-XDP case?
> 
> Have a test, only less than 1% were noticed which I think could be ignored.
> 
> Thanks

What did you test btw? The biggest issue would be with something like
UDP with short packets.

-- 
MST

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-14  8:43   ` Daniel Borkmann
@ 2017-08-15  4:55     ` Jason Wang
  0 siblings, 0 replies; 22+ messages in thread
From: Jason Wang @ 2017-08-15  4:55 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, mst, netdev, linux-kernel, kubakici



On 2017年08月14日 16:43, Daniel Borkmann wrote:
> On 08/11/2017 01:41 PM, Jason Wang wrote:
>> This patch tries to implement XDP for tun. The implementation was
>> split into two parts:
> [...]
>> @@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct 
>> *tun, struct tun_file *tfile,
>>       skb_reset_network_header(skb);
>>       skb_probe_transport_header(skb, 0);
>>
>> +    if (generic_xdp) {
>> +        struct bpf_prog *xdp_prog;
>> +        int ret;
>> +
>> +        rcu_read_lock();
>> +        xdp_prog = rcu_dereference(tun->xdp_prog);
>
> The name generic_xdp is a bit confusing in this context given this
> is 'native' XDP, perhaps above if (generic_xdp) should have a comment
> explaining semantics for tun and how it relates to actual generic xdp
> that sits at dev->xdp_prog, and gets run from netif_rx_ni(). Or just
> name the bool xdp_handle_gso with a comment that we let the generic
> XDP infrastructure deal with non-linear skbs instead of having to
> re-implement the do_xdp_generic() internals, plus a statement that
> the actual generic XDP comes a bit later in the path. That would at
> least make it more obvious to read, imho.

Ok, since non gso packet (e.g jumbo packet) may go this way too, 
something like "xdp_handle_skb" is better. Will send a patch.

Thanks

>
>> +        if (xdp_prog) {
>> +            ret = do_xdp_generic(xdp_prog, skb);
>> +            if (ret != XDP_PASS) {
>> +                rcu_read_unlock();
>> +                return total_len;
>> +            }
>> +        }
>> +        rcu_read_unlock();
>> +    }
>> +
>>       rxhash = __skb_get_hash_symmetric(skb);
>>   #ifndef CONFIG_4KSTACKS
>>       tun_rx_batched(tun, tfile, skb, more);
>>
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-14 16:01       ` Michael S. Tsirkin
@ 2017-08-15  5:02         ` Jason Wang
  2017-08-16  3:45           ` Michael S. Tsirkin
  0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-15  5:02 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jakub Kicinski, davem, netdev, linux-kernel, Daniel Borkmann



On 2017年08月15日 00:01, Michael S. Tsirkin wrote:
> On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
>>
>> On 2017年08月12日 07:12, Jakub Kicinski wrote:
>>> On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
>>>> This patch tries to implement XDP for tun. The implementation was
>>>> split into two parts:
>>>>
>>>> - fast path: small and no gso packet. We try to do XDP at page level
>>>>     before build_skb(). For XDP_TX, since creating/destroying queues
>>>>     were completely under control of userspace, it was implemented
>>>>     through generic XDP helper after skb has been built. This could be
>>>>     optimized in the future.
>>>> - slow path: big or gso packet. We try to do it after skb was created
>>>>     through generic XDP helpers.
>>>>
>>>> Test were done through pktgen with small packets.
>>>>
>>>> xdp1 test shows ~41.1% improvement:
>>>>
>>>> Before: ~1.7Mpps
>>>> After:  ~2.3Mpps
>>>>
>>>> xdp_redirect to ixgbe shows ~60% improvement:
>>>>
>>>> Before: ~0.8Mpps
>>>> After:  ~1.38Mpps
>>>>
>>>> Suggested-by: Michael S. Tsirkin <mst@redhat.com>
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> Looks OK to me now :)
>>>
>>> Out of curiosity, you say the build_skb() is for "small packets", and it
>>> seems you are always reserving the 256B regardless of XDP being
>>> installed.  Does this have no performance impact on non-XDP case?
>> Have a test, only less than 1% were noticed which I think could be ignored.
>>
>> Thanks
> What did you test btw?

Pktgen

>   The biggest issue would be with something like
> UDP with short packets.
>

Note that we do this only when sndbuf is INT_MAX. So this is probably 
not an issue. The only thing matter is more stress to page allocator, 
but according to the result of pktgen it was very small that could be 
ignored.

Thanks

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-11 11:41 ` [PATCH net-next V2 1/3] tap: use build_skb() for small packet Jason Wang
@ 2017-08-16  3:45   ` Eric Dumazet
  2017-08-16  3:55     ` Michael S. Tsirkin
  2017-08-16  3:55     ` Jason Wang
  0 siblings, 2 replies; 22+ messages in thread
From: Eric Dumazet @ 2017-08-16  3:45 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, mst, netdev, linux-kernel, kubakici

On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> skb in the past. This socket based method is not suitable for high
> speed userspace like virtualization which usually:
> 
> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>   possible
> - don't want to be block at sendmsg()
> 
> To eliminate the above overheads, this patch tries to use build_skb()
> for small packet. We will do this only when the following conditions
> are all met:
> 
> - TAP instead of TUN
> - sk_sndbuf is INT_MAX
> - caller don't want to be blocked
> - zerocopy is not used
> - packet size is smaller enough to use build_skb()
> 
> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> 
> Before: ~1.70Mpps
> After : ~1.88Mpps
> 
> What's more important, this makes it possible to implement XDP for tap
> before creating skbs.


Well well well.

You do realize that tun_build_skb() is not thread safe ?

general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff880069f265c0 task.stack: ffff880067688000
RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
RIP: 0010:put_page include/linux/mm.h:811 [inline]
RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 skb_release_all+0x4a/0x60 net/core/skbuff.c:631
 __kfree_skb net/core/skbuff.c:645 [inline]
 kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
 __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
 netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
 netif_receive_skb+0xae/0x390 net/core/dev.c:4551
 tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
 tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
 tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
 call_write_iter include/linux/fs.h:1742 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x40bab1
RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85 
RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
---[ end trace 54050eb1ec52ff83 ]---

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 3/3] tap: XDP support
  2017-08-15  5:02         ` Jason Wang
@ 2017-08-16  3:45           ` Michael S. Tsirkin
  0 siblings, 0 replies; 22+ messages in thread
From: Michael S. Tsirkin @ 2017-08-16  3:45 UTC (permalink / raw)
  To: Jason Wang; +Cc: Jakub Kicinski, davem, netdev, linux-kernel, Daniel Borkmann

On Tue, Aug 15, 2017 at 01:02:05PM +0800, Jason Wang wrote:
> 
> 
> On 2017年08月15日 00:01, Michael S. Tsirkin wrote:
> > On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
> > > 
> > > On 2017年08月12日 07:12, Jakub Kicinski wrote:
> > > > On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> > > > > This patch tries to implement XDP for tun. The implementation was
> > > > > split into two parts:
> > > > > 
> > > > > - fast path: small and no gso packet. We try to do XDP at page level
> > > > >     before build_skb(). For XDP_TX, since creating/destroying queues
> > > > >     were completely under control of userspace, it was implemented
> > > > >     through generic XDP helper after skb has been built. This could be
> > > > >     optimized in the future.
> > > > > - slow path: big or gso packet. We try to do it after skb was created
> > > > >     through generic XDP helpers.
> > > > > 
> > > > > Test were done through pktgen with small packets.
> > > > > 
> > > > > xdp1 test shows ~41.1% improvement:
> > > > > 
> > > > > Before: ~1.7Mpps
> > > > > After:  ~2.3Mpps
> > > > > 
> > > > > xdp_redirect to ixgbe shows ~60% improvement:
> > > > > 
> > > > > Before: ~0.8Mpps
> > > > > After:  ~1.38Mpps
> > > > > 
> > > > > Suggested-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > Looks OK to me now :)
> > > > 
> > > > Out of curiosity, you say the build_skb() is for "small packets", and it
> > > > seems you are always reserving the 256B regardless of XDP being
> > > > installed.  Does this have no performance impact on non-XDP case?
> > > Have a test, only less than 1% were noticed which I think could be ignored.
> > > 
> > > Thanks
> > What did you test btw?
> 
> Pktgen
> 
> >   The biggest issue would be with something like
> > UDP with short packets.
> > 
> 
> Note that we do this only when sndbuf is INT_MAX. So this is probably not an
> issue.

I'd expect to see the issue for guest to host when the packets are
queued at a UDP socket in host.

> The only thing matter is more stress to page allocator, but according
> to the result of pktgen it was very small that could be ignored.
> 
> Thanks

Besides guest to host, for bridging in host bigger truesize might affect
byte queue counts as well.

-- 
MST

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:45   ` Eric Dumazet
@ 2017-08-16  3:55     ` Michael S. Tsirkin
  2017-08-16  3:57       ` Jason Wang
  2017-08-16  3:55     ` Jason Wang
  1 sibling, 1 reply; 22+ messages in thread
From: Michael S. Tsirkin @ 2017-08-16  3:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jason Wang, davem, netdev, linux-kernel, kubakici

On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > skb in the past. This socket based method is not suitable for high
> > speed userspace like virtualization which usually:
> > 
> > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> >   possible
> > - don't want to be block at sendmsg()
> > 
> > To eliminate the above overheads, this patch tries to use build_skb()
> > for small packet. We will do this only when the following conditions
> > are all met:
> > 
> > - TAP instead of TUN
> > - sk_sndbuf is INT_MAX
> > - caller don't want to be blocked
> > - zerocopy is not used
> > - packet size is smaller enough to use build_skb()
> > 
> > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> > 
> > Before: ~1.70Mpps
> > After : ~1.88Mpps
> > 
> > What's more important, this makes it possible to implement XDP for tap
> > before creating skbs.
> 
> 
> Well well well.
> 
> You do realize that tun_build_skb() is not thread safe ?

The issue is alloc frag, isn't it?
I guess for now we can limit this to XDP mode only, and
just allocate full pages in that mode.


> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>    (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  skb_release_all+0x4a/0x60 net/core/skbuff.c:631
>  __kfree_skb net/core/skbuff.c:645 [inline]
>  kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
>  __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
>  __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
>  netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
>  netif_receive_skb+0xae/0x390 net/core/dev.c:4551
>  tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
>  tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
>  tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
>  call_write_iter include/linux/fs.h:1742 [inline]
>  new_sync_write fs/read_write.c:457 [inline]
>  __vfs_write+0x684/0x970 fs/read_write.c:470
>  vfs_write+0x189/0x510 fs/read_write.c:518
>  SYSC_write fs/read_write.c:565 [inline]
>  SyS_write+0xef/0x220 fs/read_write.c:557
>  entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85 
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:45   ` Eric Dumazet
  2017-08-16  3:55     ` Michael S. Tsirkin
@ 2017-08-16  3:55     ` Jason Wang
  2017-08-16 10:24       ` Eric Dumazet
  1 sibling, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-16  3:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, mst, netdev, linux-kernel, kubakici



On 2017年08月16日 11:45, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>> skb in the past. This socket based method is not suitable for high
>> speed userspace like virtualization which usually:
>>
>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>    possible
>> - don't want to be block at sendmsg()
>>
>> To eliminate the above overheads, this patch tries to use build_skb()
>> for small packet. We will do this only when the following conditions
>> are all met:
>>
>> - TAP instead of TUN
>> - sk_sndbuf is INT_MAX
>> - caller don't want to be blocked
>> - zerocopy is not used
>> - packet size is smaller enough to use build_skb()
>>
>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>
>> Before: ~1.70Mpps
>> After : ~1.88Mpps
>>
>> What's more important, this makes it possible to implement XDP for tap
>> before creating skbs.
>
> Well well well.
>
> You do realize that tun_build_skb() is not thread safe ?

Ok, I think the issue is skb_page_frag_refill(); it probably needs a 
spinlock. Will prepare a patch.

Thanks

>
> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>     (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>   skb_release_all+0x4a/0x60 net/core/skbuff.c:631
>   __kfree_skb net/core/skbuff.c:645 [inline]
>   kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
>   __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
>   __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
>   netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
>   netif_receive_skb+0xae/0x390 net/core/dev.c:4551
>   tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
>   tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
>   tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
>   call_write_iter include/linux/fs.h:1742 [inline]
>   new_sync_write fs/read_write.c:457 [inline]
>   __vfs_write+0x684/0x970 fs/read_write.c:470
>   vfs_write+0x189/0x510 fs/read_write.c:518
>   SYSC_write fs/read_write.c:565 [inline]
>   SyS_write+0xef/0x220 fs/read_write.c:557
>   entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:55     ` Michael S. Tsirkin
@ 2017-08-16  3:57       ` Jason Wang
  2017-08-16  3:59         ` Michael S. Tsirkin
  0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-16  3:57 UTC (permalink / raw)
  To: Michael S. Tsirkin, Eric Dumazet; +Cc: davem, netdev, linux-kernel, kubakici



On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>> skb in the past. This socket based method is not suitable for high
>>> speed userspace like virtualization which usually:
>>>
>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>>    possible
>>> - don't want to be block at sendmsg()
>>>
>>> To eliminate the above overheads, this patch tries to use build_skb()
>>> for small packet. We will do this only when the following conditions
>>> are all met:
>>>
>>> - TAP instead of TUN
>>> - sk_sndbuf is INT_MAX
>>> - caller don't want to be blocked
>>> - zerocopy is not used
>>> - packet size is smaller enough to use build_skb()
>>>
>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>
>>> Before: ~1.70Mpps
>>> After : ~1.88Mpps
>>>
>>> What's more important, this makes it possible to implement XDP for tap
>>> before creating skbs.
>> Well well well.
>>
>> You do realize that tun_build_skb() is not thread safe ?
> The issue is alloc frag, isn't it?
> I guess for now we can limit this to XDP mode only, and
> just allocate full pages in that mode.
>
>

Limiting this to XDP mode only does not prevent users from sending 
packets to the same queue in parallel, I think?

Thanks

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:57       ` Jason Wang
@ 2017-08-16  3:59         ` Michael S. Tsirkin
  2017-08-16  4:07           ` Jason Wang
  0 siblings, 1 reply; 22+ messages in thread
From: Michael S. Tsirkin @ 2017-08-16  3:59 UTC (permalink / raw)
  To: Jason Wang; +Cc: Eric Dumazet, davem, netdev, linux-kernel, kubakici

On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
> 
> 
> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> > On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> > > On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > > > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > > > skb in the past. This socket based method is not suitable for high
> > > > speed userspace like virtualization which usually:
> > > > 
> > > > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> > > >    possible
> > > > - don't want to be block at sendmsg()
> > > > 
> > > > To eliminate the above overheads, this patch tries to use build_skb()
> > > > for small packet. We will do this only when the following conditions
> > > > are all met:
> > > > 
> > > > - TAP instead of TUN
> > > > - sk_sndbuf is INT_MAX
> > > > - caller don't want to be blocked
> > > > - zerocopy is not used
> > > > - packet size is smaller enough to use build_skb()
> > > > 
> > > > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> > > > 
> > > > Before: ~1.70Mpps
> > > > After : ~1.88Mpps
> > > > 
> > > > What's more important, this makes it possible to implement XDP for tap
> > > > before creating skbs.
> > > Well well well.
> > > 
> > > You do realize that tun_build_skb() is not thread safe ?
> > The issue is alloc frag, isn't it?
> > I guess for now we can limit this to XDP mode only, and
> > just allocate full pages in that mode.
> > 
> > 
> 
> Limit this to XDP mode only does not prevent user from sending packets to
> same queue in parallel I think?
> 
> Thanks

Yes but then you can just drop the page frag allocator since
XDP is assumed not to care about truesize for most packets.

-- 
MST

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:59         ` Michael S. Tsirkin
@ 2017-08-16  4:07           ` Jason Wang
  2017-08-16  9:17             ` Jason Wang
  0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-16  4:07 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric Dumazet, davem, netdev, linux-kernel, kubakici



On 2017年08月16日 11:59, Michael S. Tsirkin wrote:
> On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>>
>> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
>>> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>>>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>>>> skb in the past. This socket based method is not suitable for high
>>>>> speed userspace like virtualization which usually:
>>>>>
>>>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>>>>     possible
>>>>> - don't want to be block at sendmsg()
>>>>>
>>>>> To eliminate the above overheads, this patch tries to use build_skb()
>>>>> for small packet. We will do this only when the following conditions
>>>>> are all met:
>>>>>
>>>>> - TAP instead of TUN
>>>>> - sk_sndbuf is INT_MAX
>>>>> - caller don't want to be blocked
>>>>> - zerocopy is not used
>>>>> - packet size is smaller enough to use build_skb()
>>>>>
>>>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>>>
>>>>> Before: ~1.70Mpps
>>>>> After : ~1.88Mpps
>>>>>
>>>>> What's more important, this makes it possible to implement XDP for tap
>>>>> before creating skbs.
>>>> Well well well.
>>>>
>>>> You do realize that tun_build_skb() is not thread safe ?
>>> The issue is alloc frag, isn't it?
>>> I guess for now we can limit this to XDP mode only, and
>>> just allocate full pages in that mode.
>>>
>>>
>> Limit this to XDP mode only does not prevent user from sending packets to
>> same queue in parallel I think?
>>
>> Thanks
> Yes but then you can just drop the page frag allocator since
> XDP is assumed not to care about truesize for most packets.
>

Ok, let me do some test to see the numbers between the two methods first.

Thanks

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  4:07           ` Jason Wang
@ 2017-08-16  9:17             ` Jason Wang
  2017-08-16 16:30               ` David Miller
  0 siblings, 1 reply; 22+ messages in thread
From: Jason Wang @ 2017-08-16  9:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Eric Dumazet, davem, netdev, linux-kernel, kubakici

[-- Attachment #1: Type: text/plain, Size: 2269 bytes --]



On 2017年08月16日 12:07, Jason Wang wrote:
>
>
> On 2017年08月16日 11:59, Michael S. Tsirkin wrote:
>> On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>>>
>>> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
>>>> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>>>>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>>>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to 
>>>>>> allocate
>>>>>> skb in the past. This socket based method is not suitable for high
>>>>>> speed userspace like virtualization which usually:
>>>>>>
>>>>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as 
>>>>>> fast as
>>>>>>     possible
>>>>>> - don't want to be block at sendmsg()
>>>>>>
>>>>>> To eliminate the above overheads, this patch tries to use 
>>>>>> build_skb()
>>>>>> for small packet. We will do this only when the following conditions
>>>>>> are all met:
>>>>>>
>>>>>> - TAP instead of TUN
>>>>>> - sk_sndbuf is INT_MAX
>>>>>> - caller don't want to be blocked
>>>>>> - zerocopy is not used
>>>>>> - packet size is smaller enough to use build_skb()
>>>>>>
>>>>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>>>>
>>>>>> Before: ~1.70Mpps
>>>>>> After : ~1.88Mpps
>>>>>>
>>>>>> What's more important, this makes it possible to implement XDP 
>>>>>> for tap
>>>>>> before creating skbs.
>>>>> Well well well.
>>>>>
>>>>> You do realize that tun_build_skb() is not thread safe ?
>>>> The issue is alloc frag, isn't it?
>>>> I guess for now we can limit this to XDP mode only, and
>>>> just allocate full pages in that mode.
>>>>
>>>>
>>> Limit this to XDP mode only does not prevent user from sending 
>>> packets to
>>> same queue in parallel I think?
>>>
>>> Thanks
>> Yes but then you can just drop the page frag allocator since
>> XDP is assumed not to care about truesize for most packets.
>>
>
> Ok, let me do some test to see the numbers between the two methods first.
>
> Thanks

It looks like full page allocation just produces too much stress on the 
page allocator.

I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches 
attached.

Since the non-XDP case can also benefit from build_skb(), I tend to use 
a spinlock instead of a full page in this case.

Thanks

[-- Attachment #2: 0001-tun-thread-safe-tun_build_skb.patch --]
[-- Type: text/x-patch, Size: 3236 bytes --]

>From 0b9d930e8192466a9c4b85d136193f9c5f01d96a Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 Aug 2017 13:48:11 +0800
Subject: [PATCH] tun: thread safe tun_build_skb()

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284..c72c2ea 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1247,6 +1247,8 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
 static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 			      int len, int noblock, bool zerocopy)
 {
+	struct bpf_prog *xdp_prog;
+
 	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
 		return false;
 
@@ -1263,7 +1265,11 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
 		return false;
 
-	return true;
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	rcu_read_unlock();
+
+	return xdp_prog;
 }
 
 static struct sk_buff *tun_build_skb(struct tun_struct *tun,
@@ -1272,7 +1278,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     struct virtio_net_hdr *hdr,
 				     int len, int *generic_xdp)
 {
-	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct page *page = alloc_page(GFP_KERNEL);
 	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -1283,15 +1289,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	bool xdp_xmit = false;
 	int err;
 
-	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+	if (unlikely(!page))
 		return ERR_PTR(-ENOMEM);
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + TUN_RX_PAD,
-				     len, from);
-	if (copied != len)
+	buf = (char *)page_address(page);
+	copied = copy_page_from_iter(page, TUN_RX_PAD, len, from);
+	if (copied != len) {
+		put_page(page);
 		return ERR_PTR(-EFAULT);
+	}
 
 	if (hdr->gso_type)
 		*generic_xdp = 1;
@@ -1313,11 +1319,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		switch (act) {
 		case XDP_REDIRECT:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			if (err)
-				goto err_redirect;
+				goto err_xdp;
 			return NULL;
 		case XDP_TX:
 			xdp_xmit = true;
@@ -1339,13 +1343,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		rcu_read_unlock();
+		put_page(page);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	skb_reserve(skb, TUN_RX_PAD - delta);
 	skb_put(skb, len + delta);
-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
+	get_page(page);
 
 	if (xdp_xmit) {
 		skb->dev = tun->dev;
@@ -1358,9 +1362,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 	return skb;
 
-err_redirect:
-	put_page(alloc_frag->page);
 err_xdp:
+	put_page(page);
 	rcu_read_unlock();
 	this_cpu_inc(tun->pcpu_stats->rx_dropped);
 	return NULL;
-- 
2.7.4


[-- Attachment #3: 0001-tun-serialize-page-frag-allocation.patch --]
[-- Type: text/x-patch, Size: 3489 bytes --]

>From c2aa968d97f1ac892a93d14cbbf06dac0b91fb49 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 Aug 2017 13:17:15 +0800
Subject: [PATCH] tun: serialize page frag allocation

tun_build_skb() is not thread safe since we don't serialize the page
frag allocation. This will lead to a crash when multiple threads are
sending packets into the same tap queue in parallel. Solve this by
introducing a spinlock and using it to protect the page frag allocation.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284..202b20d 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -176,6 +176,7 @@ struct tun_file {
 	struct tun_struct *detached;
 	struct skb_array tx_array;
 	struct page_frag alloc_frag;
+	spinlock_t lock;
 };
 
 struct tun_flow_entry {
@@ -1273,25 +1274,36 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     int len, int *generic_xdp)
 {
 	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct page *page;
 	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
 		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	unsigned int delta = 0;
+	size_t offset;
 	char *buf;
 	size_t copied;
 	bool xdp_xmit = false;
 	int err;
 
-	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+	spin_lock(&tfile->lock);
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) {
+		spin_unlock(&tfile->lock);
 		return ERR_PTR(-ENOMEM);
+	}
+
+	page = alloc_frag->page;
+	offset = alloc_frag->offset;
+	get_page(page);
+	buf = (char *)page_address(page) + offset;
+	alloc_frag->offset += buflen;
+	spin_unlock(&tfile->lock);
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
-				     alloc_frag->offset + TUN_RX_PAD,
-				     len, from);
-	if (copied != len)
+	copied = copy_page_from_iter(page, offset + TUN_RX_PAD, len, from);
+	if (copied != len) {
+		put_page(page);
 		return ERR_PTR(-EFAULT);
+	}
 
 	if (hdr->gso_type)
 		*generic_xdp = 1;
@@ -1313,11 +1325,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		switch (act) {
 		case XDP_REDIRECT:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			if (err)
-				goto err_redirect;
+				goto err_xdp;
 			return NULL;
 		case XDP_TX:
 			xdp_xmit = true;
@@ -1339,13 +1349,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		rcu_read_unlock();
+		put_page(page);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	skb_reserve(skb, TUN_RX_PAD - delta);
 	skb_put(skb, len + delta);
-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
 
 	if (xdp_xmit) {
 		skb->dev = tun->dev;
@@ -1358,9 +1367,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 	return skb;
 
-err_redirect:
-	put_page(alloc_frag->page);
 err_xdp:
+	put_page(page);
 	rcu_read_unlock();
 	this_cpu_inc(tun->pcpu_stats->rx_dropped);
 	return NULL;
@@ -2586,6 +2594,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	INIT_LIST_HEAD(&tfile->next);
 
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
+	spin_lock_init(&tfile->lock);
 
 	return 0;
 }
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  3:55     ` Jason Wang
@ 2017-08-16 10:24       ` Eric Dumazet
  2017-08-16 13:16         ` Jason Wang
  0 siblings, 1 reply; 22+ messages in thread
From: Eric Dumazet @ 2017-08-16 10:24 UTC (permalink / raw)
  To: Jason Wang; +Cc: davem, mst, netdev, linux-kernel, kubakici

On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:
> 
> On 2017年08月16日 11:45, Eric Dumazet wrote:
> >
> > You do realize that tun_build_skb() is not thread safe ?
> 
> Ok, I think the issue is skb_page_frag_refill(); it probably needs a
> spinlock. Will prepare a patch.

But since tun is used from process context, why don't you use the
per-thread generator (no lock involved)

tcp_sendmsg() uses this for GFP_KERNEL allocations.

Untested patch :

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,7 +175,6 @@ struct tun_file {
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
-	struct page_frag alloc_frag;
 };
 
 struct tun_flow_entry {
@@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			skb_array_cleanup(&tfile->tx_array);
-		if (tfile->alloc_frag.page)
-			put_page(tfile->alloc_frag.page);
 		sock_put(&tfile->sk);
 	}
 }
@@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 				     struct virtio_net_hdr *hdr,
 				     int len, int *generic_xdp)
 {
-	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct page_frag *alloc_frag = &current->task_frag;
 	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_sndbuf = INT_MAX;
 
-	tfile->alloc_frag.page = NULL;
-
 	file->private_data = tfile;
 	INIT_LIST_HEAD(&tfile->next);
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16 10:24       ` Eric Dumazet
@ 2017-08-16 13:16         ` Jason Wang
  0 siblings, 0 replies; 22+ messages in thread
From: Jason Wang @ 2017-08-16 13:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, mst, netdev, linux-kernel, kubakici



On 2017年08月16日 18:24, Eric Dumazet wrote:
> On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:
>> On 2017年08月16日 11:45, Eric Dumazet wrote:
>>> You do realize that tun_build_skb() is not thread safe ?
>> Ok, I think the issue is skb_page_frag_refill(); it probably needs a
>> spinlock. Will prepare a patch.
> But since tun is used from process context, why don't you use the
> per-thread generator (no lock involved)

Haven't noticed this before.

>
> tcp_sendmsg() uses this for GFP_KERNEL allocations.
>
> Untested patch :
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,7 +175,6 @@ struct tun_file {
>   	struct list_head next;
>   	struct tun_struct *detached;
>   	struct skb_array tx_array;
> -	struct page_frag alloc_frag;
>   };
>   
>   struct tun_flow_entry {
> @@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>   		}
>   		if (tun)
>   			skb_array_cleanup(&tfile->tx_array);
> -		if (tfile->alloc_frag.page)
> -			put_page(tfile->alloc_frag.page);
>   		sock_put(&tfile->sk);
>   	}
>   }
> @@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>   				     struct virtio_net_hdr *hdr,
>   				     int len, int *generic_xdp)
>   {
> -	struct page_frag *alloc_frag = &tfile->alloc_frag;
> +	struct page_frag *alloc_frag = &current->task_frag;
>   	struct sk_buff *skb;
>   	struct bpf_prog *xdp_prog;
>   	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
> @@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>   	tfile->sk.sk_write_space = tun_sock_write_space;
>   	tfile->sk.sk_sndbuf = INT_MAX;
>   
> -	tfile->alloc_frag.page = NULL;
> -
>   	file->private_data = tfile;
>   	INIT_LIST_HEAD(&tfile->next);
>   
>
>
>
>

Tested-by: Jason Wang <jasowang@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet
  2017-08-16  9:17             ` Jason Wang
@ 2017-08-16 16:30               ` David Miller
  0 siblings, 0 replies; 22+ messages in thread
From: David Miller @ 2017-08-16 16:30 UTC (permalink / raw)
  To: jasowang; +Cc: mst, eric.dumazet, netdev, linux-kernel, kubakici

From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 Aug 2017 17:17:45 +0800

> It looks like full page allocation just produces too much stress on the
> page allocator.
> 
> I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches
> attached.

Yes, this is why drivers doing XDP tend to drift towards implementing
a local cache of pages.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2017-08-16 16:30 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-11 11:41 [PATCH net-next V2 0/3] XDP support for tap Jason Wang
2017-08-11 11:41 ` [PATCH net-next V2 1/3] tap: use build_skb() for small packet Jason Wang
2017-08-16  3:45   ` Eric Dumazet
2017-08-16  3:55     ` Michael S. Tsirkin
2017-08-16  3:57       ` Jason Wang
2017-08-16  3:59         ` Michael S. Tsirkin
2017-08-16  4:07           ` Jason Wang
2017-08-16  9:17             ` Jason Wang
2017-08-16 16:30               ` David Miller
2017-08-16  3:55     ` Jason Wang
2017-08-16 10:24       ` Eric Dumazet
2017-08-16 13:16         ` Jason Wang
2017-08-11 11:41 ` [PATCH net-next V2 2/3] net: export some generic xdp helpers Jason Wang
2017-08-11 11:41 ` [PATCH net-next V2 3/3] tap: XDP support Jason Wang
2017-08-11 23:12   ` Jakub Kicinski
2017-08-12  2:48     ` Jason Wang
2017-08-14 16:01       ` Michael S. Tsirkin
2017-08-15  5:02         ` Jason Wang
2017-08-16  3:45           ` Michael S. Tsirkin
2017-08-14  8:43   ` Daniel Borkmann
2017-08-15  4:55     ` Jason Wang
2017-08-14  2:56 ` [PATCH net-next V2 0/3] XDP support for tap David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.