[net-next,09/11] tuntap: accept an array of XDP buffs through sendmsg()
diff mbox series

Message ID 20180906040526.22518-10-jasowang@redhat.com
State New, archived
Headers show
Series
  • Vhost_net TX batching
Related show

Commit Message

Jason Wang Sept. 6, 2018, 4:05 a.m. UTC
This patch implements the TUN_MSG_PTR msg_control type. This type allows
the caller to pass an array of XDP buffs to tuntap through the ptr field
of struct tun_msg_ctl. If an XDP program is attached, tuntap can run
the XDP program directly. If not, tuntap will build an skb and do a fast
receive, since part of the work has already been done by vhost_net.

This avoids many indirect calls, which improves icache utilization, and
allows batched XDP flushing when doing XDP redirection.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 100 insertions(+), 3 deletions(-)

Comments

Michael S. Tsirkin Sept. 6, 2018, 5:51 p.m. UTC | #1
On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
> This patch implement TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. If an XDP program is attached, tuntap can run
> XDP program directly. If not, tuntap will build skb and do a fast
> receiving since part of the work has been done by vhost_net.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Is most of the benefit in batched flushing or skipping
indirect calls? Because if it's flushing we can gain
most of it easily by adding an analog of xmit_more.

> ---
>  drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c839a4bdcbd9..069db2e5dd08 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
>  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>  }
>  
> +static int tun_xdp_one(struct tun_struct *tun,
> +		       struct tun_file *tfile,
> +		       struct xdp_buff *xdp, int *flush)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	struct tun_pcpu_stats *stats;
> +	struct bpf_prog *xdp_prog;
> +	struct sk_buff *skb = NULL;
> +	u32 rxhash = 0, act;
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int err = 0;
> +	bool skb_xdp = false;
> +
> +	xdp_prog = rcu_dereference(tun->xdp_prog);
> +	if (xdp_prog) {
> +		if (gso->gso_type) {
> +			skb_xdp = true;
> +			goto build;
> +		}
> +		xdp_set_data_meta_invalid(xdp);
> +		xdp->rxq = &tfile->xdp_rxq;
> +		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
> +		if (err)
> +			goto out;
> +		if (act == XDP_REDIRECT)
> +			*flush = true;
> +		if (act != XDP_PASS)
> +			goto out;
> +	}
> +
> +build:
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
> +		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
> +		kfree_skb(skb);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	skb->protocol = eth_type_trans(skb, tun->dev);
> +	skb_reset_network_header(skb);
> +	skb_probe_transport_header(skb, 0);
> +
> +	if (skb_xdp) {
> +		err = do_xdp_generic(xdp_prog, skb);
> +		if (err != XDP_PASS)
> +			goto out;
> +	}
> +
> +	if (!rcu_dereference(tun->steering_prog))
> +		rxhash = __skb_get_hash_symmetric(skb);
> +
> +	netif_receive_skb(skb);
> +
> +	stats = get_cpu_ptr(tun->pcpu_stats);
> +	u64_stats_update_begin(&stats->syncp);
> +	stats->rx_packets++;
> +	stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&stats->syncp);
> +	put_cpu_ptr(stats);
> +
> +	if (rxhash)
> +		tun_flow_update(tun, rxhash, tfile);
> +
> +out:
> +	return err;
> +}
> +
>  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	struct tun_struct *tun = tun_get(tfile);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
>  
>  	if (!tun)
>  		return -EBADFD;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		int n = ctl->type >> 16;
> +		int flush = 0;
> +
> +		local_bh_disable();
> +		rcu_read_lock();
> +
> +		for (i = 0; i < n; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tun_xdp_one(tun, tfile, xdp, &flush);
> +		}
> +
> +		if (flush)
> +			xdp_do_flush_map();
> +
> +		rcu_read_unlock();
> +		local_bh_enable();
> +
> +		ret = total_len;
> +		goto out;
> +	}
>  
>  	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			   m->msg_flags & MSG_DONTWAIT,
>  			   m->msg_flags & MSG_MORE);
> +out:
>  	tun_put(tun);
>  	return ret;
>  }
> -- 
> 2.17.1
Jason Wang Sept. 7, 2018, 7:33 a.m. UTC | #2
On 2018年09月07日 01:51, Michael S. Tsirkin wrote:
> On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
>> This patch implement TUN_MSG_PTR msg_control type. This type allows
>> the caller to pass an array of XDP buffs to tuntap through ptr field
>> of the tun_msg_control. If an XDP program is attached, tuntap can run
>> XDP program directly. If not, tuntap will build skb and do a fast
>> receiving since part of the work has been done by vhost_net.
>>
>> This will avoid lots of indirect calls thus improves the icache
>> utilization and allows to do XDP batched flushing when doing XDP
>> redirection.
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> Is most of the benefit in batched flushing or skipping
> indirect calls? Because if it's flushing we can gain
> most of it easily by adding an analog of xmit_more.
>

It should be both: XDP_DROP doesn't flush, yet it still gives an obvious improvement.

Thanks

Patch
diff mbox series

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c839a4bdcbd9..069db2e5dd08 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2424,22 +2424,146 @@  static void tun_sock_write_space(struct sock *sk)
 	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
 }
 
+/* Feed one XDP buffer received through sendmsg() into the RX path.
+ *
+ * The buffer starts with an int holding the total buffer length, directly
+ * followed by a struct virtio_net_hdr; both are read from
+ * xdp->data_hard_start.
+ *
+ * With an XDP program attached, non-GSO packets are handed to tun_do_xdp()
+ * and *flush is set when the verdict is XDP_REDIRECT. GSO packets are turned
+ * into an skb first and run through do_xdp_generic(). Packets that pass are
+ * delivered with netif_receive_skb().
+ *
+ * tun_sendmsg() calls this with BH disabled and under rcu_read_lock().
+ *
+ * Returns 0 if the packet was delivered or consumed by the XDP program,
+ * otherwise the error set by tun_do_xdp(), -ENOMEM or -EINVAL.
+ */
+static int tun_xdp_one(struct tun_struct *tun,
+		       struct tun_file *tfile,
+		       struct xdp_buff *xdp, int *flush)
+{
+	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+	struct tun_pcpu_stats *stats;
+	struct bpf_prog *xdp_prog;
+	struct sk_buff *skb = NULL;
+	u32 rxhash = 0, act;
+	int buflen = *(int *)xdp->data_hard_start;
+	unsigned int len;
+	int err = 0;
+	bool skb_xdp = false;
+
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	if (xdp_prog) {
+		if (gso->gso_type) {
+			/* GSO needs an skb: run generic XDP after build */
+			skb_xdp = true;
+			goto build;
+		}
+		xdp_set_data_meta_invalid(xdp);
+		xdp->rxq = &tfile->xdp_rxq;
+		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
+		if (err)
+			goto out;
+		if (act == XDP_REDIRECT)
+			*flush = true;
+		if (act != XDP_PASS)
+			goto out;
+	}
+
+build:
+	skb = build_skb(xdp->data_hard_start, buflen);
+	if (!skb) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	skb_put(skb, xdp->data_end - xdp->data);
+
+	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+		kfree_skb(skb);
+		err = -EINVAL;
+		goto out;
+	}
+
+	skb->protocol = eth_type_trans(skb, tun->dev);
+	skb_reset_network_header(skb);
+	skb_probe_transport_header(skb, 0);
+
+	if (skb_xdp) {
+		/* do_xdp_generic() returns an XDP action, not an errno; keep
+		 * it out of err so a non-PASS verdict is not reported as a
+		 * positive error code.
+		 */
+		if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
+			goto out;
+	}
+
+	if (!rcu_dereference(tun->steering_prog))
+		rxhash = __skb_get_hash_symmetric(skb);
+
+	/* The skb must not be touched after netif_receive_skb(), so
+	 * sample its length for the byte counter beforehand.
+	 */
+	len = skb->len;
+	netif_receive_skb(skb);
+
+	stats = get_cpu_ptr(tun->pcpu_stats);
+	u64_stats_update_begin(&stats->syncp);
+	stats->rx_packets++;
+	stats->rx_bytes += len;
+	u64_stats_update_end(&stats->syncp);
+	put_cpu_ptr(stats);
+
+	if (rxhash)
+		tun_flow_update(tun, rxhash, tfile);
+
+out:
+	return err;
+}
+
 static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
-	int ret;
+	int ret, i;
 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	struct tun_struct *tun = tun_get(tfile);
 	struct tun_msg_ctl *ctl = m->msg_control;
+	struct xdp_buff *xdp;
 
 	if (!tun)
 		return -EBADFD;
 
-	if (ctl && ctl->type != TUN_MSG_UBUF)
-		return -EINVAL;
+	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+		/* Bits 16 and up of type hold the number of XDP buffs */
+		int n = ctl->type >> 16;
+		int flush = 0;
+
+		local_bh_disable();
+		rcu_read_lock();
+
+		for (i = 0; i < n; i++) {
+			xdp = &((struct xdp_buff *)ctl->ptr)[i];
+			tun_xdp_one(tun, tfile, xdp, &flush);
+		}
+
+		if (flush)
+			xdp_do_flush_map();
+
+		rcu_read_unlock();
+		local_bh_enable();
+
+		/* Per-buffer errors are not reported to the caller */
+		ret = total_len;
+		goto out;
+	}
 
 	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
+out:
 	tun_put(tun);
 	return ret;
 }