* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
From: Michael S. Tsirkin @ 2012-06-25  8:25 UTC
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> This patch adds multiqueue support for the tap device. This is done by
> abstracting each queue as a file/socket and allowing multiple sockets to be
> attached to the tuntap device (an array of tun_file is stored in the
> tun_struct). Userspace could write to and read from those files to do
> parallel packet sending/receiving.
> 
> Unlike the previous single queue implementation, the socket and device are
> loosely coupled, and each of them is allowed to go away first. In order to
> make the tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX
> to synchronize between the data path and system calls.

Don't use LLTX/RCU. It's not worth it.
Use something like netif_set_real_num_tx_queues.
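
For reference, a minimal sketch of that approach (not from this patch;
both helpers are existing kernel APIs, and the queue-count update must
run under RTNL):

	dev = alloc_netdev_mq(sizeof(struct tun_struct), "tap%d",
			      tun_setup, MAX_TAP_QUEUES);
	if (!dev)
		return -ENOMEM;

	/* then, on every queue attach/detach: */
	err = netif_set_real_num_tx_queues(dev, tun->numqueues);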

> 
> The tx queue selection is first based on the recorded rxq index of an skb; if
> there is no such index, the queue is chosen based on rx hashing
> (skb_get_rxhash()).
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Interestingly macvtap switched to hashing first:
ef0002b577b52941fb147128f30bd1ecfdd3ff6d
(the commit log is corrupted but see what it
does in the patch).
Any idea why?
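
For reference, the order there looks roughly like this (paraphrased
from macvtap_get_queue() as of that commit, so treat it as a sketch):

	/* flow hash first ... */
	rxq = skb_get_rxhash(skb);
	if (rxq) {
		tap = rcu_dereference(vlan->taps[rxq % numvtaps]);
		if (tap)
			goto out;
	}

	/* ... recorded rx queue only as a fallback */
	if (likely(skb_rx_queue_recorded(skb))) {
		rxq = skb_get_rx_queue(skb);
		while (unlikely(rxq >= numvtaps))
			rxq -= numvtaps;
		tap = rcu_dereference(vlan->taps[rxq]);
	}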

> ---
>  drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>  1 files changed, 232 insertions(+), 139 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 8233b0a..5c26757 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -107,6 +107,8 @@ struct tap_filter {
>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>  };
>  
> +#define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)

Why the limit? I am guessing you copied this from macvtap?
This is problematic for a number of reasons:
	- will not play well with migration
	- will not work well for a large guest

Yes, macvtap needs to be fixed too.

I am guessing what it is trying to prevent is queueing
up a huge number of packets?
So just divide the default tx queue limit by the # of queues.

And by the way, for MQ applications maybe we can finally
ignore the tx queue length altogether and limit the total number
of bytes queued?
To avoid regressions we can make it large, like 64M / # of queues.
Could be a separate patch I think, and a single queue
might need a compatible mode, though I am not sure.
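
Concretely, something like this (the helper name is made up for
illustration):

	/* per-queue byte budget: the total bound stays constant as
	 * queues are added; 64M default to avoid regressions */
	static unsigned int tun_queue_bytes(struct tun_struct *tun)
	{
		return (64 << 20) / max_t(unsigned int, 1,
					  tun->numqueues);
	}

or, for the simpler packet-count version, compare the backlog against
dev->tx_queue_len / tun->numqueues instead of dev->tx_queue_len.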

> +
>  struct tun_file {
>  	struct sock sk;
>  	struct socket socket;
> @@ -114,16 +116,18 @@ struct tun_file {
>  	int vnet_hdr_sz;
>  	struct tap_filter txflt;
>  	atomic_t count;
> -	struct tun_struct *tun;
> +	struct tun_struct __rcu *tun;
>  	struct net *net;
>  	struct fasync_struct *fasync;
>  	unsigned int flags;
> +	u16 queue_index;
>  };
>  
>  struct tun_sock;
>  
>  struct tun_struct {
> -	struct tun_file		*tfile;
> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
> +	unsigned int            numqueues;
>  	unsigned int 		flags;
>  	uid_t			owner;
>  	gid_t			group;
> @@ -138,80 +142,159 @@ struct tun_struct {
>  #endif
>  };
>  
> -static int tun_attach(struct tun_struct *tun, struct file *file)
> +static DEFINE_SPINLOCK(tun_lock);
> +
> +/*
> + * tun_get_queue(): calculate the queue index
> + *     - if skbs comes from mq nics, we can just borrow
> + *     - if not, calculate from the hash
> + */
> +static struct tun_file *tun_get_queue(struct net_device *dev,
> +				      struct sk_buff *skb)
>  {
> -	struct tun_file *tfile = file->private_data;
> -	int err;
> +	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile = NULL;
> +	int numqueues = tun->numqueues;
> +	__u32 rxq;
>  
> -	ASSERT_RTNL();
> +	BUG_ON(!rcu_read_lock_held());
>  
> -	netif_tx_lock_bh(tun->dev);
> +	if (!numqueues)
> +		goto out;
>  
> -	err = -EINVAL;
> -	if (tfile->tun)
> +	if (numqueues == 1) {
> +		tfile = rcu_dereference(tun->tfiles[0]);

Instead of hacks like this, you can ask for an MQ
flag to be set in SETIFF. Then you won't need to
handle attach/detach at random times.
And most of the scary num_queues checks can go away.
You can then also ask userspace about the max # of queues
to expect if you want to save some memory.
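
Something along these lines, where the queue-count parameter is purely
hypothetical and only meant to illustrate the idea:

	/* hypothetical: userspace declares max_queues at SETIFF time,
	 * so tun->tfiles becomes an exactly-sized pointer array
	 * instead of a fixed NR_CPUS/16-bounded one */
	tun->tfiles = kcalloc(max_queues, sizeof(*tun->tfiles),
			      GFP_KERNEL);
	if (!tun->tfiles)
		return -ENOMEM;
	tun->max_queues = max_queues;	/* hypothetical member */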


>  		goto out;
> +	}
>  
> -	err = -EBUSY;
> -	if (tun->tfile)
> +	if (likely(skb_rx_queue_recorded(skb))) {
> +		rxq = skb_get_rx_queue(skb);
> +
> +		while (unlikely(rxq >= numqueues))
> +			rxq -= numqueues;
> +
> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>  		goto out;
> +	}
>  
> -	err = 0;
> -	tfile->tun = tun;
> -	tun->tfile = tfile;
> -	netif_carrier_on(tun->dev);
> -	dev_hold(tun->dev);
> -	sock_hold(&tfile->sk);
> -	atomic_inc(&tfile->count);
> +	/* Check if we can use flow to select a queue */
> +	rxq = skb_get_rxhash(skb);
> +	if (rxq) {
> +		u32 idx = ((u64)rxq * numqueues) >> 32;

This completely confuses me. What's the logic here?
How do we even know it's in range?
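
For reference, this looks like the standard fixed-point scaling trick:
rxq is a full 32-bit hash, so (u64)rxq * numqueues is at most
(2^32 - 1) * numqueues, and the top 32 bits are therefore always in
[0, numqueues). As a self-contained sketch:

	/* map a 32-bit hash uniformly onto [0, n) without a modulo */
	static inline u32 hash_to_queue(u32 hash, u32 n)
	{
		return ((u64)hash * n) >> 32;
	}

A comment in the code spelling this out would help.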

> +		tfile = rcu_dereference(tun->tfiles[idx]);
> +		goto out;
> +	}
>  
> +	tfile = rcu_dereference(tun->tfiles[0]);
>  out:
> -	netif_tx_unlock_bh(tun->dev);
> -	return err;
> +	return tfile;
>  }
>  
> -static void __tun_detach(struct tun_struct *tun)
> +static int tun_detach(struct tun_file *tfile, bool clean)
>  {
> -	struct tun_file *tfile = tun->tfile;
> -	/* Detach from net device */
> -	netif_tx_lock_bh(tun->dev);
> -	netif_carrier_off(tun->dev);
> -	tun->tfile = NULL;
> -	netif_tx_unlock_bh(tun->dev);
> -
> -	/* Drop read queue */
> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
> -
> -	/* Drop the extra count on the net device */
> -	dev_put(tun->dev);
> -}
> +	struct tun_struct *tun;
> +	struct net_device *dev = NULL;
> +	bool destroy = false;
>  
> -static void tun_detach(struct tun_struct *tun)
> -{
> -	rtnl_lock();
> -	__tun_detach(tun);
> -	rtnl_unlock();
> -}
> +	spin_lock(&tun_lock);
>  
> -static struct tun_struct *__tun_get(struct tun_file *tfile)
> -{
> -	struct tun_struct *tun = NULL;
> +	tun = rcu_dereference_protected(tfile->tun,
> +					lockdep_is_held(&tun_lock));
> +	if (tun) {
> +		u16 index = tfile->queue_index;
> +		BUG_ON(index >= tun->numqueues);
> +		dev = tun->dev;
> +
> +		rcu_assign_pointer(tun->tfiles[index],
> +				   tun->tfiles[tun->numqueues - 1]);
> +		tun->tfiles[index]->queue_index = index;
> +		rcu_assign_pointer(tfile->tun, NULL);
> +		--tun->numqueues;
> +		sock_put(&tfile->sk);
>  
> -	if (atomic_inc_not_zero(&tfile->count))
> -		tun = tfile->tun;
> +		if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
> +			destroy = true;

Please don't use flags like that. Use dedicated labels and goto there on error.
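
I.e. roughly this shape (sketch only; the 'clean' handling and the
queue bookkeeping from the patch are elided):

	spin_lock(&tun_lock);
	tun = rcu_dereference_protected(tfile->tun,
					lockdep_is_held(&tun_lock));
	if (!tun)
		goto out_unlock;
	dev = tun->dev;
	/* ... detach bookkeeping as in the patch ... */
	if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
		goto out_destroy;

out_unlock:
	spin_unlock(&tun_lock);
	return 0;

out_destroy:
	spin_unlock(&tun_lock);
	synchronize_rcu();
	rtnl_lock();
	if (dev->reg_state == NETREG_REGISTERED)
		unregister_netdevice(dev);
	rtnl_unlock();
	return 0;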


> +	}
>  
> -	return tun;
> +	spin_unlock(&tun_lock);
> +
> +	synchronize_rcu();
> +	if (clean)
> +		sock_put(&tfile->sk);
> +
> +	if (destroy) {
> +		rtnl_lock();
> +		if (dev->reg_state == NETREG_REGISTERED)
> +			unregister_netdevice(dev);
> +		rtnl_unlock();
> +	}
> +
> +	return 0;
>  }
>  
> -static struct tun_struct *tun_get(struct file *file)
> +static void tun_detach_all(struct net_device *dev)
>  {
> -	return __tun_get(file->private_data);
> +	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> +	int i, j = 0;
> +
> +	spin_lock(&tun_lock);
> +
> +	for (i = 0; i < MAX_TAP_QUEUES && tun->numqueues; i++) {
> +		tfile = rcu_dereference_protected(tun->tfiles[i],
> +						lockdep_is_held(&tun_lock));
> +		BUG_ON(!tfile);
> +		wake_up_all(&tfile->wq.wait);
> +		tfile_list[j++] = tfile;
> +		rcu_assign_pointer(tfile->tun, NULL);
> +		--tun->numqueues;
> +	}
> +	BUG_ON(tun->numqueues != 0);
> +	/* guarantee that any future tun_attach will fail */
> +	tun->numqueues = MAX_TAP_QUEUES;
> +	spin_unlock(&tun_lock);
> +
> +	synchronize_rcu();
> +	for (--j; j >= 0; j--)
> +		sock_put(&tfile_list[j]->sk);
>  }
>  
> -static void tun_put(struct tun_struct *tun)
> +static int tun_attach(struct tun_struct *tun, struct file *file)
>  {
> -	struct tun_file *tfile = tun->tfile;
> +	struct tun_file *tfile = file->private_data;
> +	int err;
> +
> +	ASSERT_RTNL();
> +
> +	spin_lock(&tun_lock);
>  
> -	if (atomic_dec_and_test(&tfile->count))
> -		tun_detach(tfile->tun);
> +	err = -EINVAL;
> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
> +		goto out;
> +
> +	err = -EBUSY;
> +	if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
> +		goto out;
> +
> +	if (tun->numqueues == MAX_TAP_QUEUES)
> +		goto out;
> +
> +	err = 0;
> +	tfile->queue_index = tun->numqueues;
> +	rcu_assign_pointer(tfile->tun, tun);
> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> +	sock_hold(&tfile->sk);
> +	tun->numqueues++;
> +
> +	if (tun->numqueues == 1)
> +		netif_carrier_on(tun->dev);
> +
> +	/* device is allowed to go away first, so no need to hold extra
> +	 * refcnt. */
> +
> +out:
> +	spin_unlock(&tun_lock);
> +	return err;
>  }
>  
>  /* TAP filtering */
> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>  /* Net device detach from fd. */
>  static void tun_net_uninit(struct net_device *dev)
>  {
> -	struct tun_struct *tun = netdev_priv(dev);
> -	struct tun_file *tfile = tun->tfile;
> -
> -	/* Inform the methods they need to stop using the dev.
> -	 */
> -	if (tfile) {
> -		wake_up_all(&tfile->wq.wait);
> -		if (atomic_dec_and_test(&tfile->count))
> -			__tun_detach(tun);
> -	}
> +	tun_detach_all(dev);
>  }
>  
>  /* Net device open. */
> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>  /* Net device start xmit */
>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct tun_struct *tun = netdev_priv(dev);
> -	struct tun_file *tfile = tun->tfile;
> +	struct tun_file *tfile = NULL;
>  
> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
> +	rcu_read_lock();
> +	tfile = tun_get_queue(dev, skb);
>  
>  	/* Drop packet if interface is not attached */
>  	if (!tfile)
> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  
>  	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>  	    >= dev->tx_queue_len) {
> -		if (!(tun->flags & TUN_ONE_QUEUE)) {
> +		if (!(tfile->flags & TUN_ONE_QUEUE) &&

Which patch moved flags from tun to tfile?

> +		    !(tfile->flags & TUN_TAP_MQ)) {
>  			/* Normal queueing mode. */
>  			/* Packet scheduler handles dropping of further packets. */
>  			netif_stop_queue(dev);
> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  			 * error is more appropriate. */
>  			dev->stats.tx_fifo_errors++;
>  		} else {
> -			/* Single queue mode.
> +			/* Single queue mode or multi queue mode.
>  			 * Driver handles dropping of all packets itself. */

Please don't do this. Stop the queue on overrun as appropriate.
ONE_QUEUE is a legacy hack.

BTW we really should stop queue before we start dropping packets,
but that can be a separate patch.
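
That ordering would look something like this (sketch, not from the
patch): enqueue first, then stop the queue once the limit is hit, so
the packet that reaches the limit is queued rather than dropped; the
read side already calls netif_wake_queue() after a dequeue:

	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
	    >= dev->tx_queue_len)
		netif_stop_queue(dev);	/* qdisc holds further skbs */
	return NETDEV_TX_OK;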

>  			goto drop;
>  		}
> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>  	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>  				   POLLRDNORM | POLLRDBAND);
> +	rcu_read_unlock();
>  	return NETDEV_TX_OK;
>  
>  drop:
> +	rcu_read_unlock();
>  	dev->stats.tx_dropped++;
>  	kfree_skb(skb);
>  	return NETDEV_TX_OK;
> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>  static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>  {
>  	struct tun_file *tfile = file->private_data;
> -	struct tun_struct *tun = __tun_get(tfile);
> +	struct tun_struct *tun = NULL;
>  	struct sock *sk;
>  	unsigned int mask = 0;
>  
> -	if (!tun)
> +	if (!tfile)
>  		return POLLERR;
>  
> -	sk = tfile->socket.sk;
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
> +	if (!tun) {
> +		rcu_read_unlock();
> +		return POLLERR;
> +	}
> +	rcu_read_unlock();
>  
> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
> +	sk = &tfile->sk;
>  
>  	poll_wait(file, &tfile->wq.wait, wait);
>  
> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>  	     sock_writeable(sk)))
>  		mask |= POLLOUT | POLLWRNORM;
>  
> -	if (tun->dev->reg_state != NETREG_REGISTERED)
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>  		mask = POLLERR;
> +	rcu_read_unlock();
>  
> -	tun_put(tun);
>  	return mask;
>  }
>  
> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>  		skb_shinfo(skb)->gso_segs = 0;
>  	}
>  
> -	tun = __tun_get(tfile);
> -	if (!tun)
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
> +	if (!tun) {
> +		rcu_read_unlock();
>  		return -EBADFD;
> +	}
>  
>  	switch (tfile->flags & TUN_TYPE_MASK) {
>  	case TUN_TUN_DEV:
> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>  		skb->protocol = eth_type_trans(skb, tun->dev);
>  		break;
>  	}
> -
> -	netif_rx_ni(skb);
>  	tun->dev->stats.rx_packets++;
>  	tun->dev->stats.rx_bytes += len;
> -	tun_put(tun);
> +	rcu_read_unlock();
> +
> +	netif_rx_ni(skb);
> +
>  	return count;
>  
>  err_free:
>  	count = -EINVAL;
>  	kfree_skb(skb);
>  err:
> -	tun = __tun_get(tfile);
> -	if (!tun)
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
> +	if (!tun) {
> +		rcu_read_unlock();
>  		return -EBADFD;
> +	}
>  
>  	if (drop)
>  		tun->dev->stats.rx_dropped++;
>  	if (error)
>  		tun->dev->stats.rx_frame_errors++;
> -	tun_put(tun);
> +	rcu_read_unlock();
>  	return count;
>  }
>  
> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>  	total += skb->len;
>  
> -	tun = __tun_get(tfile);
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
>  	if (tun) {
>  		tun->dev->stats.tx_packets++;
>  		tun->dev->stats.tx_bytes += len;
> -		tun_put(tun);
>  	}
> +	rcu_read_unlock();
>  
>  	return total;
>  }
> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>  				break;
>  			}
>  
> -			tun = __tun_get(tfile);
> +			rcu_read_lock();
> +			tun = rcu_dereference(tfile->tun);
>  			if (!tun) {
> -				ret = -EIO;
> +				ret = -EBADFD;

EBADFD is for when you get passed something like a -1 fd.
Here the fd is OK, it's just in a bad state, so you cannot do IO.


> +				rcu_read_unlock();
>  				break;
>  			}
>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
>  				ret = -EIO;
> -				tun_put(tun);
> +				rcu_read_unlock();
>  				break;
>  			}
> -			tun_put(tun);
> +			rcu_read_unlock();
>  
>  			/* Nothing to read, let's sleep */
>  			schedule();
>  			continue;
>  		}
>  
> -		tun = __tun_get(tfile);
> +		rcu_read_lock();
> +		tun = rcu_dereference(tfile->tun);
>  		if (tun) {
>  			netif_wake_queue(tun->dev);
> -			tun_put(tun);
>  		}
> +		rcu_read_unlock();
>  
>  		ret = tun_put_user(tfile, skb, iv, len);
>  		kfree_skb(skb);
> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>  	if (tun->flags & TUN_VNET_HDR)
>  		flags |= IFF_VNET_HDR;
>  
> +	if (tun->flags & TUN_TAP_MQ)
> +		flags |= IFF_MULTI_QUEUE;
> +
>  	return flags;
>  }
>  
> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		err = tun_attach(tun, file);
>  		if (err < 0)
>  			return err;
> -	}
> -	else {
> +	} else {
>  		char *name;
>  		unsigned long flags = 0;
>  
> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>  			TUN_USER_FEATURES;
>  		dev->features = dev->hw_features;
> +		if (ifr->ifr_flags & IFF_MULTI_QUEUE)
> +			dev->features |= NETIF_F_LLTX;
>  
>  		err = register_netdevice(tun->dev);
>  		if (err < 0)
> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  
>  		err = tun_attach(tun, file);
>  		if (err < 0)
> -			goto failed;
> +			goto err_free_dev;
>  	}
>  
>  	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  	else
>  		tun->flags &= ~TUN_VNET_HDR;
>  
> +	if (ifr->ifr_flags & IFF_MULTI_QUEUE)
> +		tun->flags |= TUN_TAP_MQ;
> +	else
> +		tun->flags &= ~TUN_TAP_MQ;
> +
>  	/* Cache flags from tun device */
>  	tfile->flags = tun->flags;
>  	/* Make sure persistent devices do not get stuck in
> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  
>  err_free_dev:
>  	free_netdev(dev);
> -failed:
>  	return err;
>  }
>  
> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  				(unsigned int __user*)argp);
>  	}
>  
> -	rtnl_lock();
> -
> -	tun = __tun_get(tfile);
> -	if (cmd == TUNSETIFF && !tun) {
> +	ret = 0;
> +	if (cmd == TUNSETIFF) {
> +		rtnl_lock();
>  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> -
>  		ret = tun_set_iff(tfile->net, file, &ifr);
> -
> +		rtnl_unlock();
>  		if (ret)
> -			goto unlock;
> -
> +			return ret;
>  		if (copy_to_user(argp, &ifr, ifreq_len))
> -			ret = -EFAULT;
> -		goto unlock;
> +			return -EFAULT;
> +		return ret;
>  	}
>  
> +	rtnl_lock();
> +
> +	rcu_read_lock();
> +
>  	ret = -EBADFD;
> +	tun = rcu_dereference(tfile->tun);
>  	if (!tun)
>  		goto unlock;
> +	else
> +		ret = 0;
>  
> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
> -
> -	ret = 0;
>  	switch (cmd) {
>  	case TUNGETIFF:
>  		ret = tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
> +		rcu_read_unlock();
>  		if (ret)
> -			break;
> +			goto out;
>  
>  		if (copy_to_user(argp, &ifr, ifreq_len))
>  			ret = -EFAULT;
> -		break;
> +		goto out;
>  
>  	case TUNSETNOCSUM:
>  		/* Disable/Enable checksum */
> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		/* Get hw address */
>  		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>  		ifr.ifr_hwaddr.sa_family = tun->dev->type;
> +		rcu_read_unlock();
>  		if (copy_to_user(argp, &ifr, ifreq_len))
>  			ret = -EFAULT;
> -		break;
> +		goto out;
>  
>  	case SIOCSIFHWADDR:
>  		/* Set hw address */
> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  	}
>  
>  unlock:
> +	rcu_read_unlock();
> +out:
>  	rtnl_unlock();
> -	if (tun)
> -		tun_put(tun);
>  	return ret;
>  }
>  
> @@ -1517,6 +1624,11 @@ out:
>  	return ret;
>  }
>  
> +static void tun_sock_destruct(struct sock *sk)
> +{
> +	skb_queue_purge(&sk->sk_receive_queue);
> +}
> +
>  static int tun_chr_open(struct inode *inode, struct file * file)
>  {
>  	struct net *net = current->nsproxy->net_ns;
> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  	sock_init_data(&tfile->socket, &tfile->sk);
>  
>  	tfile->sk.sk_write_space = tun_sock_write_space;
> +	tfile->sk.sk_destruct = tun_sock_destruct;
>  	tfile->sk.sk_sndbuf = INT_MAX;
>  	file->private_data = tfile;
>  
> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  static int tun_chr_close(struct inode *inode, struct file *file)
>  {
>  	struct tun_file *tfile = file->private_data;
> -	struct tun_struct *tun;
> -
> -	tun = __tun_get(tfile);
> -	if (tun) {
> -		struct net_device *dev = tun->dev;
> -
> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
> -
> -		__tun_detach(tun);
> -
> -		/* If desirable, unregister the netdevice. */
> -		if (!(tun->flags & TUN_PERSIST)) {
> -			rtnl_lock();
> -			if (dev->reg_state == NETREG_REGISTERED)
> -				unregister_netdevice(dev);
> -			rtnl_unlock();
> -		}
>  
> -		/* drop the reference that netdevice holds */
> -		sock_put(&tfile->sk);
> -
> -	}
> -
> -	/* drop the reference that file holds */
> -	sock_put(&tfile->sk);
> +	tun_detach(tfile, true);
>  
>  	return 0;
>  }
> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>   * holding a reference to the file for as long as the socket is in use. */
>  struct socket *tun_get_socket(struct file *file)
>  {
> -	struct tun_struct *tun;
> +	struct tun_struct *tun = NULL;
>  	struct tun_file *tfile = file->private_data;
>  	if (file->f_op != &tun_fops)
>  		return ERR_PTR(-EINVAL);
> -	tun = tun_get(file);
> -	if (!tun)
> +	rcu_read_lock();
> +	tun = rcu_dereference(tfile->tun);
> +	if (!tun) {
> +		rcu_read_unlock();
>  		return ERR_PTR(-EBADFD);
> -	tun_put(tun);
> +	}
> +	rcu_read_unlock();
>  	return &tfile->socket;
>  }
>  EXPORT_SYMBOL_GPL(tun_get_socket);


* Re: [net-next RFC V3 PATCH 1/6] tuntap: move socket to tun_file
From: Michael S. Tsirkin @ 2012-06-25  8:27 UTC
  To: Jason Wang
  Cc: netdev, linux-kernel, krkumar2, tahm, akong, davem, shemminger, mashirle

On Mon, Jun 25, 2012 at 02:09:45PM +0800, Jason Wang wrote:
> This patch moves the socket structure from tun_device to tun_file in order to
> make it possible for multiple sockets to be attached to a tun/tap device. The
> reference between the tap device and the socket is set up during TUNSETIFF as
> usual.
> 
> After this patch, we can go further towards multiqueue tun/tap support by
> storing an array of tun_file pointers in tun_device.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

I think this changes visible userspace
behaviour for persistent devices.

Specifically, with this patch, TUNSETSNDBUF and TUNATTACHFILTER won't
be effective if you close and reopen the device, right?

It's possible that no application uses either of these
ioctls on persistent tun devices at the moment,
but it seems safer to avoid changing such behaviour.
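
A userspace fragment showing the behaviour in question (all ioctls
below already exist; error handling omitted):

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	struct ifreq ifr;
	int fd, sndbuf = 1 << 20;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;

	fd = open("/dev/net/tun", O_RDWR);
	ioctl(fd, TUNSETIFF, &ifr);
	ioctl(fd, TUNSETPERSIST, 1);
	ioctl(fd, TUNSETSNDBUF, &sndbuf);
	close(fd);

	fd = open("/dev/net/tun", O_RDWR);
	ioctl(fd, TUNSETIFF, &ifr);	/* re-attach to persistent tap0 */
	ioctl(fd, TUNGETSNDBUF, &sndbuf);
	/* before this patch: still 1M; with the socket moved into
	 * tun_file: back to the default, since the old socket died
	 * with the old fd */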


> ---
>  drivers/net/tun.c |  352 +++++++++++++++++++++++++++--------------------------
>  1 files changed, 181 insertions(+), 171 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 987aeef..1f27789 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -108,9 +108,16 @@ struct tap_filter {
>  };
>  
>  struct tun_file {
> +	struct sock sk;
> +	struct socket socket;
> +	struct socket_wq wq;
> +	int vnet_hdr_sz;
> +	struct tap_filter txflt;
>  	atomic_t count;
>  	struct tun_struct *tun;
>  	struct net *net;
> +	struct fasync_struct *fasync;
> +	unsigned int flags;
>  };
>  
>  struct tun_sock;
> @@ -125,29 +132,12 @@ struct tun_struct {
>  	netdev_features_t	set_features;
>  #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
>  			  NETIF_F_TSO6|NETIF_F_UFO)
> -	struct fasync_struct	*fasync;
> -
> -	struct tap_filter       txflt;
> -	struct socket		socket;
> -	struct socket_wq	wq;
> -
> -	int			vnet_hdr_sz;
>  
>  #ifdef TUN_DEBUG
>  	int debug;
>  #endif
>  };
>  
> -struct tun_sock {
> -	struct sock		sk;
> -	struct tun_struct	*tun;
> -};
> -
> -static inline struct tun_sock *tun_sk(struct sock *sk)
> -{
> -	return container_of(sk, struct tun_sock, sk);
> -}
> -
>  static int tun_attach(struct tun_struct *tun, struct file *file)
>  {
>  	struct tun_file *tfile = file->private_data;
> @@ -168,10 +158,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
>  	err = 0;
>  	tfile->tun = tun;
>  	tun->tfile = tfile;
> -	tun->socket.file = file;
>  	netif_carrier_on(tun->dev);
>  	dev_hold(tun->dev);
> -	sock_hold(tun->socket.sk);
> +	sock_hold(&tfile->sk);
>  	atomic_inc(&tfile->count);
>  
>  out:
> @@ -181,15 +170,15 @@ out:
>  
>  static void __tun_detach(struct tun_struct *tun)
>  {
> +	struct tun_file *tfile = tun->tfile;
>  	/* Detach from net device */
>  	netif_tx_lock_bh(tun->dev);
>  	netif_carrier_off(tun->dev);
>  	tun->tfile = NULL;
> -	tun->socket.file = NULL;
>  	netif_tx_unlock_bh(tun->dev);
>  
>  	/* Drop read queue */
> -	skb_queue_purge(&tun->socket.sk->sk_receive_queue);
> +	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>  
>  	/* Drop the extra count on the net device */
>  	dev_put(tun->dev);
> @@ -348,19 +337,12 @@ static void tun_net_uninit(struct net_device *dev)
>  	/* Inform the methods they need to stop using the dev.
>  	 */
>  	if (tfile) {
> -		wake_up_all(&tun->wq.wait);
> +		wake_up_all(&tfile->wq.wait);
>  		if (atomic_dec_and_test(&tfile->count))
>  			__tun_detach(tun);
>  	}
>  }
>  
> -static void tun_free_netdev(struct net_device *dev)
> -{
> -	struct tun_struct *tun = netdev_priv(dev);
> -
> -	sk_release_kernel(tun->socket.sk);
> -}
> -
>  /* Net device open. */
>  static int tun_net_open(struct net_device *dev)
>  {
> @@ -379,24 +361,26 @@ static int tun_net_close(struct net_device *dev)
>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> +	struct tun_file *tfile = tun->tfile;
>  
>  	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>  
>  	/* Drop packet if interface is not attached */
> -	if (!tun->tfile)
> +	if (!tfile)
>  		goto drop;
>  
>  	/* Drop if the filter does not like it.
>  	 * This is a noop if the filter is disabled.
>  	 * Filter can be enabled only for the TAP devices. */
> -	if (!check_filter(&tun->txflt, skb))
> +	if (!check_filter(&tfile->txflt, skb))
>  		goto drop;
>  
> -	if (tun->socket.sk->sk_filter &&
> -	    sk_filter(tun->socket.sk, skb))
> +	if (tfile->socket.sk->sk_filter &&
> +	    sk_filter(tfile->socket.sk, skb))
>  		goto drop;
>  
> -	if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) {
> +	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
> +	    >= dev->tx_queue_len) {
>  		if (!(tun->flags & TUN_ONE_QUEUE)) {
>  			/* Normal queueing mode. */
>  			/* Packet scheduler handles dropping of further packets. */
> @@ -417,12 +401,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  	skb_orphan(skb);
>  
>  	/* Enqueue packet */
> -	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
> +	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
>  
>  	/* Notify and wake up reader process */
> -	if (tun->flags & TUN_FASYNC)
> -		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
> -	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
> +	if (tfile->flags & TUN_FASYNC)
> +		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> +	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>  				   POLLRDNORM | POLLRDBAND);
>  	return NETDEV_TX_OK;
>  
> @@ -550,11 +534,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>  	if (!tun)
>  		return POLLERR;
>  
> -	sk = tun->socket.sk;
> +	sk = tfile->socket.sk;
>  
>  	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>  
> -	poll_wait(file, &tun->wq.wait, wait);
> +	poll_wait(file, &tfile->wq.wait, wait);
>  
>  	if (!skb_queue_empty(&sk->sk_receive_queue))
>  		mask |= POLLIN | POLLRDNORM;
> @@ -573,11 +557,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>  
>  /* prepad is the amount to reserve at front.  len is length after that.
>   * linear is a hint as to how much to copy (usually headers). */
> -static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
> +static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
>  				     size_t prepad, size_t len,
>  				     size_t linear, int noblock)
>  {
> -	struct sock *sk = tun->socket.sk;
> +	struct sock *sk = tfile->socket.sk;
>  	struct sk_buff *skb;
>  	int err;
>  
> @@ -601,7 +585,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
>  }
>  
>  /* Get packet from user space buffer */
> -static ssize_t tun_get_user(struct tun_struct *tun,
> +static ssize_t tun_get_user(struct tun_file *tfile,
>  			    const struct iovec *iv, size_t count,
>  			    int noblock)
>  {
> @@ -610,8 +594,10 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  	size_t len = count, align = NET_SKB_PAD;
>  	struct virtio_net_hdr gso = { 0 };
>  	int offset = 0;
> +	struct tun_struct *tun = NULL;
> +	bool drop = false, error = false;
>  
> -	if (!(tun->flags & TUN_NO_PI)) {
> +	if (!(tfile->flags & TUN_NO_PI)) {
>  		if ((len -= sizeof(pi)) > count)
>  			return -EINVAL;
>  
> @@ -620,8 +606,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  		offset += sizeof(pi);
>  	}
>  
> -	if (tun->flags & TUN_VNET_HDR) {
> -		if ((len -= tun->vnet_hdr_sz) > count)
> +	if (tfile->flags & TUN_VNET_HDR) {
> +		len -= tfile->vnet_hdr_sz;
> +		if (len > count)
>  			return -EINVAL;
>  
>  		if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
> @@ -633,41 +620,43 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  
>  		if (gso.hdr_len > len)
>  			return -EINVAL;
> -		offset += tun->vnet_hdr_sz;
> +		offset += tfile->vnet_hdr_sz;
>  	}
>  
> -	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
> +	if ((tfile->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
>  		align += NET_IP_ALIGN;
>  		if (unlikely(len < ETH_HLEN ||
>  			     (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
>  			return -EINVAL;
>  	}
>  
> -	skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
> +	skb = tun_alloc_skb(tfile, align, len, gso.hdr_len, noblock);
> +
>  	if (IS_ERR(skb)) {
>  		if (PTR_ERR(skb) != -EAGAIN)
> -			tun->dev->stats.rx_dropped++;
> -		return PTR_ERR(skb);
> +			drop = true;
> +		count = PTR_ERR(skb);
> +		goto err;
>  	}
>  
>  	if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
> -		tun->dev->stats.rx_dropped++;
> +		drop = true;
>  		kfree_skb(skb);
> -		return -EFAULT;
> +		count = -EFAULT;
> +		goto err;
>  	}
>  
>  	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
>  		if (!skb_partial_csum_set(skb, gso.csum_start,
>  					  gso.csum_offset)) {
> -			tun->dev->stats.rx_frame_errors++;
> -			kfree_skb(skb);
> -			return -EINVAL;
> +			error = true;
> +			goto err_free;
>  		}
>  	}
>  
> -	switch (tun->flags & TUN_TYPE_MASK) {
> +	switch (tfile->flags & TUN_TYPE_MASK) {
>  	case TUN_TUN_DEV:
> -		if (tun->flags & TUN_NO_PI) {
> +		if (tfile->flags & TUN_NO_PI) {
>  			switch (skb->data[0] & 0xf0) {
>  			case 0x40:
>  				pi.proto = htons(ETH_P_IP);
> @@ -676,18 +665,15 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  				pi.proto = htons(ETH_P_IPV6);
>  				break;
>  			default:
> -				tun->dev->stats.rx_dropped++;
> -				kfree_skb(skb);
> -				return -EINVAL;
> +				drop = true;
> +				goto err_free;
>  			}
>  		}
>  
>  		skb_reset_mac_header(skb);
>  		skb->protocol = pi.proto;
> -		skb->dev = tun->dev;
>  		break;
>  	case TUN_TAP_DEV:
> -		skb->protocol = eth_type_trans(skb, tun->dev);
>  		break;
>  	}
>  
> @@ -704,9 +690,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
>  			break;
>  		default:
> -			tun->dev->stats.rx_frame_errors++;
> -			kfree_skb(skb);
> -			return -EINVAL;
> +			error = true;
> +			goto err_free;
>  		}
>  
>  		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> @@ -714,9 +699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  
>  		skb_shinfo(skb)->gso_size = gso.gso_size;
>  		if (skb_shinfo(skb)->gso_size == 0) {
> -			tun->dev->stats.rx_frame_errors++;
> -			kfree_skb(skb);
> -			return -EINVAL;
> +			error = true;
> +			goto err_free;
>  		}
>  
>  		/* Header must be checked, and gso_segs computed. */
> @@ -724,11 +708,38 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>  		skb_shinfo(skb)->gso_segs = 0;
>  	}
>  
> -	netif_rx_ni(skb);
> +	tun = __tun_get(tfile);
> +	if (!tun)
> +		return -EBADFD;
>  
> +	switch (tfile->flags & TUN_TYPE_MASK) {
> +	case TUN_TUN_DEV:
> +		skb->dev = tun->dev;
> +		break;
> +	case TUN_TAP_DEV:
> +		skb->protocol = eth_type_trans(skb, tun->dev);
> +		break;
> +	}
> +
> +	netif_rx_ni(skb);
>  	tun->dev->stats.rx_packets++;
>  	tun->dev->stats.rx_bytes += len;
> +	tun_put(tun);
> +	return count;
> +
> +err_free:
> +	count = -EINVAL;
> +	kfree_skb(skb);
> +err:
> +	tun = __tun_get(tfile);
> +	if (!tun)
> +		return -EBADFD;
>  
> +	if (drop)
> +		tun->dev->stats.rx_dropped++;
> +	if (error)
> +		tun->dev->stats.rx_frame_errors++;
> +	tun_put(tun);
>  	return count;
>  }
>  
> @@ -736,30 +747,25 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
>  			      unsigned long count, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> -	struct tun_struct *tun = tun_get(file);
> +	struct tun_file *tfile = file->private_data;
>  	ssize_t result;
>  
> -	if (!tun)
> -		return -EBADFD;
> -
> -	tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
> -
> -	result = tun_get_user(tun, iv, iov_length(iv, count),
> +	result = tun_get_user(tfile, iv, iov_length(iv, count),
>  			      file->f_flags & O_NONBLOCK);
>  
> -	tun_put(tun);
>  	return result;
>  }
>  
>  /* Put packet to the user space buffer */
> -static ssize_t tun_put_user(struct tun_struct *tun,
> +static ssize_t tun_put_user(struct tun_file *tfile,
>  			    struct sk_buff *skb,
>  			    const struct iovec *iv, int len)
>  {
> +	struct tun_struct *tun = NULL;
>  	struct tun_pi pi = { 0, skb->protocol };
>  	ssize_t total = 0;
>  
> -	if (!(tun->flags & TUN_NO_PI)) {
> +	if (!(tfile->flags & TUN_NO_PI)) {
>  		if ((len -= sizeof(pi)) < 0)
>  			return -EINVAL;
>  
> @@ -773,9 +779,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  		total += sizeof(pi);
>  	}
>  
> -	if (tun->flags & TUN_VNET_HDR) {
> +	if (tfile->flags & TUN_VNET_HDR) {
>  		struct virtio_net_hdr gso = { 0 }; /* no info leak */
> -		if ((len -= tun->vnet_hdr_sz) < 0)
> +		len -= tfile->vnet_hdr_sz;
> +		if (len < 0)
>  			return -EINVAL;
>  
>  		if (skb_is_gso(skb)) {
> @@ -818,7 +825,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
>  					       sizeof(gso))))
>  			return -EFAULT;
> -		total += tun->vnet_hdr_sz;
> +		total += tfile->vnet_hdr_sz;
>  	}
>  
>  	len = min_t(int, skb->len, len);
> @@ -826,29 +833,33 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>  	total += skb->len;
>  
> -	tun->dev->stats.tx_packets++;
> -	tun->dev->stats.tx_bytes += len;
> +	tun = __tun_get(tfile);
> +	if (tun) {
> +		tun->dev->stats.tx_packets++;
> +		tun->dev->stats.tx_bytes += len;
> +		tun_put(tun);
> +	}
>  
>  	return total;
>  }
>  
> -static ssize_t tun_do_read(struct tun_struct *tun,
> +static ssize_t tun_do_read(struct tun_file *tfile,
>  			   struct kiocb *iocb, const struct iovec *iv,
>  			   ssize_t len, int noblock)
>  {
>  	DECLARE_WAITQUEUE(wait, current);
>  	struct sk_buff *skb;
>  	ssize_t ret = 0;
> -
> -	tun_debug(KERN_INFO, tun, "tun_chr_read\n");
> +	struct tun_struct *tun = NULL;
>  
>  	if (unlikely(!noblock))
> -		add_wait_queue(&tun->wq.wait, &wait);
> +		add_wait_queue(&tfile->wq.wait, &wait);
>  	while (len) {
>  		current->state = TASK_INTERRUPTIBLE;
>  
> +		skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue);
>  		/* Read frames from the queue */
> -		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
> +		if (!skb) {
>  			if (noblock) {
>  				ret = -EAGAIN;
>  				break;
> @@ -857,25 +868,38 @@ static ssize_t tun_do_read(struct tun_struct *tun,
>  				ret = -ERESTARTSYS;
>  				break;
>  			}
> +
> +			tun = __tun_get(tfile);
> +			if (!tun) {
> +				ret = -EIO;
> +				break;
> +			}
>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
>  				ret = -EIO;
> +				tun_put(tun);
>  				break;
>  			}
> +			tun_put(tun);
>  
>  			/* Nothing to read, let's sleep */
>  			schedule();
>  			continue;
>  		}
> -		netif_wake_queue(tun->dev);
>  
> -		ret = tun_put_user(tun, skb, iv, len);
> +		tun = __tun_get(tfile);
> +		if (tun) {
> +			netif_wake_queue(tun->dev);
> +			tun_put(tun);
> +		}
> +
> +		ret = tun_put_user(tfile, skb, iv, len);
>  		kfree_skb(skb);
>  		break;
>  	}
>  
>  	current->state = TASK_RUNNING;
>  	if (unlikely(!noblock))
> -		remove_wait_queue(&tun->wq.wait, &wait);
> +		remove_wait_queue(&tfile->wq.wait, &wait);
>  
>  	return ret;
>  }
> @@ -885,21 +909,17 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct tun_file *tfile = file->private_data;
> -	struct tun_struct *tun = __tun_get(tfile);
>  	ssize_t len, ret;
>  
> -	if (!tun)
> -		return -EBADFD;
>  	len = iov_length(iv, count);
>  	if (len < 0) {
>  		ret = -EINVAL;
>  		goto out;
>  	}
>  
> -	ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
> +	ret = tun_do_read(tfile, iocb, iv, len, file->f_flags & O_NONBLOCK);
>  	ret = min_t(ssize_t, ret, len);
>  out:
> -	tun_put(tun);
>  	return ret;
>  }
>  
> @@ -911,7 +931,7 @@ static void tun_setup(struct net_device *dev)
>  	tun->group = -1;
>  
>  	dev->ethtool_ops = &tun_ethtool_ops;
> -	dev->destructor = tun_free_netdev;
> +	dev->destructor = free_netdev;
>  }
>  
>  /* Trivial set of netlink ops to allow deleting tun or tap
> @@ -931,7 +951,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
>  
>  static void tun_sock_write_space(struct sock *sk)
>  {
> -	struct tun_struct *tun;
> +	struct tun_file *tfile = NULL;
>  	wait_queue_head_t *wqueue;
>  
>  	if (!sock_writeable(sk))
> @@ -945,37 +965,38 @@ static void tun_sock_write_space(struct sock *sk)
>  		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
>  						POLLWRNORM | POLLWRBAND);
>  
> -	tun = tun_sk(sk)->tun;
> -	kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
> -}
> -
> -static void tun_sock_destruct(struct sock *sk)
> -{
> -	free_netdev(tun_sk(sk)->tun->dev);
> +	tfile = container_of(sk, struct tun_file, sk);
> +	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>  }
>  
>  static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
>  		       struct msghdr *m, size_t total_len)
>  {
> -	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> -	return tun_get_user(tun, m->msg_iov, total_len,
> -			    m->msg_flags & MSG_DONTWAIT);
> +	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
> +	ssize_t result;
> +
> +	result = tun_get_user(tfile, m->msg_iov, total_len,
> +			      m->msg_flags & MSG_DONTWAIT);
> +	return result;
>  }
>  
>  static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
>  		       struct msghdr *m, size_t total_len,
>  		       int flags)
>  {
> -	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> +	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	int ret;
> +
>  	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
>  		return -EINVAL;
> -	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
> +
> +	ret = tun_do_read(tfile, iocb, m->msg_iov, total_len,
>  			  flags & MSG_DONTWAIT);
>  	if (ret > total_len) {
>  		m->msg_flags |= MSG_TRUNC;
>  		ret = flags & MSG_TRUNC ? ret : total_len;
>  	}
> +
>  	return ret;
>  }
>  
> @@ -996,7 +1017,7 @@ static const struct proto_ops tun_socket_ops = {
>  static struct proto tun_proto = {
>  	.name		= "tun",
>  	.owner		= THIS_MODULE,
> -	.obj_size	= sizeof(struct tun_sock),
> +	.obj_size	= sizeof(struct tun_file),
>  };
>  
>  static int tun_flags(struct tun_struct *tun)
> @@ -1047,8 +1068,8 @@ static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
>  
>  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  {
> -	struct sock *sk;
>  	struct tun_struct *tun;
> +	struct tun_file *tfile = file->private_data;
>  	struct net_device *dev;
>  	int err;
>  
> @@ -1069,7 +1090,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		     (tun->group != -1 && !in_egroup_p(tun->group))) &&
>  		    !capable(CAP_NET_ADMIN))
>  			return -EPERM;
> -		err = security_tun_dev_attach(tun->socket.sk);
> +		err = security_tun_dev_attach(tfile->socket.sk);
>  		if (err < 0)
>  			return err;
>  
> @@ -1113,25 +1134,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  		tun = netdev_priv(dev);
>  		tun->dev = dev;
>  		tun->flags = flags;
> -		tun->txflt.count = 0;
> -		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
>  
> -		err = -ENOMEM;
> -		sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
> -		if (!sk)
> -			goto err_free_dev;
> -
> -		sk_change_net(sk, net);
> -		tun->socket.wq = &tun->wq;
> -		init_waitqueue_head(&tun->wq.wait);
> -		tun->socket.ops = &tun_socket_ops;
> -		sock_init_data(&tun->socket, sk);
> -		sk->sk_write_space = tun_sock_write_space;
> -		sk->sk_sndbuf = INT_MAX;
> -
> -		tun_sk(sk)->tun = tun;
> -
> -		security_tun_dev_post_create(sk);
> +		security_tun_dev_post_create(&tfile->sk);
>  
>  		tun_net_init(dev);
>  
> @@ -1141,15 +1145,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  
>  		err = register_netdevice(tun->dev);
>  		if (err < 0)
> -			goto err_free_sk;
> +			goto err_free_dev;
>  
>  		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
>  		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
>  		    device_create_file(&tun->dev->dev, &dev_attr_group))
>  			pr_err("Failed to create tun sysfs files\n");
>  
> -		sk->sk_destruct = tun_sock_destruct;
> -
>  		err = tun_attach(tun, file);
>  		if (err < 0)
>  			goto failed;
> @@ -1172,6 +1174,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  	else
>  		tun->flags &= ~TUN_VNET_HDR;
>  
> +	/* Cache flags from tun device */
> +	tfile->flags = tun->flags;
>  	/* Make sure persistent devices do not get stuck in
>  	 * xoff state.
>  	 */
> @@ -1181,11 +1185,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  	strcpy(ifr->ifr_name, tun->dev->name);
>  	return 0;
>  
> - err_free_sk:
> -	tun_free_netdev(dev);
> - err_free_dev:
> +err_free_dev:
>  	free_netdev(dev);
> - failed:
> +failed:
>  	return err;
>  }
>  
> @@ -1357,9 +1359,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  	case TUNSETTXFILTER:
>  		/* Can be set only for TAPs */
>  		ret = -EINVAL;
> -		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
> +		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
>  			break;
> -		ret = update_filter(&tun->txflt, (void __user *)arg);
> +		ret = update_filter(&tfile->txflt, (void __user *)arg);
>  		break;
>  
>  	case SIOCGIFHWADDR:
> @@ -1379,7 +1381,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		break;
>  
>  	case TUNGETSNDBUF:
> -		sndbuf = tun->socket.sk->sk_sndbuf;
> +		sndbuf = tfile->socket.sk->sk_sndbuf;
>  		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
>  			ret = -EFAULT;
>  		break;
> @@ -1390,11 +1392,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			break;
>  		}
>  
> -		tun->socket.sk->sk_sndbuf = sndbuf;
> +		tfile->socket.sk->sk_sndbuf = sndbuf;
>  		break;
>  
>  	case TUNGETVNETHDRSZ:
> -		vnet_hdr_sz = tun->vnet_hdr_sz;
> +		vnet_hdr_sz = tfile->vnet_hdr_sz;
>  		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
>  			ret = -EFAULT;
>  		break;
> @@ -1409,27 +1411,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			break;
>  		}
>  
> -		tun->vnet_hdr_sz = vnet_hdr_sz;
> +		tfile->vnet_hdr_sz = vnet_hdr_sz;
>  		break;
>  
>  	case TUNATTACHFILTER:
>  		/* Can be set only for TAPs */
>  		ret = -EINVAL;
> -		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
> +		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
>  			break;
>  		ret = -EFAULT;
>  		if (copy_from_user(&fprog, argp, sizeof(fprog)))
>  			break;
>  
> -		ret = sk_attach_filter(&fprog, tun->socket.sk);
> +		ret = sk_attach_filter(&fprog, tfile->socket.sk);
>  		break;
>  
>  	case TUNDETACHFILTER:
>  		/* Can be set only for TAPs */
>  		ret = -EINVAL;
> -		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
> +		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
>  			break;
> -		ret = sk_detach_filter(tun->socket.sk);
> +		ret = sk_detach_filter(tfile->socket.sk);
>  		break;
>  
>  	default:
> @@ -1481,43 +1483,50 @@ static long tun_chr_compat_ioctl(struct file *file,
>  
>  static int tun_chr_fasync(int fd, struct file *file, int on)
>  {
> -	struct tun_struct *tun = tun_get(file);
> -	int ret;
> -
> -	if (!tun)
> -		return -EBADFD;
> -
> -	tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
> +	struct tun_file *tfile = file->private_data;
> +	int ret = fasync_helper(fd, file, on, &tfile->fasync);
>  
> -	if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
> +	if (ret < 0)
>  		goto out;
>  
>  	if (on) {
>  		ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
>  		if (ret)
>  			goto out;
> -		tun->flags |= TUN_FASYNC;
> +		tfile->flags |= TUN_FASYNC;
>  	} else
> -		tun->flags &= ~TUN_FASYNC;
> +		tfile->flags &= ~TUN_FASYNC;
>  	ret = 0;
>  out:
> -	tun_put(tun);
>  	return ret;
>  }
>  
>  static int tun_chr_open(struct inode *inode, struct file * file)
>  {
> +	struct net *net = current->nsproxy->net_ns;
>  	struct tun_file *tfile;
>  
>  	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
>  
> -	tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
> +	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
> +					&tun_proto);
>  	if (!tfile)
>  		return -ENOMEM;
> -	atomic_set(&tfile->count, 0);
> +
>  	tfile->tun = NULL;
> -	tfile->net = get_net(current->nsproxy->net_ns);
> +	tfile->net = net;
> +	tfile->txflt.count = 0;
> +	tfile->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> +	tfile->socket.wq = &tfile->wq;
> +	init_waitqueue_head(&tfile->wq.wait);
> +	tfile->socket.file = file;
> +	tfile->socket.ops = &tun_socket_ops;
> +	sock_init_data(&tfile->socket, &tfile->sk);
> +
> +	tfile->sk.sk_write_space = tun_sock_write_space;
> +	tfile->sk.sk_sndbuf = INT_MAX;
>  	file->private_data = tfile;
> +
>  	return 0;
>  }
>  
> @@ -1541,14 +1550,14 @@ static int tun_chr_close(struct inode *inode, struct file *file)
>  				unregister_netdevice(dev);
>  			rtnl_unlock();
>  		}
> -	}
>  
> -	tun = tfile->tun;
> -	if (tun)
> -		sock_put(tun->socket.sk);
> +		/* drop the reference that netdevice holds */
> +		sock_put(&tfile->sk);
>  
> -	put_net(tfile->net);
> -	kfree(tfile);
> +	}
> +
> +	/* drop the reference that file holds */
> +	sock_put(&tfile->sk);
>  
>  	return 0;
>  }
> @@ -1676,13 +1685,14 @@ static void tun_cleanup(void)
>  struct socket *tun_get_socket(struct file *file)
>  {
>  	struct tun_struct *tun;
> +	struct tun_file *tfile = file->private_data;
>  	if (file->f_op != &tun_fops)
>  		return ERR_PTR(-EINVAL);
>  	tun = tun_get(file);
>  	if (!tun)
>  		return ERR_PTR(-EBADFD);
>  	tun_put(tun);
> -	return &tun->socket;
> +	return &tfile->socket;
>  }
>  EXPORT_SYMBOL_GPL(tun_get_socket);
>  


* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
From: Michael S. Tsirkin @ 2012-06-25  8:41 UTC
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On Mon, Jun 25, 2012 at 11:25:53AM +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> > This patch adds multiqueue support for the tap device. This is done by
> > abstracting each queue as a file/socket and allowing multiple sockets to be
> > attached to the tuntap device (an array of tun_file is stored in the
> > tun_struct). Userspace could write to and read from those files to do
> > parallel packet sending/receiving.
> > 
> > Unlike the previous single queue implementation, the socket and device are
> > loosely coupled, and each of them is allowed to go away first. In order to
> > make the tx path lockless, netif_tx_lock_bh() is replaced by
> > RCU/NETIF_F_LLTX to synchronize between the data path and system calls.
> 
> Don't use LLTX/RCU. It's not worth it.

Or maybe we should use LLTX. Need to think about it.
But if yes, I'd like a separate patch moving tun to LLTX,
and making it always LLTX. Don't play with LLTX at runtime.
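
I.e. at device init time, unconditionally:

	/* tun does its own tx serialization (per-queue sockets/RCU),
	 * so opt out of the core tx lock once, for all modes */
	dev->features |= NETIF_F_LLTX;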

-- 
MST


* [net-next RFC V3 0/6] Multiqueue support in tun/tap
From: Jason Wang @ 2012-06-25 11:59 UTC
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

Hello All:

This is an update of multiqueue support in tun/tap from V2. Please consider
merging.

The main idea of this series is to let the tun/tap device benefit from
multiqueue network cards and multi-core hosts by enabling it to transmit and
receive packets through multiple sockets/queues. This series allows multiple
sockets to be attached to and detached from the tun/tap device. Userspace can
exploit this parallelism to achieve higher throughput.

A quick overview of the design:

- Moving the socket from tun_device to tun_file.
- Allowing multiple sockets to be attached to a tun/tap device.
- Using RCU to synchronize the data path and system calls.
- A simple hash-based queue selection algorithm is used to choose the tx queue.
- Two new ioctls were added for userspace to attach sockets to and detach them
  from the device.
- ABI compatibility is maintained; userspace that only uses one queue won't
  need any changes (see the usage sketch after this list).
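
As referenced above, a sketch of the intended usage from userspace,
assuming the IFF_MULTI_QUEUE flag introduced in patch 3/6 (error
handling omitted):

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	int fds[4];
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;

	/* one fd per queue: the first TUNSETIFF creates the device,
	 * later ones attach extra queues to the same name */
	for (int i = 0; i < 4; i++) {
		fds[i] = open("/dev/net/tun", O_RDWR);
		ioctl(fds[i], TUNSETIFF, &ifr);
	}
	/* each fds[i] now sends/receives packets in parallel */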

Performance test:

This series was originally designed to serve as the backend of multiqueue
virtio-net in a kvm guest, but the design is generic enough to be reused by
any other type of userspace.

Since I will post the multiqueue virtio-net series as an RFC, I will post the
performance results in that thread. To summarize: multiqueue improves the
transaction rate in the TCP_RR test but shows some regression for small-packet
transmission in the TCP_STREAM test.

Martin tested the multiqueue tap with their userspace, and he sees an
improvement in terms of packets per second.

References:
- V2 of multiqueue tun/tap, http://lwn.net/Articles/459270/
- V1 of multiqueue tun/tap, http://www.mail-archive.com/kvm@vger.kernel.org/msg59479.html

Changes from V2:

- Rebase to the latest net-next
- Fix netdev leak when tun_attach fails
- Fix return value of TUNSETOWNER
- Purge the receive queue in socket destructor
- Enable multiqueue for tun (V1 and V2 only allowed mq to be enabled for tap)
- Add per-queue u64 statistics
- Fix wrong BUG_ON() check in tun_detach()
- Check numqueues instead of tfile[0] in tun_set_iff() to let tunctl -d work
  correctly
- Set numqueues to MAX_TAP_QUEUES during tun_detach_all() to prevent further
  attaching.

Changes from V1:

- Simplify the sockets array management by not leaving NULL in the slots.
- Optimize the tx queue selection.
- Fix the bug in tun_detach_all()

Jason Wang (6):
  tuntap: move socket to tun_file
  tuntap: categorize ioctl
  tuntap: introduce multiqueue flags
  tuntap: multiqueue support
  tuntap: per queue 64 bit stats
  tuntap: add ioctls to attach or detach a file from tuntap device

 drivers/net/tun.c      |  797 ++++++++++++++++++++++++++++++------------------
 include/linux/if_tun.h |    5 +
 2 files changed, 503 insertions(+), 299 deletions(-)



* [PATCH 1/6] tuntap: move socket to tun_file
From: Jason Wang @ 2012-06-25 11:59 UTC
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

This patch moves the socket structure from tun_device to tun_file in order to
make it possible for multiple sockets to be attached to a tun/tap device. The
reference between the tap device and the socket is set up during TUNSETIFF as
usual.

After this patch, we can go further towards multiqueue tun/tap support by
storing an array of tun_file pointers in tun_device.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |  352 +++++++++++++++++++++++++++--------------------------
 1 files changed, 181 insertions(+), 171 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 987aeef..1f27789 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -108,9 +108,16 @@ struct tap_filter {
 };
 
 struct tun_file {
+	struct sock sk;
+	struct socket socket;
+	struct socket_wq wq;
+	int vnet_hdr_sz;
+	struct tap_filter txflt;
 	atomic_t count;
 	struct tun_struct *tun;
 	struct net *net;
+	struct fasync_struct *fasync;
+	unsigned int flags;
 };
 
 struct tun_sock;
@@ -125,29 +132,12 @@ struct tun_struct {
 	netdev_features_t	set_features;
 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 			  NETIF_F_TSO6|NETIF_F_UFO)
-	struct fasync_struct	*fasync;
-
-	struct tap_filter       txflt;
-	struct socket		socket;
-	struct socket_wq	wq;
-
-	int			vnet_hdr_sz;
 
 #ifdef TUN_DEBUG
 	int debug;
 #endif
 };
 
-struct tun_sock {
-	struct sock		sk;
-	struct tun_struct	*tun;
-};
-
-static inline struct tun_sock *tun_sk(struct sock *sk)
-{
-	return container_of(sk, struct tun_sock, sk);
-}
-
 static int tun_attach(struct tun_struct *tun, struct file *file)
 {
 	struct tun_file *tfile = file->private_data;
@@ -168,10 +158,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
 	err = 0;
 	tfile->tun = tun;
 	tun->tfile = tfile;
-	tun->socket.file = file;
 	netif_carrier_on(tun->dev);
 	dev_hold(tun->dev);
-	sock_hold(tun->socket.sk);
+	sock_hold(&tfile->sk);
 	atomic_inc(&tfile->count);
 
 out:
@@ -181,15 +170,15 @@ out:
 
 static void __tun_detach(struct tun_struct *tun)
 {
+	struct tun_file *tfile = tun->tfile;
 	/* Detach from net device */
 	netif_tx_lock_bh(tun->dev);
 	netif_carrier_off(tun->dev);
 	tun->tfile = NULL;
-	tun->socket.file = NULL;
 	netif_tx_unlock_bh(tun->dev);
 
 	/* Drop read queue */
-	skb_queue_purge(&tun->socket.sk->sk_receive_queue);
+	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
 
 	/* Drop the extra count on the net device */
 	dev_put(tun->dev);
@@ -348,19 +337,12 @@ static void tun_net_uninit(struct net_device *dev)
 	/* Inform the methods they need to stop using the dev.
 	 */
 	if (tfile) {
-		wake_up_all(&tun->wq.wait);
+		wake_up_all(&tfile->wq.wait);
 		if (atomic_dec_and_test(&tfile->count))
 			__tun_detach(tun);
 	}
 }
 
-static void tun_free_netdev(struct net_device *dev)
-{
-	struct tun_struct *tun = netdev_priv(dev);
-
-	sk_release_kernel(tun->socket.sk);
-}
-
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
@@ -379,24 +361,26 @@ static int tun_net_close(struct net_device *dev)
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile = tun->tfile;
 
 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
 	/* Drop packet if interface is not attached */
-	if (!tun->tfile)
+	if (!tfile)
 		goto drop;
 
 	/* Drop if the filter does not like it.
 	 * This is a noop if the filter is disabled.
 	 * Filter can be enabled only for the TAP devices. */
-	if (!check_filter(&tun->txflt, skb))
+	if (!check_filter(&tfile->txflt, skb))
 		goto drop;
 
-	if (tun->socket.sk->sk_filter &&
-	    sk_filter(tun->socket.sk, skb))
+	if (tfile->socket.sk->sk_filter &&
+	    sk_filter(tfile->socket.sk, skb))
 		goto drop;
 
-	if (skb_queue_len(&tun->socket.sk->sk_receive_queue) >= dev->tx_queue_len) {
+	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
+	    >= dev->tx_queue_len) {
 		if (!(tun->flags & TUN_ONE_QUEUE)) {
 			/* Normal queueing mode. */
 			/* Packet scheduler handles dropping of further packets. */
@@ -417,12 +401,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_orphan(skb);
 
 	/* Enqueue packet */
-	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
+	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
 
 	/* Notify and wake up reader process */
-	if (tun->flags & TUN_FASYNC)
-		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
+	if (tfile->flags & TUN_FASYNC)
+		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
+	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
 				   POLLRDNORM | POLLRDBAND);
 	return NETDEV_TX_OK;
 
@@ -550,11 +534,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 	if (!tun)
 		return POLLERR;
 
-	sk = tun->socket.sk;
+	sk = tfile->socket.sk;
 
 	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
 
-	poll_wait(file, &tun->wq.wait, wait);
+	poll_wait(file, &tfile->wq.wait, wait);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
@@ -573,11 +557,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
-static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
+static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
 				     size_t prepad, size_t len,
 				     size_t linear, int noblock)
 {
-	struct sock *sk = tun->socket.sk;
+	struct sock *sk = tfile->socket.sk;
 	struct sk_buff *skb;
 	int err;
 
@@ -601,7 +585,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
 }
 
 /* Get packet from user space buffer */
-static ssize_t tun_get_user(struct tun_struct *tun,
+static ssize_t tun_get_user(struct tun_file *tfile,
 			    const struct iovec *iv, size_t count,
 			    int noblock)
 {
@@ -610,8 +594,10 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 	size_t len = count, align = NET_SKB_PAD;
 	struct virtio_net_hdr gso = { 0 };
 	int offset = 0;
+	struct tun_struct *tun = NULL;
+	bool drop = false, error = false;
 
-	if (!(tun->flags & TUN_NO_PI)) {
+	if (!(tfile->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) > count)
 			return -EINVAL;
 
@@ -620,8 +606,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 		offset += sizeof(pi);
 	}
 
-	if (tun->flags & TUN_VNET_HDR) {
-		if ((len -= tun->vnet_hdr_sz) > count)
+	if (tfile->flags & TUN_VNET_HDR) {
+		len -= tfile->vnet_hdr_sz;
+		if (len > count)
 			return -EINVAL;
 
 		if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
@@ -633,41 +620,43 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 
 		if (gso.hdr_len > len)
 			return -EINVAL;
-		offset += tun->vnet_hdr_sz;
+		offset += tfile->vnet_hdr_sz;
 	}
 
-	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
+	if ((tfile->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
 		align += NET_IP_ALIGN;
 		if (unlikely(len < ETH_HLEN ||
 			     (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
 			return -EINVAL;
 	}
 
-	skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
+	skb = tun_alloc_skb(tfile, align, len, gso.hdr_len, noblock);
+
 	if (IS_ERR(skb)) {
 		if (PTR_ERR(skb) != -EAGAIN)
-			tun->dev->stats.rx_dropped++;
-		return PTR_ERR(skb);
+			drop = true;
+		count = PTR_ERR(skb);
+		goto err;
 	}
 
 	if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
-		tun->dev->stats.rx_dropped++;
+		drop = true;
 		kfree_skb(skb);
-		return -EFAULT;
+		count = -EFAULT;
+		goto err;
 	}
 
 	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
 		if (!skb_partial_csum_set(skb, gso.csum_start,
 					  gso.csum_offset)) {
-			tun->dev->stats.rx_frame_errors++;
-			kfree_skb(skb);
-			return -EINVAL;
+			error = true;
+			goto err_free;
 		}
 	}
 
-	switch (tun->flags & TUN_TYPE_MASK) {
+	switch (tfile->flags & TUN_TYPE_MASK) {
 	case TUN_TUN_DEV:
-		if (tun->flags & TUN_NO_PI) {
+		if (tfile->flags & TUN_NO_PI) {
 			switch (skb->data[0] & 0xf0) {
 			case 0x40:
 				pi.proto = htons(ETH_P_IP);
@@ -676,18 +665,15 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 				pi.proto = htons(ETH_P_IPV6);
 				break;
 			default:
-				tun->dev->stats.rx_dropped++;
-				kfree_skb(skb);
-				return -EINVAL;
+				drop = true;
+				goto err_free;
 			}
 		}
 
 		skb_reset_mac_header(skb);
 		skb->protocol = pi.proto;
-		skb->dev = tun->dev;
 		break;
 	case TUN_TAP_DEV:
-		skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -704,9 +690,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
 			break;
 		default:
-			tun->dev->stats.rx_frame_errors++;
-			kfree_skb(skb);
-			return -EINVAL;
+			error = true;
+			goto err_free;
 		}
 
 		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
@@ -714,9 +699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 
 		skb_shinfo(skb)->gso_size = gso.gso_size;
 		if (skb_shinfo(skb)->gso_size == 0) {
-			tun->dev->stats.rx_frame_errors++;
-			kfree_skb(skb);
-			return -EINVAL;
+			error = true;
+			goto err_free;
 		}
 
 		/* Header must be checked, and gso_segs computed. */
@@ -724,11 +708,38 @@ static ssize_t tun_get_user(struct tun_struct *tun,
 		skb_shinfo(skb)->gso_segs = 0;
 	}
 
-	netif_rx_ni(skb);
+	tun = __tun_get(tfile);
+	if (!tun)
+		return -EBADFD;
 
+	switch (tfile->flags & TUN_TYPE_MASK) {
+	case TUN_TUN_DEV:
+		skb->dev = tun->dev;
+		break;
+	case TUN_TAP_DEV:
+		skb->protocol = eth_type_trans(skb, tun->dev);
+		break;
+	}
+
+	netif_rx_ni(skb);
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
+	tun_put(tun);
+	return count;
+
+err_free:
+	count = -EINVAL;
+	kfree_skb(skb);
+err:
+	tun = __tun_get(tfile);
+	if (!tun)
+		return -EBADFD;
 
+	if (drop)
+		tun->dev->stats.rx_dropped++;
+	if (error)
+		tun->dev->stats.rx_frame_errors++;
+	tun_put(tun);
 	return count;
 }
 
@@ -736,30 +747,25 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 			      unsigned long count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct tun_struct *tun = tun_get(file);
+	struct tun_file *tfile = file->private_data;
 	ssize_t result;
 
-	if (!tun)
-		return -EBADFD;
-
-	tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
-
-	result = tun_get_user(tun, iv, iov_length(iv, count),
+	result = tun_get_user(tfile, iv, iov_length(iv, count),
 			      file->f_flags & O_NONBLOCK);
 
-	tun_put(tun);
 	return result;
 }
 
 /* Put packet to the user space buffer */
-static ssize_t tun_put_user(struct tun_struct *tun,
+static ssize_t tun_put_user(struct tun_file *tfile,
 			    struct sk_buff *skb,
 			    const struct iovec *iv, int len)
 {
+	struct tun_struct *tun = NULL;
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total = 0;
 
-	if (!(tun->flags & TUN_NO_PI)) {
+	if (!(tfile->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) < 0)
 			return -EINVAL;
 
@@ -773,9 +779,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		total += sizeof(pi);
 	}
 
-	if (tun->flags & TUN_VNET_HDR) {
+	if (tfile->flags & TUN_VNET_HDR) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= tun->vnet_hdr_sz) < 0)
+		len -= tfile->vnet_hdr_sz;
+		if (len < 0)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -818,7 +825,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
 					       sizeof(gso))))
 			return -EFAULT;
-		total += tun->vnet_hdr_sz;
+		total += tfile->vnet_hdr_sz;
 	}
 
 	len = min_t(int, skb->len, len);
@@ -826,29 +833,33 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
 	total += skb->len;
 
-	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun = __tun_get(tfile);
+	if (tun) {
+		tun->dev->stats.tx_packets++;
+		tun->dev->stats.tx_bytes += len;
+		tun_put(tun);
+	}
 
 	return total;
 }
 
-static ssize_t tun_do_read(struct tun_struct *tun,
+static ssize_t tun_do_read(struct tun_file *tfile,
 			   struct kiocb *iocb, const struct iovec *iv,
 			   ssize_t len, int noblock)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
-
-	tun_debug(KERN_INFO, tun, "tun_chr_read\n");
+	struct tun_struct *tun = NULL;
 
 	if (unlikely(!noblock))
-		add_wait_queue(&tun->wq.wait, &wait);
+		add_wait_queue(&tfile->wq.wait, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
+		skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue);
 		/* Read frames from the queue */
-		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
+		if (!skb) {
 			if (noblock) {
 				ret = -EAGAIN;
 				break;
@@ -857,25 +868,38 @@ static ssize_t tun_do_read(struct tun_struct *tun,
 				ret = -ERESTARTSYS;
 				break;
 			}
+
+			tun = __tun_get(tfile);
+			if (!tun) {
+				ret = -EIO;
+				break;
+			}
 			if (tun->dev->reg_state != NETREG_REGISTERED) {
 				ret = -EIO;
+				tun_put(tun);
 				break;
 			}
+			tun_put(tun);
 
 			/* Nothing to read, let's sleep */
 			schedule();
 			continue;
 		}
-		netif_wake_queue(tun->dev);
 
-		ret = tun_put_user(tun, skb, iv, len);
+		tun = __tun_get(tfile);
+		if (tun) {
+			netif_wake_queue(tun->dev);
+			tun_put(tun);
+		}
+
+		ret = tun_put_user(tfile, skb, iv, len);
 		kfree_skb(skb);
 		break;
 	}
 
 	current->state = TASK_RUNNING;
 	if (unlikely(!noblock))
-		remove_wait_queue(&tun->wq.wait, &wait);
+		remove_wait_queue(&tfile->wq.wait, &wait);
 
 	return ret;
 }
@@ -885,21 +909,17 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 {
 	struct file *file = iocb->ki_filp;
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
 	ssize_t len, ret;
 
-	if (!tun)
-		return -EBADFD;
 	len = iov_length(iv, count);
 	if (len < 0) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
+	ret = tun_do_read(tfile, iocb, iv, len, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 out:
-	tun_put(tun);
 	return ret;
 }
 
@@ -911,7 +931,7 @@ static void tun_setup(struct net_device *dev)
 	tun->group = -1;
 
 	dev->ethtool_ops = &tun_ethtool_ops;
-	dev->destructor = tun_free_netdev;
+	dev->destructor = free_netdev;
 }
 
 /* Trivial set of netlink ops to allow deleting tun or tap
@@ -931,7 +951,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
 
 static void tun_sock_write_space(struct sock *sk)
 {
-	struct tun_struct *tun;
+	struct tun_file *tfile = NULL;
 	wait_queue_head_t *wqueue;
 
 	if (!sock_writeable(sk))
@@ -945,37 +965,38 @@ static void tun_sock_write_space(struct sock *sk)
 		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
 						POLLWRNORM | POLLWRBAND);
 
-	tun = tun_sk(sk)->tun;
-	kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
-}
-
-static void tun_sock_destruct(struct sock *sk)
-{
-	free_netdev(tun_sk(sk)->tun->dev);
+	tfile = container_of(sk, struct tun_file, sk);
+	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
 }
 
 static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
 		       struct msghdr *m, size_t total_len)
 {
-	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
-	return tun_get_user(tun, m->msg_iov, total_len,
-			    m->msg_flags & MSG_DONTWAIT);
+	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
+	ssize_t result;
+
+	result = tun_get_user(tfile, m->msg_iov, total_len,
+			      m->msg_flags & MSG_DONTWAIT);
+	return result;
 }
 
 static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 		       struct msghdr *m, size_t total_len,
 		       int flags)
 {
-	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 	int ret;
+
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
+
+	ret = tun_do_read(tfile, iocb, m->msg_iov, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;
 		ret = flags & MSG_TRUNC ? ret : total_len;
 	}
+
 	return ret;
 }
 
@@ -996,7 +1017,7 @@ static const struct proto_ops tun_socket_ops = {
 static struct proto tun_proto = {
 	.name		= "tun",
 	.owner		= THIS_MODULE,
-	.obj_size	= sizeof(struct tun_sock),
+	.obj_size	= sizeof(struct tun_file),
 };
 
 static int tun_flags(struct tun_struct *tun)
@@ -1047,8 +1068,8 @@ static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
 
 static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 {
-	struct sock *sk;
 	struct tun_struct *tun;
+	struct tun_file *tfile = file->private_data;
 	struct net_device *dev;
 	int err;
 
@@ -1069,7 +1090,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		     (tun->group != -1 && !in_egroup_p(tun->group))) &&
 		    !capable(CAP_NET_ADMIN))
 			return -EPERM;
-		err = security_tun_dev_attach(tun->socket.sk);
+		err = security_tun_dev_attach(tfile->socket.sk);
 		if (err < 0)
 			return err;
 
@@ -1113,25 +1134,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun = netdev_priv(dev);
 		tun->dev = dev;
 		tun->flags = flags;
-		tun->txflt.count = 0;
-		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
-		err = -ENOMEM;
-		sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
-		if (!sk)
-			goto err_free_dev;
-
-		sk_change_net(sk, net);
-		tun->socket.wq = &tun->wq;
-		init_waitqueue_head(&tun->wq.wait);
-		tun->socket.ops = &tun_socket_ops;
-		sock_init_data(&tun->socket, sk);
-		sk->sk_write_space = tun_sock_write_space;
-		sk->sk_sndbuf = INT_MAX;
-
-		tun_sk(sk)->tun = tun;
-
-		security_tun_dev_post_create(sk);
+		security_tun_dev_post_create(&tfile->sk);
 
 		tun_net_init(dev);
 
@@ -1141,15 +1145,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		err = register_netdevice(tun->dev);
 		if (err < 0)
-			goto err_free_sk;
+			goto err_free_dev;
 
 		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
 		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
 		    device_create_file(&tun->dev->dev, &dev_attr_group))
 			pr_err("Failed to create tun sysfs files\n");
 
-		sk->sk_destruct = tun_sock_destruct;
-
 		err = tun_attach(tun, file);
 		if (err < 0)
 			goto failed;
@@ -1172,6 +1174,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	else
 		tun->flags &= ~TUN_VNET_HDR;
 
+	/* Cache flags from tun device */
+	tfile->flags = tun->flags;
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
@@ -1181,11 +1185,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	strcpy(ifr->ifr_name, tun->dev->name);
 	return 0;
 
- err_free_sk:
-	tun_free_netdev(dev);
- err_free_dev:
+err_free_dev:
 	free_netdev(dev);
- failed:
+failed:
 	return err;
 }
 
@@ -1357,9 +1359,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	case TUNSETTXFILTER:
 		/* Can be set only for TAPs */
 		ret = -EINVAL;
-		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
+		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
 			break;
-		ret = update_filter(&tun->txflt, (void __user *)arg);
+		ret = update_filter(&tfile->txflt, (void __user *)arg);
 		break;
 
 	case SIOCGIFHWADDR:
@@ -1379,7 +1381,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		break;
 
 	case TUNGETSNDBUF:
-		sndbuf = tun->socket.sk->sk_sndbuf;
+		sndbuf = tfile->socket.sk->sk_sndbuf;
 		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
 			ret = -EFAULT;
 		break;
@@ -1390,11 +1392,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			break;
 		}
 
-		tun->socket.sk->sk_sndbuf = sndbuf;
+		tfile->socket.sk->sk_sndbuf = sndbuf;
 		break;
 
 	case TUNGETVNETHDRSZ:
-		vnet_hdr_sz = tun->vnet_hdr_sz;
+		vnet_hdr_sz = tfile->vnet_hdr_sz;
 		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
 			ret = -EFAULT;
 		break;
@@ -1409,27 +1411,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			break;
 		}
 
-		tun->vnet_hdr_sz = vnet_hdr_sz;
+		tfile->vnet_hdr_sz = vnet_hdr_sz;
 		break;
 
 	case TUNATTACHFILTER:
 		/* Can be set only for TAPs */
 		ret = -EINVAL;
-		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
+		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
 			break;
 		ret = -EFAULT;
 		if (copy_from_user(&fprog, argp, sizeof(fprog)))
 			break;
 
-		ret = sk_attach_filter(&fprog, tun->socket.sk);
+		ret = sk_attach_filter(&fprog, tfile->socket.sk);
 		break;
 
 	case TUNDETACHFILTER:
 		/* Can be set only for TAPs */
 		ret = -EINVAL;
-		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
+		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
 			break;
-		ret = sk_detach_filter(tun->socket.sk);
+		ret = sk_detach_filter(tfile->socket.sk);
 		break;
 
 	default:
@@ -1481,43 +1483,50 @@ static long tun_chr_compat_ioctl(struct file *file,
 
 static int tun_chr_fasync(int fd, struct file *file, int on)
 {
-	struct tun_struct *tun = tun_get(file);
-	int ret;
-
-	if (!tun)
-		return -EBADFD;
-
-	tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
+	struct tun_file *tfile = file->private_data;
+	int ret = fasync_helper(fd, file, on, &tfile->fasync);
 
-	if ((ret = fasync_helper(fd, file, on, &tun->fasync)) < 0)
+	if (ret < 0)
 		goto out;
 
 	if (on) {
 		ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
 		if (ret)
 			goto out;
-		tun->flags |= TUN_FASYNC;
+		tfile->flags |= TUN_FASYNC;
 	} else
-		tun->flags &= ~TUN_FASYNC;
+		tfile->flags &= ~TUN_FASYNC;
 	ret = 0;
 out:
-	tun_put(tun);
 	return ret;
 }
 
 static int tun_chr_open(struct inode *inode, struct file * file)
 {
+	struct net *net = current->nsproxy->net_ns;
 	struct tun_file *tfile;
 
 	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
 
-	tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
+	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+					&tun_proto);
 	if (!tfile)
 		return -ENOMEM;
-	atomic_set(&tfile->count, 0);
+
 	tfile->tun = NULL;
-	tfile->net = get_net(current->nsproxy->net_ns);
+	tfile->net = net;
+	tfile->txflt.count = 0;
+	tfile->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+	tfile->socket.wq = &tfile->wq;
+	init_waitqueue_head(&tfile->wq.wait);
+	tfile->socket.file = file;
+	tfile->socket.ops = &tun_socket_ops;
+	sock_init_data(&tfile->socket, &tfile->sk);
+
+	tfile->sk.sk_write_space = tun_sock_write_space;
+	tfile->sk.sk_sndbuf = INT_MAX;
 	file->private_data = tfile;
+
 	return 0;
 }
 
@@ -1541,14 +1550,14 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 				unregister_netdevice(dev);
 			rtnl_unlock();
 		}
-	}
 
-	tun = tfile->tun;
-	if (tun)
-		sock_put(tun->socket.sk);
+		/* drop the reference that netdevice holds */
+		sock_put(&tfile->sk);
 
-	put_net(tfile->net);
-	kfree(tfile);
+	}
+
+	/* drop the reference that file holds */
+	sock_put(&tfile->sk);
 
 	return 0;
 }
@@ -1676,13 +1685,14 @@ static void tun_cleanup(void)
 struct socket *tun_get_socket(struct file *file)
 {
 	struct tun_struct *tun;
+	struct tun_file *tfile = file->private_data;
 	if (file->f_op != &tun_fops)
 		return ERR_PTR(-EINVAL);
 	tun = tun_get(file);
 	if (!tun)
 		return ERR_PTR(-EBADFD);
 	tun_put(tun);
-	return &tun->socket;
+	return &tfile->socket;
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
 
-- 
1.7.1



* [PATCH 2/6] tuntap: categorize ioctl
       [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
                   ` (3 preceding siblings ...)
  2012-06-25 11:59 ` [PATCH 1/6] tuntap: move socket to tun_file Jason Wang
@ 2012-06-25 11:59 ` Jason Wang
  2012-06-25 11:59 ` [PATCH 3/6] tuntap: introduce multiqueue flags Jason Wang
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-25 11:59 UTC (permalink / raw)
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

As we've moved the socket-related structures to file->private_data, we can
move the ioctls that only touch the socket out of tun_chr_ioctl(), as they
don't need to hold the rtnl lock.
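
As an illustration, a minimal userspace sketch of one of the affected calls
(TUNGETSNDBUF only touches the socket, so after this change it is served from
the socket path without taking the rtnl lock; error handling trimmed):

	#include <sys/ioctl.h>
	#include <linux/if_tun.h>

	static int tap_get_sndbuf(int tapfd)
	{
		int sndbuf = 0;

		/* socket-only ioctl: no rtnl lock on this path any more */
		if (ioctl(tapfd, TUNGETSNDBUF, &sndbuf) < 0)
			return -1;
		return sndbuf;
	}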

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |   52 ++++++++++++++++++++++++++++++++++------------------
 1 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 1f27789..8233b0a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1248,10 +1248,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	struct tun_file *tfile = file->private_data;
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
-	struct sock_fprog fprog;
 	struct ifreq ifr;
-	int sndbuf;
-	int vnet_hdr_sz;
 	int ret;
 
 	if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
@@ -1356,14 +1353,6 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = set_offload(tun, arg);
 		break;
 
-	case TUNSETTXFILTER:
-		/* Can be set only for TAPs */
-		ret = -EINVAL;
-		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
-			break;
-		ret = update_filter(&tfile->txflt, (void __user *)arg);
-		break;
-
 	case SIOCGIFHWADDR:
 		/* Get hw address */
 		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
@@ -1380,6 +1369,37 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
 		break;
 
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+unlock:
+	rtnl_unlock();
+	if (tun)
+		tun_put(tun);
+	return ret;
+}
+
+static long __tun_socket_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg, int ifreq_len)
+{
+	struct tun_file *tfile = file->private_data;
+	void __user *argp = (void __user *)arg;
+	struct sock_fprog fprog;
+	int sndbuf;
+	int vnet_hdr_sz;
+	int ret = 0;
+
+	switch (cmd) {
+	case TUNSETTXFILTER:
+		/* Can be set only for TAPs */
+		ret = -EINVAL;
+		if ((tfile->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
+			break;
+		ret = update_filter(&tfile->txflt, (void __user *)arg);
+		break;
+
 	case TUNGETSNDBUF:
 		sndbuf = tfile->socket.sk->sk_sndbuf;
 		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
@@ -1435,21 +1455,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		break;
 
 	default:
-		ret = -EINVAL;
+		ret = __tun_chr_ioctl(file, cmd, arg, ifreq_len);
 		break;
 	}
 
-unlock:
-	rtnl_unlock();
-	if (tun)
-		tun_put(tun);
 	return ret;
 }
 
 static long tun_chr_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
-	return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
+	return __tun_socket_ioctl(file, cmd, arg, sizeof(struct ifreq));
 }
 
 #ifdef CONFIG_COMPAT
@@ -1477,7 +1493,7 @@ static long tun_chr_compat_ioctl(struct file *file,
 	 * driver are compatible though, we don't need to convert the
 	 * contents.
 	 */
-	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
+	return __tun_socket_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
 }
 #endif /* CONFIG_COMPAT */
 
-- 
1.7.1



* [PATCH 3/6] tuntap: introduce multiqueue flags
       [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
                   ` (4 preceding siblings ...)
  2012-06-25 11:59 ` [PATCH 2/6] tuntap: categorize ioctl Jason Wang
@ 2012-06-25 11:59 ` Jason Wang
  2012-06-25 11:59 ` [PATCH 4/6] tuntap: multiqueue support Jason Wang
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-25 11:59 UTC (permalink / raw)
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

Add the flags used to create a multiqueue tuntap device.
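
A hedged usage sketch ("mq0" is just an example name; the flag only takes
effect once the multiqueue patch later in this series teaches TUNSETIFF to
accept it):

	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	/* Each open fd that passes TUNSETIFF with IFF_MULTI_QUEUE is one queue. */
	static int open_mq_tap_queue(const char *name)
	{
		struct ifreq ifr;
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return -1;
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
		if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}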

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 include/linux/if_tun.h |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 06b1829..c92a291 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -34,6 +34,7 @@
 #define TUN_ONE_QUEUE	0x0080
 #define TUN_PERSIST 	0x0100	
 #define TUN_VNET_HDR 	0x0200
+#define TUN_TAP_MQ      0x0400
 
 /* Ioctl defines */
 #define TUNSETNOCSUM  _IOW('T', 200, int) 
@@ -61,6 +62,7 @@
 #define IFF_ONE_QUEUE	0x2000
 #define IFF_VNET_HDR	0x4000
 #define IFF_TUN_EXCL	0x8000
+#define IFF_MULTI_QUEUE 0x0100
 
 /* Features for GSO (TUNSETOFFLOAD). */
 #define TUN_F_CSUM	0x01	/* You can hand me unchecksummed packets. */
-- 
1.7.1



* [PATCH 4/6] tuntap: multiqueue support
       [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
                   ` (5 preceding siblings ...)
  2012-06-25 11:59 ` [PATCH 3/6] tuntap: introduce multiqueue flags Jason Wang
@ 2012-06-25 11:59 ` Jason Wang
  2012-06-25 11:59 ` [PATCH 5/6] tuntap: per queue 64 bit stats Jason Wang
  2012-06-25 11:59 ` [PATCH 6/6] tuntap: add ioctls to attach or detach a file from tuntap device Jason Wang
  8 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-25 11:59 UTC (permalink / raw)
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

This patch adds multiqueue support for the tap device. This is done by
abstracting each queue as a file/socket and allowing multiple sockets to be
attached to the tuntap device (an array of tun_file is stored in the
tun_struct). Userspace can then write to and read from those files to send and
receive packets in parallel.

Unlike the previous single queue implementation, the socket and the device are
now loosely coupled; either of them is allowed to go away first. In order to
make the tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX
to synchronize between the data path and system calls.

Tx queue selection is first based on the recorded rxq index of an skb; if
there is none, the queue is chosen based on the rx hash (skb_get_rxhash()).
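
A condensed sketch of that selection logic (it mirrors tun_get_queue() in the
diff below; the 64-bit multiply-and-shift maps a 32-bit hash onto
[0, numqueues) without a division):

	static u32 select_queue(struct sk_buff *skb, u32 numqueues)
	{
		u32 rxq;

		if (skb_rx_queue_recorded(skb))
			/* the real code subtracts numqueues in a loop instead */
			return skb_get_rx_queue(skb) % numqueues;

		rxq = skb_get_rxhash(skb);
		if (rxq)
			return ((u64)rxq * numqueues) >> 32;

		return 0;
	}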

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
 1 files changed, 232 insertions(+), 139 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8233b0a..5c26757 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -107,6 +107,8 @@ struct tap_filter {
 	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
 };
 
+#define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
+
 struct tun_file {
 	struct sock sk;
 	struct socket socket;
@@ -114,16 +116,18 @@ struct tun_file {
 	int vnet_hdr_sz;
 	struct tap_filter txflt;
 	atomic_t count;
-	struct tun_struct *tun;
+	struct tun_struct __rcu *tun;
 	struct net *net;
 	struct fasync_struct *fasync;
 	unsigned int flags;
+	u16 queue_index;
 };
 
 struct tun_sock;
 
 struct tun_struct {
-	struct tun_file		*tfile;
+	struct tun_file		*tfiles[MAX_TAP_QUEUES];
+	unsigned int            numqueues;
 	unsigned int 		flags;
 	uid_t			owner;
 	gid_t			group;
@@ -138,80 +142,159 @@ struct tun_struct {
 #endif
 };
 
-static int tun_attach(struct tun_struct *tun, struct file *file)
+static DEFINE_SPINLOCK(tun_lock);
+
+/*
+ * tun_get_queue(): calculate the queue index
+ *     - if skbs come from mq nics, we can just borrow
+ *     - if not, calculate from the hash
+ */
+static struct tun_file *tun_get_queue(struct net_device *dev,
+				      struct sk_buff *skb)
 {
-	struct tun_file *tfile = file->private_data;
-	int err;
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile = NULL;
+	int numqueues = tun->numqueues;
+	__u32 rxq;
 
-	ASSERT_RTNL();
+	BUG_ON(!rcu_read_lock_held());
 
-	netif_tx_lock_bh(tun->dev);
+	if (!numqueues)
+		goto out;
 
-	err = -EINVAL;
-	if (tfile->tun)
+	if (numqueues == 1) {
+		tfile = rcu_dereference(tun->tfiles[0]);
 		goto out;
+	}
 
-	err = -EBUSY;
-	if (tun->tfile)
+	if (likely(skb_rx_queue_recorded(skb))) {
+		rxq = skb_get_rx_queue(skb);
+
+		while (unlikely(rxq >= numqueues))
+			rxq -= numqueues;
+
+		tfile = rcu_dereference(tun->tfiles[rxq]);
 		goto out;
+	}
 
-	err = 0;
-	tfile->tun = tun;
-	tun->tfile = tfile;
-	netif_carrier_on(tun->dev);
-	dev_hold(tun->dev);
-	sock_hold(&tfile->sk);
-	atomic_inc(&tfile->count);
+	/* Check if we can use flow to select a queue */
+	rxq = skb_get_rxhash(skb);
+	if (rxq) {
+		u32 idx = ((u64)rxq * numqueues) >> 32;
+		tfile = rcu_dereference(tun->tfiles[idx]);
+		goto out;
+	}
 
+	tfile = rcu_dereference(tun->tfiles[0]);
 out:
-	netif_tx_unlock_bh(tun->dev);
-	return err;
+	return tfile;
 }
 
-static void __tun_detach(struct tun_struct *tun)
+static int tun_detach(struct tun_file *tfile, bool clean)
 {
-	struct tun_file *tfile = tun->tfile;
-	/* Detach from net device */
-	netif_tx_lock_bh(tun->dev);
-	netif_carrier_off(tun->dev);
-	tun->tfile = NULL;
-	netif_tx_unlock_bh(tun->dev);
-
-	/* Drop read queue */
-	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
-
-	/* Drop the extra count on the net device */
-	dev_put(tun->dev);
-}
+	struct tun_struct *tun;
+	struct net_device *dev = NULL;
+	bool destroy = false;
 
-static void tun_detach(struct tun_struct *tun)
-{
-	rtnl_lock();
-	__tun_detach(tun);
-	rtnl_unlock();
-}
+	spin_lock(&tun_lock);
 
-static struct tun_struct *__tun_get(struct tun_file *tfile)
-{
-	struct tun_struct *tun = NULL;
+	tun = rcu_dereference_protected(tfile->tun,
+					lockdep_is_held(&tun_lock));
+	if (tun) {
+		u16 index = tfile->queue_index;
+		BUG_ON(index >= tun->numqueues);
+		dev = tun->dev;
+
+		rcu_assign_pointer(tun->tfiles[index],
+				   tun->tfiles[tun->numqueues - 1]);
+		tun->tfiles[index]->queue_index = index;
+		rcu_assign_pointer(tfile->tun, NULL);
+		--tun->numqueues;
+		sock_put(&tfile->sk);
 
-	if (atomic_inc_not_zero(&tfile->count))
-		tun = tfile->tun;
+		if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
+			destroy = true;
+	}
 
-	return tun;
+	spin_unlock(&tun_lock);
+
+	synchronize_rcu();
+	if (clean)
+		sock_put(&tfile->sk);
+
+	if (destroy) {
+		rtnl_lock();
+		if (dev->reg_state == NETREG_REGISTERED)
+			unregister_netdevice(dev);
+		rtnl_unlock();
+	}
+
+	return 0;
 }
 
-static struct tun_struct *tun_get(struct file *file)
+static void tun_detach_all(struct net_device *dev)
 {
-	return __tun_get(file->private_data);
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
+	int i, j = 0;
+
+	spin_lock(&tun_lock);
+
+	for (i = 0; i < MAX_TAP_QUEUES && tun->numqueues; i++) {
+		tfile = rcu_dereference_protected(tun->tfiles[i],
+						lockdep_is_held(&tun_lock));
+		BUG_ON(!tfile);
+		wake_up_all(&tfile->wq.wait);
+		tfile_list[j++] = tfile;
+		rcu_assign_pointer(tfile->tun, NULL);
+		--tun->numqueues;
+	}
+	BUG_ON(tun->numqueues != 0);
+	/* guarantee that any future tun_attach will fail */
+	tun->numqueues = MAX_TAP_QUEUES;
+	spin_unlock(&tun_lock);
+
+	synchronize_rcu();
+	for (--j; j >= 0; j--)
+		sock_put(&tfile_list[j]->sk);
 }
 
-static void tun_put(struct tun_struct *tun)
+static int tun_attach(struct tun_struct *tun, struct file *file)
 {
-	struct tun_file *tfile = tun->tfile;
+	struct tun_file *tfile = file->private_data;
+	int err;
+
+	ASSERT_RTNL();
+
+	spin_lock(&tun_lock);
 
-	if (atomic_dec_and_test(&tfile->count))
-		tun_detach(tfile->tun);
+	err = -EINVAL;
+	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
+		goto out;
+
+	err = -EBUSY;
+	if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
+		goto out;
+
+	if (tun->numqueues == MAX_TAP_QUEUES)
+		goto out;
+
+	err = 0;
+	tfile->queue_index = tun->numqueues;
+	rcu_assign_pointer(tfile->tun, tun);
+	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
+	sock_hold(&tfile->sk);
+	tun->numqueues++;
+
+	if (tun->numqueues == 1)
+		netif_carrier_on(tun->dev);
+
+	/* device is allowed to go away first, so no need to hold extra
+	 * refcnt. */
+
+out:
+	spin_unlock(&tun_lock);
+	return err;
 }
 
 /* TAP filtering */
@@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
 /* Net device detach from fd. */
 static void tun_net_uninit(struct net_device *dev)
 {
-	struct tun_struct *tun = netdev_priv(dev);
-	struct tun_file *tfile = tun->tfile;
-
-	/* Inform the methods they need to stop using the dev.
-	 */
-	if (tfile) {
-		wake_up_all(&tfile->wq.wait);
-		if (atomic_dec_and_test(&tfile->count))
-			__tun_detach(tun);
-	}
+	tun_detach_all(dev);
 }
 
 /* Net device open. */
@@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct tun_struct *tun = netdev_priv(dev);
-	struct tun_file *tfile = tun->tfile;
+	struct tun_file *tfile = NULL;
 
-	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
+	rcu_read_lock();
+	tfile = tun_get_queue(dev, skb);
 
 	/* Drop packet if interface is not attached */
 	if (!tfile)
@@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
 	    >= dev->tx_queue_len) {
-		if (!(tun->flags & TUN_ONE_QUEUE)) {
+		if (!(tfile->flags & TUN_ONE_QUEUE) &&
+		    !(tfile->flags & TUN_TAP_MQ)) {
 			/* Normal queueing mode. */
 			/* Packet scheduler handles dropping of further packets. */
 			netif_stop_queue(dev);
@@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 			 * error is more appropriate. */
 			dev->stats.tx_fifo_errors++;
 		} else {
-			/* Single queue mode.
+			/* Single queue mode or multi queue mode.
 			 * Driver handles dropping of all packets itself. */
 			goto drop;
 		}
@@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
 	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
 				   POLLRDNORM | POLLRDBAND);
+	rcu_read_unlock();
 	return NETDEV_TX_OK;
 
 drop:
+	rcu_read_unlock();
 	dev->stats.tx_dropped++;
 	kfree_skb(skb);
 	return NETDEV_TX_OK;
@@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
 static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 {
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
+	struct tun_struct *tun = NULL;
 	struct sock *sk;
 	unsigned int mask = 0;
 
-	if (!tun)
+	if (!tfile)
 		return POLLERR;
 
-	sk = tfile->socket.sk;
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
+	if (!tun) {
+		rcu_read_unlock();
+		return POLLERR;
+	}
+	rcu_read_unlock();
 
-	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
+	sk = &tfile->sk;
 
 	poll_wait(file, &tfile->wq.wait, wait);
 
@@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
 	     sock_writeable(sk)))
 		mask |= POLLOUT | POLLWRNORM;
 
-	if (tun->dev->reg_state != NETREG_REGISTERED)
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
+	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
 		mask = POLLERR;
+	rcu_read_unlock();
 
-	tun_put(tun);
 	return mask;
 }
 
@@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
 		skb_shinfo(skb)->gso_segs = 0;
 	}
 
-	tun = __tun_get(tfile);
-	if (!tun)
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
+	if (!tun) {
+		rcu_read_unlock();
 		return -EBADFD;
+	}
 
 	switch (tfile->flags & TUN_TYPE_MASK) {
 	case TUN_TUN_DEV:
@@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
 		skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
-
-	netif_rx_ni(skb);
 	tun->dev->stats.rx_packets++;
 	tun->dev->stats.rx_bytes += len;
-	tun_put(tun);
+	rcu_read_unlock();
+
+	netif_rx_ni(skb);
+
 	return count;
 
 err_free:
 	count = -EINVAL;
 	kfree_skb(skb);
 err:
-	tun = __tun_get(tfile);
-	if (!tun)
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
+	if (!tun) {
+		rcu_read_unlock();
 		return -EBADFD;
+	}
 
 	if (drop)
 		tun->dev->stats.rx_dropped++;
 	if (error)
 		tun->dev->stats.rx_frame_errors++;
-	tun_put(tun);
+	rcu_read_unlock();
 	return count;
 }
 
@@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
 	total += skb->len;
 
-	tun = __tun_get(tfile);
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
 	if (tun) {
 		tun->dev->stats.tx_packets++;
 		tun->dev->stats.tx_bytes += len;
-		tun_put(tun);
 	}
+	rcu_read_unlock();
 
 	return total;
 }
@@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
 				break;
 			}
 
-			tun = __tun_get(tfile);
+			rcu_read_lock();
+			tun = rcu_dereference(tfile->tun);
 			if (!tun) {
-				ret = -EIO;
+				ret = -EBADFD;
+				rcu_read_unlock();
 				break;
 			}
 			if (tun->dev->reg_state != NETREG_REGISTERED) {
 				ret = -EIO;
-				tun_put(tun);
+				rcu_read_unlock();
 				break;
 			}
-			tun_put(tun);
+			rcu_read_unlock();
 
 			/* Nothing to read, let's sleep */
 			schedule();
 			continue;
 		}
 
-		tun = __tun_get(tfile);
+		rcu_read_lock();
+		tun = rcu_dereference(tfile->tun);
 		if (tun) {
 			netif_wake_queue(tun->dev);
-			tun_put(tun);
 		}
+		rcu_read_unlock();
 
 		ret = tun_put_user(tfile, skb, iv, len);
 		kfree_skb(skb);
@@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
 	if (tun->flags & TUN_VNET_HDR)
 		flags |= IFF_VNET_HDR;
 
+	if (tun->flags & TUN_TAP_MQ)
+		flags |= IFF_MULTI_QUEUE;
+
 	return flags;
 }
 
@@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		err = tun_attach(tun, file);
 		if (err < 0)
 			return err;
-	}
-	else {
+	} else {
 		char *name;
 		unsigned long flags = 0;
 
@@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 			TUN_USER_FEATURES;
 		dev->features = dev->hw_features;
+		if (ifr->ifr_flags & IFF_MULTI_QUEUE)
+			dev->features |= NETIF_F_LLTX;
 
 		err = register_netdevice(tun->dev);
 		if (err < 0)
@@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		err = tun_attach(tun, file);
 		if (err < 0)
-			goto failed;
+			goto err_free_dev;
 	}
 
 	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
@@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	else
 		tun->flags &= ~TUN_VNET_HDR;
 
+	if (ifr->ifr_flags & IFF_MULTI_QUEUE)
+		tun->flags |= TUN_TAP_MQ;
+	else
+		tun->flags &= ~TUN_TAP_MQ;
+
 	/* Cache flags from tun device */
 	tfile->flags = tun->flags;
 	/* Make sure persistent devices do not get stuck in
@@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_free_dev:
 	free_netdev(dev);
-failed:
 	return err;
 }
 
@@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 				(unsigned int __user*)argp);
 	}
 
-	rtnl_lock();
-
-	tun = __tun_get(tfile);
-	if (cmd == TUNSETIFF && !tun) {
+	ret = 0;
+	if (cmd == TUNSETIFF) {
+		rtnl_lock();
 		ifr.ifr_name[IFNAMSIZ-1] = '\0';
-
 		ret = tun_set_iff(tfile->net, file, &ifr);
-
+		rtnl_unlock();
 		if (ret)
-			goto unlock;
-
+			return ret;
 		if (copy_to_user(argp, &ifr, ifreq_len))
-			ret = -EFAULT;
-		goto unlock;
+			return -EFAULT;
+		return ret;
 	}
 
+	rtnl_lock();
+
+	rcu_read_lock();
+
 	ret = -EBADFD;
+	tun = rcu_dereference(tfile->tun);
 	if (!tun)
 		goto unlock;
+	else
+		ret = 0;
 
-	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
-
-	ret = 0;
 	switch (cmd) {
 	case TUNGETIFF:
 		ret = tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
+		rcu_read_unlock();
 		if (ret)
-			break;
+			goto out;
 
 		if (copy_to_user(argp, &ifr, ifreq_len))
 			ret = -EFAULT;
-		break;
+		goto out;
 
 	case TUNSETNOCSUM:
 		/* Disable/Enable checksum */
@@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		/* Get hw address */
 		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
 		ifr.ifr_hwaddr.sa_family = tun->dev->type;
+		rcu_read_unlock();
 		if (copy_to_user(argp, &ifr, ifreq_len))
 			ret = -EFAULT;
-		break;
+		goto out;
 
 	case SIOCSIFHWADDR:
 		/* Set hw address */
@@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	}
 
 unlock:
+	rcu_read_unlock();
+out:
 	rtnl_unlock();
-	if (tun)
-		tun_put(tun);
 	return ret;
 }
 
@@ -1517,6 +1624,11 @@ out:
 	return ret;
 }
 
+static void tun_sock_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
 static int tun_chr_open(struct inode *inode, struct file * file)
 {
 	struct net *net = current->nsproxy->net_ns;
@@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	sock_init_data(&tfile->socket, &tfile->sk);
 
 	tfile->sk.sk_write_space = tun_sock_write_space;
+	tfile->sk.sk_destruct = tun_sock_destruct;
 	tfile->sk.sk_sndbuf = INT_MAX;
 	file->private_data = tfile;
 
@@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 static int tun_chr_close(struct inode *inode, struct file *file)
 {
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun;
-
-	tun = __tun_get(tfile);
-	if (tun) {
-		struct net_device *dev = tun->dev;
-
-		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
-
-		__tun_detach(tun);
-
-		/* If desirable, unregister the netdevice. */
-		if (!(tun->flags & TUN_PERSIST)) {
-			rtnl_lock();
-			if (dev->reg_state == NETREG_REGISTERED)
-				unregister_netdevice(dev);
-			rtnl_unlock();
-		}
 
-		/* drop the reference that netdevice holds */
-		sock_put(&tfile->sk);
-
-	}
-
-	/* drop the reference that file holds */
-	sock_put(&tfile->sk);
+	tun_detach(tfile, true);
 
 	return 0;
 }
@@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
  * holding a reference to the file for as long as the socket is in use. */
 struct socket *tun_get_socket(struct file *file)
 {
-	struct tun_struct *tun;
+	struct tun_struct *tun = NULL;
 	struct tun_file *tfile = file->private_data;
 	if (file->f_op != &tun_fops)
 		return ERR_PTR(-EINVAL);
-	tun = tun_get(file);
-	if (!tun)
+	rcu_read_lock();
+	tun = rcu_dereference(tfile->tun);
+	if (!tun) {
+		rcu_read_unlock();
 		return ERR_PTR(-EBADFD);
-	tun_put(tun);
+	}
+	rcu_read_unlock();
 	return &tfile->socket;
 }
 EXPORT_SYMBOL_GPL(tun_get_socket);
-- 
1.7.1



* [PATCH 5/6] tuntap: per queue 64 bit stats
       [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
                   ` (6 preceding siblings ...)
  2012-06-25 11:59 ` [PATCH 4/6] tuntap: multiqueue support Jason Wang
@ 2012-06-25 11:59 ` Jason Wang
  2012-06-25 12:52   ` Eric Dumazet
  2012-06-25 11:59 ` [PATCH 6/6] tuntap: add ioctls to attach or detach a file from tuntap device Jason Wang
  8 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-25 11:59 UTC (permalink / raw)
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

As we've added multiqueue support for tun/tap, this patch converts the
statistics to per-queue 64 bit statistics.
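
For background, this relies on the usual u64_stats_sync pattern (a sketch; on
64-bit kernels the syncp compiles away, while on 32-bit it is a seqcount that
makes a reader retry instead of observing a torn 64-bit counter):

	/* writer side, assumed to be serialized per queue */
	u64_stats_update_begin(&qstats->rx_syncp);
	qstats->rx_packets++;
	qstats->rx_bytes += len;
	u64_stats_update_end(&qstats->rx_syncp);

	/* reader side, e.g. from ndo_get_stats64 */
	do {
		start = u64_stats_fetch_begin_bh(&qstats->rx_syncp);
		packets = qstats->rx_packets;
		bytes = qstats->rx_bytes;
	} while (u64_stats_fetch_retry_bh(&qstats->rx_syncp, start));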

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c |  105 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5c26757..37e62d3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -64,6 +64,7 @@
 #include <linux/nsproxy.h>
 #include <linux/virtio_net.h>
 #include <linux/rcupdate.h>
+#include <linux/u64_stats_sync.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
@@ -109,6 +110,19 @@ struct tap_filter {
 
 #define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
 
+struct tun_queue_stats {
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			tx_packets;
+	u64			tx_bytes;
+	struct u64_stats_sync	rx_syncp;
+	struct u64_stats_sync	tx_syncp;
+	u32			rx_dropped;
+	u32			tx_dropped;
+	u32			rx_frame_errors;
+	u32			tx_fifo_errors;
+};
+
 struct tun_file {
 	struct sock sk;
 	struct socket socket;
@@ -119,6 +133,7 @@ struct tun_file {
 	struct tun_struct __rcu *tun;
 	struct net *net;
 	struct fasync_struct *fasync;
+	struct tun_queue_stats stats;
 	unsigned int flags;
 	u16 queue_index;
 };
@@ -134,6 +149,7 @@ struct tun_struct {
 
 	struct net_device	*dev;
 	netdev_features_t	set_features;
+	struct tun_queue_stats	stats;
 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
 			  NETIF_F_TSO6|NETIF_F_UFO)
 
@@ -463,7 +479,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 			/* We won't see all dropped packets individually, so overrun
 			 * error is more appropriate. */
-			dev->stats.tx_fifo_errors++;
+			tfile->stats.tx_fifo_errors++;
 		} else {
 			/* Single queue mode or multi queue mode.
 			 * Driver handles dropping of all packets itself. */
@@ -488,7 +504,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 drop:
 	rcu_read_unlock();
-	dev->stats.tx_dropped++;
+	if (tfile)
+		tfile->stats.tx_dropped++;
 	kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -538,6 +555,56 @@ static void tun_poll_controller(struct net_device *dev)
 	return;
 }
 #endif
+
+static struct rtnl_link_stats64 *tun_net_stats(struct net_device *dev,
+					       struct rtnl_link_stats64 *stats)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	struct tun_file *tfile;
+	struct tun_queue_stats *qstats;
+	u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+	u32 rx_dropped = 0, tx_dropped = 0,
+	    rx_frame_errors = 0, tx_fifo_errors = 0;
+	unsigned int start;
+	int i;
+
+	rcu_read_lock();
+	for (i = 0; i < tun->numqueues; i++) {
+		tfile = rcu_dereference(tun->tfiles[i]);
+		qstats = &tfile->stats;
+
+		do {
+			start = u64_stats_fetch_begin_bh(&qstats->rx_syncp);
+			rx_packets = qstats->rx_packets;
+			rx_bytes = qstats->rx_bytes;
+		} while (u64_stats_fetch_retry_bh(&qstats->rx_syncp, start));
+
+		do {
+			start = u64_stats_fetch_begin_bh(&qstats->tx_syncp);
+			tx_packets = qstats->tx_packets;
+			tx_bytes = qstats->tx_bytes;
+		} while (u64_stats_fetch_retry_bh(&qstats->tx_syncp, start));
+
+		stats->rx_packets += rx_packets;
+		stats->rx_bytes	+= rx_bytes;
+		stats->tx_packets += tx_packets;
+		stats->tx_bytes	+= tx_bytes;
+		/* following fields are u32, no need for syncp */
+		rx_dropped += qstats->rx_dropped;
+		tx_dropped += qstats->tx_dropped;
+		rx_frame_errors += qstats->rx_frame_errors;
+		tx_fifo_errors += qstats->tx_fifo_errors;
+	}
+	rcu_read_unlock();
+
+	stats->rx_dropped = rx_dropped;
+	stats->tx_dropped = tx_dropped;
+	stats->rx_frame_errors = rx_frame_errors;
+	stats->tx_fifo_errors = tx_fifo_errors;
+
+	return stats;
+}
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
@@ -545,6 +612,7 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_start_xmit		= tun_net_xmit,
 	.ndo_change_mtu		= tun_net_change_mtu,
 	.ndo_fix_features	= tun_net_fix_features,
+	.ndo_get_stats64	= tun_net_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tun_poll_controller,
 #endif
@@ -560,6 +628,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_set_rx_mode	= tun_net_mclist,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_get_stats64	= tun_net_stats,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tun_poll_controller,
 #endif
@@ -808,30 +877,25 @@ static ssize_t tun_get_user(struct tun_file *tfile,
 		skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
-	tun->dev->stats.rx_packets++;
-	tun->dev->stats.rx_bytes += len;
 	rcu_read_unlock();
 
 	netif_rx_ni(skb);
 
+	u64_stats_update_begin(&tfile->stats.rx_syncp);
+	tfile->stats.rx_packets++;
+	tfile->stats.rx_bytes += len;
+	u64_stats_update_end(&tfile->stats.rx_syncp);
+
 	return count;
 
 err_free:
 	count = -EINVAL;
 	kfree_skb(skb);
 err:
-	rcu_read_lock();
-	tun = rcu_dereference(tfile->tun);
-	if (!tun) {
-		rcu_read_unlock();
-		return -EBADFD;
-	}
-
 	if (drop)
-		tun->dev->stats.rx_dropped++;
+		tfile->stats.rx_dropped++;
 	if (error)
-		tun->dev->stats.rx_frame_errors++;
-	rcu_read_unlock();
+		tfile->stats.rx_frame_errors++;
 	return count;
 }
 
@@ -853,7 +917,6 @@ static ssize_t tun_put_user(struct tun_file *tfile,
 			    struct sk_buff *skb,
 			    const struct iovec *iv, int len)
 {
-	struct tun_struct *tun = NULL;
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total = 0;
 
@@ -925,13 +988,10 @@ static ssize_t tun_put_user(struct tun_file *tfile,
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
 	total += skb->len;
 
-	rcu_read_lock();
-	tun = rcu_dereference(tfile->tun);
-	if (tun) {
-		tun->dev->stats.tx_packets++;
-		tun->dev->stats.tx_bytes += len;
-	}
-	rcu_read_unlock();
+	u64_stats_update_begin(&tfile->stats.tx_syncp);
+	tfile->stats.tx_packets++;
+	tfile->stats.tx_bytes += total;
+	u64_stats_update_end(&tfile->stats.tx_syncp);
 
 	return total;
 }
@@ -1650,6 +1710,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->socket.file = file;
 	tfile->socket.ops = &tun_socket_ops;
 	sock_init_data(&tfile->socket, &tfile->sk);
+	memset(&tfile->stats, 0, sizeof(struct tun_queue_stats));
 
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_destruct = tun_sock_destruct;
-- 
1.7.1



* [PATCH 6/6] tuntap: add ioctls to attach or detach a file from tuntap device
       [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
                   ` (7 preceding siblings ...)
  2012-06-25 11:59 ` [PATCH 5/6] tuntap: per queue 64 bit stats Jason Wang
@ 2012-06-25 11:59 ` Jason Wang
  8 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-25 11:59 UTC (permalink / raw)
  To: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2
  Cc: shemminger, edumazet, Jason Wang

This patch introduces two new ioctls which are used to attach and detach a
socket from a tuntap device:

1) TUNATTACHQUEUE, which is used to attach a socket to a tuntap device.
2) TUNDETACHQUEUE, which is used to detach a socket from a tuntap device. It
                  allows a socket to be detached from the device temporarily
                  and re-attached later via TUNATTACHQUEUE.
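
A hedged sketch of the intended userspace flow (the implementation below reads
a struct ifreq for TUNATTACHQUEUE and ignores the argument of TUNDETACHQUEUE;
"mq0" is a hypothetical device name):

	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "mq0", IFNAMSIZ - 1);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;

	/* park this queue: the fd stays open but stops carrying traffic */
	ioctl(queue_fd, TUNDETACHQUEUE, NULL);

	/* later, plug the same fd back into the device */
	ioctl(queue_fd, TUNATTACHQUEUE, &ifr);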

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c      |   25 ++++++++++++++++++++++---
 include/linux/if_tun.h |    3 +++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 37e62d3..25d5e1f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1411,11 +1411,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 {
 	struct tun_file *tfile = file->private_data;
 	struct tun_struct *tun;
+	struct net_device *dev = NULL;
 	void __user* argp = (void __user*)arg;
 	struct ifreq ifr;
 	int ret;
 
-	if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
+	if (cmd == TUNSETIFF || cmd == TUNATTACHQUEUE || _IOC_TYPE(cmd) == 0x89)
 		if (copy_from_user(&ifr, argp, ifreq_len))
 			return -EFAULT;
 
@@ -1424,7 +1425,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		 * This is needed because we never checked for invalid flags on
 		 * TUNSETIFF. */
 		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
-				IFF_VNET_HDR,
+				IFF_VNET_HDR | IFF_MULTI_QUEUE,
 				(unsigned int __user*)argp);
 	}
 
@@ -1440,6 +1441,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			return -EFAULT;
 		return ret;
 	}
+	if (cmd == TUNDETACHQUEUE)
+		return tun_detach(tfile, false);
 
 	rtnl_lock();
 
@@ -1447,7 +1450,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 	ret = -EBADFD;
 	tun = rcu_dereference(tfile->tun);
-	if (!tun)
+	if (!tun && cmd != TUNATTACHQUEUE)
 		goto unlock;
 	else
 		ret = 0;
@@ -1463,6 +1466,22 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			ret = -EFAULT;
 		goto out;
 
+	case TUNATTACHQUEUE:
+		dev = __dev_get_by_name(tfile->net, ifr.ifr_name);
+		if (!dev || (dev->netdev_ops != &tap_netdev_ops &&
+				dev->netdev_ops != &tun_netdev_ops))
+			ret = -EINVAL;
+		else if (ifr.ifr_flags &
+			~(IFF_TAP | IFF_TUN | IFF_NO_PI | IFF_VNET_HDR)) {
+			/* ignore illegal flag */
+			ret = -EINVAL;
+		} else {
+			tun = netdev_priv(dev);
+			tfile->flags = tun->flags;
+			ret = tun_attach(tun, file);
+		}
+		break;
+
 	case TUNSETNOCSUM:
 		/* Disable/Enable checksum */
 
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index c92a291..d3f24d8 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -54,6 +54,9 @@
 #define TUNDETACHFILTER _IOW('T', 214, struct sock_fprog)
 #define TUNGETVNETHDRSZ _IOR('T', 215, int)
 #define TUNSETVNETHDRSZ _IOW('T', 216, int)
+#define TUNATTACHQUEUE  _IOW('T', 217, int)
+#define TUNDETACHQUEUE  _IOW('T', 218, int)
+
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
1.7.1



* Re: [PATCH 5/6] tuntap: per queue 64 bit stats
  2012-06-25 11:59 ` [PATCH 5/6] tuntap: per queue 64 bit stats Jason Wang
@ 2012-06-25 12:52   ` Eric Dumazet
  2012-06-26  6:00     ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2012-06-25 12:52 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2, shemminger,
	edumazet

On Mon, 2012-06-25 at 19:59 +0800, Jason Wang wrote:
> As we've added multiqueue support for tun/tap, this patch converts the
> statistics to per-queue 64 bit statistics.

LLTX means you can have several cpus calling the TX path in parallel.

So tx stats are wrong (even before this patch), and racy after this patch
(if several cpus access the same queue, which seems possible)

       u64_stats_update_begin(&tfile->stats.tx_syncp);
       tfile->stats.tx_packets++;
       tfile->stats.tx_bytes += total;
       u64_stats_update_end(&tfile->stats.tx_syncp);
 
This can break horribly if several cpus run this code using the same
'tfile' pointer.

I suggest this patch comes before 'tuntap: multiqueue support' in the
series.
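
To make the failure mode concrete: u64_stats_update_begin() is a plain
seqcount write, so two cpus entering it on the same syncp can race on the
sequence counter itself, and a reader may then see an even count while an
update is still in flight and accept a torn value. A sketch of the
conventional remedy, per-cpu counters (hypothetical here; the 'pcpu_stats'
field is not in the posted patch):

	struct tun_queue_stats *s = this_cpu_ptr(tfile->pcpu_stats);

	/* each cpu owns its own writer section; readers sum across cpus */
	u64_stats_update_begin(&s->tx_syncp);
	s->tx_packets++;
	s->tx_bytes += total;
	u64_stats_update_end(&s->tx_syncp);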





* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-25  8:25   ` [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support Michael S. Tsirkin
  2012-06-25  8:41     ` Michael S. Tsirkin
@ 2012-06-26  3:42     ` Jason Wang
  2012-06-26 10:42       ` Michael S. Tsirkin
  2012-06-26  5:52     ` Jason Wang
  2 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-26  3:42 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>> This patch adds multiqueue support for the tap device. This is done by
>> abstracting each queue as a file/socket and allowing multiple sockets to be
>> attached to the tuntap device (an array of tun_file is stored in the
>> tun_struct). Userspace can then write to and read from those files to send
>> and receive packets in parallel.
>>
>> Unlike the previous single queue implementation, the socket and the device
>> are now loosely coupled; either of them is allowed to go away first. In
>> order to make the tx path lockless, netif_tx_lock_bh() is replaced by
>> RCU/NETIF_F_LLTX to synchronize between the data path and system calls.
> Don't use LLTX/RCU. It's not worth it.
> Use something like netif_set_real_num_tx_queues.
>
>> Tx queue selection is first based on the recorded rxq index of an skb; if
>> there is none, the queue is chosen based on the rx hash (skb_get_rxhash()).
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Interestingly macvtap switched to hashing first:
> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> (the commit log is corrupted but see what it
> does in the patch).
> Any idea why?

Yes, so tap should be changed to behave same as macvtap. I remember the 
reason we do that is to make sure the packet of a single flow to be 
queued to a fixed socket/virtqueues. As 10g cards like ixgbe choose the 
rx queue for a flow based on the last tx queue where the packets of that 
flow comes. So if we are using recored rx queue in macvtap, the queue 
index of a flow would change as vhost thread moves amongs processors.

But while testing tun/tap, one interesting thing I found is that even
though ixgbe records the queue index during rx, it seems to be lost by
the time tap transmits the skbs to userspace.
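
To make that concrete, here is a sketch (my reading of the macvtap
behaviour, untested) of hash-first selection in tun_get_queue():

	/* hash first; recorded rx queue only as a fallback */
	rxq = skb_get_rxhash(skb);
	if (rxq)
		return rcu_dereference(tun->tfiles[((u64)rxq * numqueues) >> 32]);

	if (skb_rx_queue_recorded(skb))
		return rcu_dereference(tun->tfiles[skb_get_rx_queue(skb) %
						   numqueues]);

	return rcu_dereference(tun->tfiles[0]);

That way a flow keeps the same queue no matter which tx queue the NIC
last saw it on.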

>> ---
>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 8233b0a..5c26757 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -107,6 +107,8 @@ struct tap_filter {
>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>   };
>>
>> +#define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
> Why the limit? I am guessing you copied this from macvtap?
> This is problematic for a number of reasons:
> 	- will not play well with migration
> 	- will not work well for a large guest
>
> Yes, macvtap needs to be fixed too.
>
> I am guessing what it is trying to prevent is queueing
> up a huge number of packets?
> So just divide the default tx queue limit by the # of queues.
>
> And by the way, for MQ applications maybe we can finally
> ignore tx queue altogether and limit the total number
> of bytes queued?
> To avoid regressions we can make it large like 64M/# queues.
> Could be a separate patch I think, and for a single queue
> might need a compatible mode though I am not sure.
>
>> +
>>   struct tun_file {
>>   	struct sock sk;
>>   	struct socket socket;
>> @@ -114,16 +116,18 @@ struct tun_file {
>>   	int vnet_hdr_sz;
>>   	struct tap_filter txflt;
>>   	atomic_t count;
>> -	struct tun_struct *tun;
>> +	struct tun_struct __rcu *tun;
>>   	struct net *net;
>>   	struct fasync_struct *fasync;
>>   	unsigned int flags;
>> +	u16 queue_index;
>>   };
>>
>>   struct tun_sock;
>>
>>   struct tun_struct {
>> -	struct tun_file		*tfile;
>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>> +	unsigned int            numqueues;
>>   	unsigned int 		flags;
>>   	uid_t			owner;
>>   	gid_t			group;
>> @@ -138,80 +142,159 @@ struct tun_struct {
>>   #endif
>>   };
>>
>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>> +static DEFINE_SPINLOCK(tun_lock);
>> +
>> +/*
>> + * tun_get_queue(): calculate the queue index
>> + *     - if skbs comes from mq nics, we can just borrow
>> + *     - if not, calculate from the hash
>> + */
>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>> +				      struct sk_buff *skb)
>>   {
>> -	struct tun_file *tfile = file->private_data;
>> -	int err;
>> +	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile = NULL;
>> +	int numqueues = tun->numqueues;
>> +	__u32 rxq;
>>
>> -	ASSERT_RTNL();
>> +	BUG_ON(!rcu_read_lock_held());
>>
>> -	netif_tx_lock_bh(tun->dev);
>> +	if (!numqueues)
>> +		goto out;
>>
>> -	err = -EINVAL;
>> -	if (tfile->tun)
>> +	if (numqueues == 1) {
>> +		tfile = rcu_dereference(tun->tfiles[0]);
> Instead of hacks like this, you can ask for an MQ
> flag to be set in SETIFF. Then you won't need to
> handle attach/detach at random times.
> And most of the scary num_queues checks can go away.
> You can then also ask userspace about the max # of queues
> to expect if you want to save some memory.
>
>
>>   		goto out;
>> +	}
>>
>> -	err = -EBUSY;
>> -	if (tun->tfile)
>> +	if (likely(skb_rx_queue_recorded(skb))) {
>> +		rxq = skb_get_rx_queue(skb);
>> +
>> +		while (unlikely(rxq >= numqueues))
>> +			rxq -= numqueues;
>> +
>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>   		goto out;
>> +	}
>>
>> -	err = 0;
>> -	tfile->tun = tun;
>> -	tun->tfile = tfile;
>> -	netif_carrier_on(tun->dev);
>> -	dev_hold(tun->dev);
>> -	sock_hold(&tfile->sk);
>> -	atomic_inc(&tfile->count);
>> +	/* Check if we can use flow to select a queue */
>> +	rxq = skb_get_rxhash(skb);
>> +	if (rxq) {
>> +		u32 idx = ((u64)rxq * numqueues) >> 32;
> This completely confuses me. What's the logic here?
> How do we even know it's in range?
>
>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>> +		goto out;
>> +	}
>>
>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>   out:
>> -	netif_tx_unlock_bh(tun->dev);
>> -	return err;
>> +	return tfile;
>>   }
>>
>> -static void __tun_detach(struct tun_struct *tun)
>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>   {
>> -	struct tun_file *tfile = tun->tfile;
>> -	/* Detach from net device */
>> -	netif_tx_lock_bh(tun->dev);
>> -	netif_carrier_off(tun->dev);
>> -	tun->tfile = NULL;
>> -	netif_tx_unlock_bh(tun->dev);
>> -
>> -	/* Drop read queue */
>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>> -
>> -	/* Drop the extra count on the net device */
>> -	dev_put(tun->dev);
>> -}
>> +	struct tun_struct *tun;
>> +	struct net_device *dev = NULL;
>> +	bool destroy = false;
>>
>> -static void tun_detach(struct tun_struct *tun)
>> -{
>> -	rtnl_lock();
>> -	__tun_detach(tun);
>> -	rtnl_unlock();
>> -}
>> +	spin_lock(&tun_lock);
>>
>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>> -{
>> -	struct tun_struct *tun = NULL;
>> +	tun = rcu_dereference_protected(tfile->tun,
>> +					lockdep_is_held(&tun_lock));
>> +	if (tun) {
>> +		u16 index = tfile->queue_index;
>> +		BUG_ON(index>= tun->numqueues);
>> +		dev = tun->dev;
>> +
>> +		rcu_assign_pointer(tun->tfiles[index],
>> +				   tun->tfiles[tun->numqueues - 1]);
>> +		tun->tfiles[index]->queue_index = index;
>> +		rcu_assign_pointer(tfile->tun, NULL);
>> +		--tun->numqueues;
>> +		sock_put(&tfile->sk);
>>
>> -	if (atomic_inc_not_zero(&tfile->count))
>> -		tun = tfile->tun;
>> +		if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
>> +			destroy = true;
> Please don't use flags like that. Use dedicated labels and goto there on error.
>
>
>> +	}
>>
>> -	return tun;
>> +	spin_unlock(&tun_lock);
>> +
>> +	synchronize_rcu();
>> +	if (clean)
>> +		sock_put(&tfile->sk);
>> +
>> +	if (destroy) {
>> +		rtnl_lock();
>> +		if (dev->reg_state == NETREG_REGISTERED)
>> +			unregister_netdevice(dev);
>> +		rtnl_unlock();
>> +	}
>> +
>> +	return 0;
>>   }
>>
>> -static struct tun_struct *tun_get(struct file *file)
>> +static void tun_detach_all(struct net_device *dev)
>>   {
>> -	return __tun_get(file->private_data);
>> +	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>> +	int i, j = 0;
>> +
>> +	spin_lock(&tun_lock);
>> +
>> +	for (i = 0; i<  MAX_TAP_QUEUES&&  tun->numqueues; i++) {
>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>> +						lockdep_is_held(&tun_lock));
>> +		BUG_ON(!tfile);
>> +		wake_up_all(&tfile->wq.wait);
>> +		tfile_list[j++] = tfile;
>> +		rcu_assign_pointer(tfile->tun, NULL);
>> +		--tun->numqueues;
>> +	}
>> +	BUG_ON(tun->numqueues != 0);
>> +	/* guarantee that any future tun_attach will fail */
>> +	tun->numqueues = MAX_TAP_QUEUES;
>> +	spin_unlock(&tun_lock);
>> +
>> +	synchronize_rcu();
>> +	for (--j; j>= 0; j--)
>> +		sock_put(&tfile_list[j]->sk);
>>   }
>>
>> -static void tun_put(struct tun_struct *tun)
>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>   {
>> -	struct tun_file *tfile = tun->tfile;
>> +	struct tun_file *tfile = file->private_data;
>> +	int err;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	spin_lock(&tun_lock);
>>
>> -	if (atomic_dec_and_test(&tfile->count))
>> -		tun_detach(tfile->tun);
>> +	err = -EINVAL;
>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>> +		goto out;
>> +
>> +	err = -EBUSY;
>> +	if (!(tun->flags&  TUN_TAP_MQ)&&  tun->numqueues == 1)
>> +		goto out;
>> +
>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>> +		goto out;
>> +
>> +	err = 0;
>> +	tfile->queue_index = tun->numqueues;
>> +	rcu_assign_pointer(tfile->tun, tun);
>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>> +	sock_hold(&tfile->sk);
>> +	tun->numqueues++;
>> +
>> +	if (tun->numqueues == 1)
>> +		netif_carrier_on(tun->dev);
>> +
>> +	/* device is allowed to go away first, so no need to hold extra
>> +	 * refcnt. */
>> +
>> +out:
>> +	spin_unlock(&tun_lock);
>> +	return err;
>>   }
>>
>>   /* TAP filtering */
>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>   /* Net device detach from fd. */
>>   static void tun_net_uninit(struct net_device *dev)
>>   {
>> -	struct tun_struct *tun = netdev_priv(dev);
>> -	struct tun_file *tfile = tun->tfile;
>> -
>> -	/* Inform the methods they need to stop using the dev.
>> -	 */
>> -	if (tfile) {
>> -		wake_up_all(&tfile->wq.wait);
>> -		if (atomic_dec_and_test(&tfile->count))
>> -			__tun_detach(tun);
>> -	}
>> +	tun_detach_all(dev);
>>   }
>>
>>   /* Net device open. */
>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>   /* Net device start xmit */
>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   {
>> -	struct tun_struct *tun = netdev_priv(dev);
>> -	struct tun_file *tfile = tun->tfile;
>> +	struct tun_file *tfile = NULL;
>>
>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>> +	rcu_read_lock();
>> +	tfile = tun_get_queue(dev, skb);
>>
>>   	/* Drop packet if interface is not attached */
>>   	if (!tfile)
>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>
>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>   	>= dev->tx_queue_len) {
>> -		if (!(tun->flags&  TUN_ONE_QUEUE)) {
>> +		if (!(tfile->flags&  TUN_ONE_QUEUE)&&
> Which patch moved flags from tun to tfile?
>
>> +		    !(tfile->flags&  TUN_TAP_MQ)) {
>>   			/* Normal queueing mode. */
>>   			/* Packet scheduler handles dropping of further packets. */
>>   			netif_stop_queue(dev);
>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   			 * error is more appropriate. */
>>   			dev->stats.tx_fifo_errors++;
>>   		} else {
>> -			/* Single queue mode.
>> +			/* Single queue mode or multi queue mode.
>>   			 * Driver handles dropping of all packets itself. */
> Please don't do this. Stop the queue on overrun as appropriate.
> ONE_QUEUE is a legacy hack.
>
> BTW we really should stop queue before we start dropping packets,
> but that can be a separate patch.
>
>>   			goto drop;
>>   		}
>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>   				   POLLRDNORM | POLLRDBAND);
>> +	rcu_read_unlock();
>>   	return NETDEV_TX_OK;
>>
>>   drop:
>> +	rcu_read_unlock();
>>   	dev->stats.tx_dropped++;
>>   	kfree_skb(skb);
>>   	return NETDEV_TX_OK;
>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>   {
>>   	struct tun_file *tfile = file->private_data;
>> -	struct tun_struct *tun = __tun_get(tfile);
>> +	struct tun_struct *tun = NULL;
>>   	struct sock *sk;
>>   	unsigned int mask = 0;
>>
>> -	if (!tun)
>> +	if (!tfile)
>>   		return POLLERR;
>>
>> -	sk = tfile->socket.sk;
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>> +		return POLLERR;
>> +	}
>> +	rcu_read_unlock();
>>
>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>> +	sk =&tfile->sk;
>>
>>   	poll_wait(file,&tfile->wq.wait, wait);
>>
>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>   	     sock_writeable(sk)))
>>   		mask |= POLLOUT | POLLWRNORM;
>>
>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>   		mask = POLLERR;
>> +	rcu_read_unlock();
>>
>> -	tun_put(tun);
>>   	return mask;
>>   }
>>
>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>   		skb_shinfo(skb)->gso_segs = 0;
>>   	}
>>
>> -	tun = __tun_get(tfile);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return -EBADFD;
>> +	}
>>
>>   	switch (tfile->flags&  TUN_TYPE_MASK) {
>>   	case TUN_TUN_DEV:
>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>   		break;
>>   	}
>> -
>> -	netif_rx_ni(skb);
>>   	tun->dev->stats.rx_packets++;
>>   	tun->dev->stats.rx_bytes += len;
>> -	tun_put(tun);
>> +	rcu_read_unlock();
>> +
>> +	netif_rx_ni(skb);
>> +
>>   	return count;
>>
>>   err_free:
>>   	count = -EINVAL;
>>   	kfree_skb(skb);
>>   err:
>> -	tun = __tun_get(tfile);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return -EBADFD;
>> +	}
>>
>>   	if (drop)
>>   		tun->dev->stats.rx_dropped++;
>>   	if (error)
>>   		tun->dev->stats.rx_frame_errors++;
>> -	tun_put(tun);
>> +	rcu_read_unlock();
>>   	return count;
>>   }
>>
>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>   	total += skb->len;
>>
>> -	tun = __tun_get(tfile);
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>>   	if (tun) {
>>   		tun->dev->stats.tx_packets++;
>>   		tun->dev->stats.tx_bytes += len;
>> -		tun_put(tun);
>>   	}
>> +	rcu_read_unlock();
>>
>>   	return total;
>>   }
>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>   				break;
>>   			}
>>
>> -			tun = __tun_get(tfile);
>> +			rcu_read_lock();
>> +			tun = rcu_dereference(tfile->tun);
>>   			if (!tun) {
>> -				ret = -EIO;
>> +				ret = -EBADFD;
> BADFD is for when you get passed something like -1 fd.
> Here fd is OK, it's just in a bad state so you can not do IO.
>
>
>> +				rcu_read_unlock();
>>   				break;
>>   			}
>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>   				ret = -EIO;
>> -				tun_put(tun);
>> +				rcu_read_unlock();
>>   				break;
>>   			}
>> -			tun_put(tun);
>> +			rcu_read_unlock();
>>
>>   			/* Nothing to read, let's sleep */
>>   			schedule();
>>   			continue;
>>   		}
>>
>> -		tun = __tun_get(tfile);
>> +		rcu_read_lock();
>> +		tun = rcu_dereference(tfile->tun);
>>   		if (tun) {
>>   			netif_wake_queue(tun->dev);
>> -			tun_put(tun);
>>   		}
>> +		rcu_read_unlock();
>>
>>   		ret = tun_put_user(tfile, skb, iv, len);
>>   		kfree_skb(skb);
>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>   	if (tun->flags&  TUN_VNET_HDR)
>>   		flags |= IFF_VNET_HDR;
>>
>> +	if (tun->flags&  TUN_TAP_MQ)
>> +		flags |= IFF_MULTI_QUEUE;
>> +
>>   	return flags;
>>   }
>>
>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		err = tun_attach(tun, file);
>>   		if (err<  0)
>>   			return err;
>> -	}
>> -	else {
>> +	} else {
>>   		char *name;
>>   		unsigned long flags = 0;
>>
>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>   			TUN_USER_FEATURES;
>>   		dev->features = dev->hw_features;
>> +		if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
>> +			dev->features |= NETIF_F_LLTX;
>>
>>   		err = register_netdevice(tun->dev);
>>   		if (err<  0)
>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>
>>   		err = tun_attach(tun, file);
>>   		if (err<  0)
>> -			goto failed;
>> +			goto err_free_dev;
>>   	}
>>
>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   	else
>>   		tun->flags&= ~TUN_VNET_HDR;
>>
>> +	if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
>> +		tun->flags |= TUN_TAP_MQ;
>> +	else
>> +		tun->flags&= ~TUN_TAP_MQ;
>> +
>>   	/* Cache flags from tun device */
>>   	tfile->flags = tun->flags;
>>   	/* Make sure persistent devices do not get stuck in
>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>
>>   err_free_dev:
>>   	free_netdev(dev);
>> -failed:
>>   	return err;
>>   }
>>
>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   				(unsigned int __user*)argp);
>>   	}
>>
>> -	rtnl_lock();
>> -
>> -	tun = __tun_get(tfile);
>> -	if (cmd == TUNSETIFF&&  !tun) {
>> +	ret = 0;
>> +	if (cmd == TUNSETIFF) {
>> +		rtnl_lock();
>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>> -
>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>> -
>> +		rtnl_unlock();
>>   		if (ret)
>> -			goto unlock;
>> -
>> +			return ret;
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>> -			ret = -EFAULT;
>> -		goto unlock;
>> +			return -EFAULT;
>> +		return ret;
>>   	}
>>
>> +	rtnl_lock();
>> +
>> +	rcu_read_lock();
>> +
>>   	ret = -EBADFD;
>> +	tun = rcu_dereference(tfile->tun);
>>   	if (!tun)
>>   		goto unlock;
>> +	else
>> +		ret = 0;
>>
>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>> -
>> -	ret = 0;
>>   	switch (cmd) {
>>   	case TUNGETIFF:
>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>> +		rcu_read_unlock();
>>   		if (ret)
>> -			break;
>> +			goto out;
>>
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>   			ret = -EFAULT;
>> -		break;
>> +		goto out;
>>
>>   	case TUNSETNOCSUM:
>>   		/* Disable/Enable checksum */
>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   		/* Get hw address */
>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>> +		rcu_read_unlock();
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>   			ret = -EFAULT;
>> -		break;
>> +		goto out;
>>
>>   	case SIOCSIFHWADDR:
>>   		/* Set hw address */
>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   	}
>>
>>   unlock:
>> +	rcu_read_unlock();
>> +out:
>>   	rtnl_unlock();
>> -	if (tun)
>> -		tun_put(tun);
>>   	return ret;
>>   }
>>
>> @@ -1517,6 +1624,11 @@ out:
>>   	return ret;
>>   }
>>
>> +static void tun_sock_destruct(struct sock *sk)
>> +{
>> +	skb_queue_purge(&sk->sk_receive_queue);
>> +}
>> +
>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>   {
>>   	struct net *net = current->nsproxy->net_ns;
>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>
>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>   	file->private_data = tfile;
>>
>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>   {
>>   	struct tun_file *tfile = file->private_data;
>> -	struct tun_struct *tun;
>> -
>> -	tun = __tun_get(tfile);
>> -	if (tun) {
>> -		struct net_device *dev = tun->dev;
>> -
>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>> -
>> -		__tun_detach(tun);
>> -
>> -		/* If desirable, unregister the netdevice. */
>> -		if (!(tun->flags&  TUN_PERSIST)) {
>> -			rtnl_lock();
>> -			if (dev->reg_state == NETREG_REGISTERED)
>> -				unregister_netdevice(dev);
>> -			rtnl_unlock();
>> -		}
>>
>> -		/* drop the reference that netdevice holds */
>> -		sock_put(&tfile->sk);
>> -
>> -	}
>> -
>> -	/* drop the reference that file holds */
>> -	sock_put(&tfile->sk);
>> +	tun_detach(tfile, true);
>>
>>   	return 0;
>>   }
>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>    * holding a reference to the file for as long as the socket is in use. */
>>   struct socket *tun_get_socket(struct file *file)
>>   {
>> -	struct tun_struct *tun;
>> +	struct tun_struct *tun = NULL;
>>   	struct tun_file *tfile = file->private_data;
>>   	if (file->f_op !=&tun_fops)
>>   		return ERR_PTR(-EINVAL);
>> -	tun = tun_get(file);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return ERR_PTR(-EBADFD);
>> -	tun_put(tun);
>> +	}
>> +	rcu_read_unlock();
>>   	return&tfile->socket;
>>   }
>>   EXPORT_SYMBOL_GPL(tun_get_socket);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-25  8:25   ` [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support Michael S. Tsirkin
  2012-06-25  8:41     ` Michael S. Tsirkin
  2012-06-26  3:42     ` Jason Wang
@ 2012-06-26  5:52     ` Jason Wang
  2012-06-26 11:54       ` Michael S. Tsirkin
  2 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-26  5:52 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet

On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>> This patch adds multiqueue support for tap device. This is done by abstracting
>> each queue as a file/socket and allowing multiple sockets to be attached to the
>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>> could write and read from those files to do the parallel packet
>> sending/receiving.
>>
>> Unlike the previous single queue implementation, the socket and device were
>> loosely coupled, each of them were allowed to go away first. In order to let the
>> tx path lockless, netif_tx_loch_bh() is replaced by RCU/NETIF_F_LLTX to
>> synchronize between data path and system call.
> Don't use LLTX/RCU. It's not worth it.
> Use something like netif_set_real_num_tx_queues.
>

For LLTX, maybe it's better to convert to alloc_netdev_mq() to let the
kernel see all queues and make queue stopping and per-queue stats easier.
RCU is used to handle attaching/detaching while tun/tap is sending and
receiving packets, which looks reasonable to me. I'm not sure
netif_set_real_num_tx_queues() can help in this situation.
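
For the alloc_netdev_mq() part, roughly what I mean (a sketch only; the
queue count is a guess):

	dev = alloc_netdev_mq(sizeof(struct tun_struct), name,
			      tun_setup, MAX_TAP_QUEUES);

	/* once the core knows about every queue, overrun handling and
	 * stats can be done per tx queue instead of per device */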

>> The tx queue selecting is first based on the recorded rxq index of an skb, it
>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> Interestingly macvtap switched to hashing first:
> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> (the commit log is corrupted but see what it
> does in the patch).
> Any idea why?
>
>> ---
>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 8233b0a..5c26757 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -107,6 +107,8 @@ struct tap_filter {
>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>   };
>>
>> +#define MAX_TAP_QUEUES (NR_CPUS < 16 ? NR_CPUS : 16)
> Why the limit? I am guessing you copied this from macvtap?
> This is problematic for a number of reasons:
> 	- will not play well with migration
> 	- will not work well for a large guest
>
> Yes, macvtap needs to be fixed too.
>
> I am guessing what it is trying to prevent is queueing
> up a huge number of packets?
> So just divide the default tx queue limit by the # of queues.

Not sure; other reasons I can guess:
- to avoid storing a large array of pointers in tun_struct or macvlan_dev.
- it may not be suitable to allow the number of virtqueues to be greater
than the number of physical queues in the card

>
> And by the way, for MQ applications maybe we can finally
> ignore tx queue altogether and limit the total number
> of bytes queued?
> To avoid regressions we can make it large like 64M/# queues.
> Could be a separate patch I think, and for a single queue
> might need a compatible mode though I am not sure.

Could you explain more about this? Did you mean to have a total sndbuf
for all sockets that are attached to tun/tap?
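
For the 'divide the default tx queue limit' part, I read it as something
like (just a sketch of my reading):

	/* per-queue budget = device budget / number of queues */
	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >=
	    dev->tx_queue_len / tun->numqueues)
		goto drop;

or is the 64M/# queues byte limit meant to replace this entirely?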
>> +
>>   struct tun_file {
>>   	struct sock sk;
>>   	struct socket socket;
>> @@ -114,16 +116,18 @@ struct tun_file {
>>   	int vnet_hdr_sz;
>>   	struct tap_filter txflt;
>>   	atomic_t count;
>> -	struct tun_struct *tun;
>> +	struct tun_struct __rcu *tun;
>>   	struct net *net;
>>   	struct fasync_struct *fasync;
>>   	unsigned int flags;
>> +	u16 queue_index;
>>   };
>>
>>   struct tun_sock;
>>
>>   struct tun_struct {
>> -	struct tun_file		*tfile;
>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>> +	unsigned int            numqueues;
>>   	unsigned int 		flags;
>>   	uid_t			owner;
>>   	gid_t			group;
>> @@ -138,80 +142,159 @@ struct tun_struct {
>>   #endif
>>   };
>>
>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>> +static DEFINE_SPINLOCK(tun_lock);
>> +
>> +/*
>> + * tun_get_queue(): calculate the queue index
>> + *     - if skbs comes from mq nics, we can just borrow
>> + *     - if not, calculate from the hash
>> + */
>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>> +				      struct sk_buff *skb)
>>   {
>> -	struct tun_file *tfile = file->private_data;
>> -	int err;
>> +	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile = NULL;
>> +	int numqueues = tun->numqueues;
>> +	__u32 rxq;
>>
>> -	ASSERT_RTNL();
>> +	BUG_ON(!rcu_read_lock_held());
>>
>> -	netif_tx_lock_bh(tun->dev);
>> +	if (!numqueues)
>> +		goto out;
>>
>> -	err = -EINVAL;
>> -	if (tfile->tun)
>> +	if (numqueues == 1) {
>> +		tfile = rcu_dereference(tun->tfiles[0]);
> Instead of hacks like this, you can ask for an MQ
> flag to be set in SETIFF. Then you won't need to
> handle attach/detach at random times.

Consider a user switching a guest between single-queue and multi-queue:
qemu would attach or detach fds at times the kernel cannot predict.
> And most of the scary num_queues checks can go away.

Even if we had an MQ flag, userspace could still attach just one queue
to the device.
> You can then also ask userspace about the max # of queues
> to expect if you want to save some memory.
>

Yes, good suggestion.
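
Something along these lines, perhaps (TUNSETNUMQUEUES and max_queues are
hypothetical names, not an existing interface):

	case TUNSETNUMQUEUES:
		if (get_user(n, (int __user *)argp)) {
			ret = -EFAULT;
			break;
		}
		if (n < 1 || n > MAX_TAP_QUEUES) {
			ret = -EINVAL;
			break;
		}
		/* size the tfiles array from this instead of always
		 * reserving MAX_TAP_QUEUES slots */
		tun->max_queues = n;
		break;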
>>   		goto out;
>> +	}
>>
>> -	err = -EBUSY;
>> -	if (tun->tfile)
>> +	if (likely(skb_rx_queue_recorded(skb))) {
>> +		rxq = skb_get_rx_queue(skb);
>> +
>> +		while (unlikely(rxq >= numqueues))
>> +			rxq -= numqueues;
>> +
>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>   		goto out;
>> +	}
>>
>> -	err = 0;
>> -	tfile->tun = tun;
>> -	tun->tfile = tfile;
>> -	netif_carrier_on(tun->dev);
>> -	dev_hold(tun->dev);
>> -	sock_hold(&tfile->sk);
>> -	atomic_inc(&tfile->count);
>> +	/* Check if we can use flow to select a queue */
>> +	rxq = skb_get_rxhash(skb);
>> +	if (rxq) {
>> +		u32 idx = ((u64)rxq * numqueues) >> 32;
> This completely confuses me. What's the logic here?
> How do we even know it's in range?
>

rxq is a u32, so ((u64)rxq * numqueues) >> 32 is always less than
numqueues.
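
In other words it is a fixed-point scaling: for a 32-bit hash h and n
queues, ((u64)h * n) >> 32 == floor(h * n / 2^32), and since h < 2^32
the result is always in [0, n). As a tiny sketch:

	static inline u32 hash_to_queue(u32 h, u32 n)
	{
		/* maps h uniformly onto [0, n) without a modulo */
		return ((u64)h * n) >> 32;
	}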
>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>> +		goto out;
>> +	}
>>
>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>   out:
>> -	netif_tx_unlock_bh(tun->dev);
>> -	return err;
>> +	return tfile;
>>   }
>>
>> -static void __tun_detach(struct tun_struct *tun)
>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>   {
>> -	struct tun_file *tfile = tun->tfile;
>> -	/* Detach from net device */
>> -	netif_tx_lock_bh(tun->dev);
>> -	netif_carrier_off(tun->dev);
>> -	tun->tfile = NULL;
>> -	netif_tx_unlock_bh(tun->dev);
>> -
>> -	/* Drop read queue */
>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>> -
>> -	/* Drop the extra count on the net device */
>> -	dev_put(tun->dev);
>> -}
>> +	struct tun_struct *tun;
>> +	struct net_device *dev = NULL;
>> +	bool destroy = false;
>>
>> -static void tun_detach(struct tun_struct *tun)
>> -{
>> -	rtnl_lock();
>> -	__tun_detach(tun);
>> -	rtnl_unlock();
>> -}
>> +	spin_lock(&tun_lock);
>>
>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>> -{
>> -	struct tun_struct *tun = NULL;
>> +	tun = rcu_dereference_protected(tfile->tun,
>> +					lockdep_is_held(&tun_lock));
>> +	if (tun) {
>> +		u16 index = tfile->queue_index;
>> +		BUG_ON(index>= tun->numqueues);
>> +		dev = tun->dev;
>> +
>> +		rcu_assign_pointer(tun->tfiles[index],
>> +				   tun->tfiles[tun->numqueues - 1]);
>> +		tun->tfiles[index]->queue_index = index;
>> +		rcu_assign_pointer(tfile->tun, NULL);
>> +		--tun->numqueues;
>> +		sock_put(&tfile->sk);
>>
>> -	if (atomic_inc_not_zero(&tfile->count))
>> -		tun = tfile->tun;
>> +		if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
>> +			destroy = true;
> Please don't use flags like that. Use dedicated labels and goto there on error.

ok.
>
>> +	}
>>
>> -	return tun;
>> +	spin_unlock(&tun_lock);
>> +
>> +	synchronize_rcu();
>> +	if (clean)
>> +		sock_put(&tfile->sk);
>> +
>> +	if (destroy) {
>> +		rtnl_lock();
>> +		if (dev->reg_state == NETREG_REGISTERED)
>> +			unregister_netdevice(dev);
>> +		rtnl_unlock();
>> +	}
>> +
>> +	return 0;
>>   }
>>
>> -static struct tun_struct *tun_get(struct file *file)
>> +static void tun_detach_all(struct net_device *dev)
>>   {
>> -	return __tun_get(file->private_data);
>> +	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>> +	int i, j = 0;
>> +
>> +	spin_lock(&tun_lock);
>> +
>> +	for (i = 0; i<  MAX_TAP_QUEUES&&  tun->numqueues; i++) {
>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>> +						lockdep_is_held(&tun_lock));
>> +		BUG_ON(!tfile);
>> +		wake_up_all(&tfile->wq.wait);
>> +		tfile_list[j++] = tfile;
>> +		rcu_assign_pointer(tfile->tun, NULL);
>> +		--tun->numqueues;
>> +	}
>> +	BUG_ON(tun->numqueues != 0);
>> +	/* guarantee that any future tun_attach will fail */
>> +	tun->numqueues = MAX_TAP_QUEUES;
>> +	spin_unlock(&tun_lock);
>> +
>> +	synchronize_rcu();
>> +	for (--j; j>= 0; j--)
>> +		sock_put(&tfile_list[j]->sk);
>>   }
>>
>> -static void tun_put(struct tun_struct *tun)
>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>   {
>> -	struct tun_file *tfile = tun->tfile;
>> +	struct tun_file *tfile = file->private_data;
>> +	int err;
>> +
>> +	ASSERT_RTNL();
>> +
>> +	spin_lock(&tun_lock);
>>
>> -	if (atomic_dec_and_test(&tfile->count))
>> -		tun_detach(tfile->tun);
>> +	err = -EINVAL;
>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>> +		goto out;
>> +
>> +	err = -EBUSY;
>> +	if (!(tun->flags&  TUN_TAP_MQ)&&  tun->numqueues == 1)
>> +		goto out;
>> +
>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>> +		goto out;
>> +
>> +	err = 0;
>> +	tfile->queue_index = tun->numqueues;
>> +	rcu_assign_pointer(tfile->tun, tun);
>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>> +	sock_hold(&tfile->sk);
>> +	tun->numqueues++;
>> +
>> +	if (tun->numqueues == 1)
>> +		netif_carrier_on(tun->dev);
>> +
>> +	/* device is allowed to go away first, so no need to hold extra
>> +	 * refcnt. */
>> +
>> +out:
>> +	spin_unlock(&tun_lock);
>> +	return err;
>>   }
>>
>>   /* TAP filtering */
>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>   /* Net device detach from fd. */
>>   static void tun_net_uninit(struct net_device *dev)
>>   {
>> -	struct tun_struct *tun = netdev_priv(dev);
>> -	struct tun_file *tfile = tun->tfile;
>> -
>> -	/* Inform the methods they need to stop using the dev.
>> -	 */
>> -	if (tfile) {
>> -		wake_up_all(&tfile->wq.wait);
>> -		if (atomic_dec_and_test(&tfile->count))
>> -			__tun_detach(tun);
>> -	}
>> +	tun_detach_all(dev);
>>   }
>>
>>   /* Net device open. */
>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>   /* Net device start xmit */
>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   {
>> -	struct tun_struct *tun = netdev_priv(dev);
>> -	struct tun_file *tfile = tun->tfile;
>> +	struct tun_file *tfile = NULL;
>>
>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>> +	rcu_read_lock();
>> +	tfile = tun_get_queue(dev, skb);
>>
>>   	/* Drop packet if interface is not attached */
>>   	if (!tfile)
>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>
>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>   	>= dev->tx_queue_len) {
>> -		if (!(tun->flags&  TUN_ONE_QUEUE)) {
>> +		if (!(tfile->flags&  TUN_ONE_QUEUE)&&
> Which patch moved flags from tun to tfile?

Patch 1 caches tun->flags in tfile, but it seems this may let the flags
get out of sync. So we'd better use the one in tun_struct.
>
>> +		    !(tfile->flags&  TUN_TAP_MQ)) {
>>   			/* Normal queueing mode. */
>>   			/* Packet scheduler handles dropping of further packets. */
>>   			netif_stop_queue(dev);
>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   			 * error is more appropriate. */
>>   			dev->stats.tx_fifo_errors++;
>>   		} else {
>> -			/* Single queue mode.
>> +			/* Single queue mode or multi queue mode.
>>   			 * Driver handles dropping of all packets itself. */
> Please don't do this. Stop the queue on overrun as appropriate.
> ONE_QUEUE is a legacy hack.
>
> BTW we really should stop queue before we start dropping packets,
> but that can be a separate patch.

The problem here is the use of NETIF_F_LLTX. The kernel only sees one
queue even for a multiqueue tun/tap, so if we called netif_stop_queue(),
all the other queues would be stopped as well.
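
With a real multiqueue netdev (no LLTX) this could become per-queue,
roughly (a sketch, assuming queue_index maps 1:1 onto a tx queue):

	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >=
	    dev->tx_queue_len)
		netif_stop_subqueue(dev, tfile->queue_index);

	/* and on the read side, once there is room again */
	netif_wake_subqueue(dev, tfile->queue_index);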
>>   			goto drop;
>>   		}
>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>   				   POLLRDNORM | POLLRDBAND);
>> +	rcu_read_unlock();
>>   	return NETDEV_TX_OK;
>>
>>   drop:
>> +	rcu_read_unlock();
>>   	dev->stats.tx_dropped++;
>>   	kfree_skb(skb);
>>   	return NETDEV_TX_OK;
>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>   {
>>   	struct tun_file *tfile = file->private_data;
>> -	struct tun_struct *tun = __tun_get(tfile);
>> +	struct tun_struct *tun = NULL;
>>   	struct sock *sk;
>>   	unsigned int mask = 0;
>>
>> -	if (!tun)
>> +	if (!tfile)
>>   		return POLLERR;
>>
>> -	sk = tfile->socket.sk;
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>> +		return POLLERR;
>> +	}
>> +	rcu_read_unlock();
>>
>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>> +	sk =&tfile->sk;
>>
>>   	poll_wait(file,&tfile->wq.wait, wait);
>>
>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>   	     sock_writeable(sk)))
>>   		mask |= POLLOUT | POLLWRNORM;
>>
>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>   		mask = POLLERR;
>> +	rcu_read_unlock();
>>
>> -	tun_put(tun);
>>   	return mask;
>>   }
>>
>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>   		skb_shinfo(skb)->gso_segs = 0;
>>   	}
>>
>> -	tun = __tun_get(tfile);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return -EBADFD;
>> +	}
>>
>>   	switch (tfile->flags&  TUN_TYPE_MASK) {
>>   	case TUN_TUN_DEV:
>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>   		break;
>>   	}
>> -
>> -	netif_rx_ni(skb);
>>   	tun->dev->stats.rx_packets++;
>>   	tun->dev->stats.rx_bytes += len;
>> -	tun_put(tun);
>> +	rcu_read_unlock();
>> +
>> +	netif_rx_ni(skb);
>> +
>>   	return count;
>>
>>   err_free:
>>   	count = -EINVAL;
>>   	kfree_skb(skb);
>>   err:
>> -	tun = __tun_get(tfile);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return -EBADFD;
>> +	}
>>
>>   	if (drop)
>>   		tun->dev->stats.rx_dropped++;
>>   	if (error)
>>   		tun->dev->stats.rx_frame_errors++;
>> -	tun_put(tun);
>> +	rcu_read_unlock();
>>   	return count;
>>   }
>>
>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>   	total += skb->len;
>>
>> -	tun = __tun_get(tfile);
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>>   	if (tun) {
>>   		tun->dev->stats.tx_packets++;
>>   		tun->dev->stats.tx_bytes += len;
>> -		tun_put(tun);
>>   	}
>> +	rcu_read_unlock();
>>
>>   	return total;
>>   }
>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>   				break;
>>   			}
>>
>> -			tun = __tun_get(tfile);
>> +			rcu_read_lock();
>> +			tun = rcu_dereference(tfile->tun);
>>   			if (!tun) {
>> -				ret = -EIO;
>> +				ret = -EBADFD;
> BADFD is for when you get passed something like -1 fd.
> Here fd is OK, it's just in a bad state so you can not do IO.
>

Sure.
>> +				rcu_read_unlock();
>>   				break;
>>   			}
>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>   				ret = -EIO;
>> -				tun_put(tun);
>> +				rcu_read_unlock();
>>   				break;
>>   			}
>> -			tun_put(tun);
>> +			rcu_read_unlock();
>>
>>   			/* Nothing to read, let's sleep */
>>   			schedule();
>>   			continue;
>>   		}
>>
>> -		tun = __tun_get(tfile);
>> +		rcu_read_lock();
>> +		tun = rcu_dereference(tfile->tun);
>>   		if (tun) {
>>   			netif_wake_queue(tun->dev);
>> -			tun_put(tun);
>>   		}
>> +		rcu_read_unlock();
>>
>>   		ret = tun_put_user(tfile, skb, iv, len);
>>   		kfree_skb(skb);
>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>   	if (tun->flags&  TUN_VNET_HDR)
>>   		flags |= IFF_VNET_HDR;
>>
>> +	if (tun->flags&  TUN_TAP_MQ)
>> +		flags |= IFF_MULTI_QUEUE;
>> +
>>   	return flags;
>>   }
>>
>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		err = tun_attach(tun, file);
>>   		if (err<  0)
>>   			return err;
>> -	}
>> -	else {
>> +	} else {
>>   		char *name;
>>   		unsigned long flags = 0;
>>
>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>   			TUN_USER_FEATURES;
>>   		dev->features = dev->hw_features;
>> +		if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
>> +			dev->features |= NETIF_F_LLTX;
>>
>>   		err = register_netdevice(tun->dev);
>>   		if (err<  0)
>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>
>>   		err = tun_attach(tun, file);
>>   		if (err<  0)
>> -			goto failed;
>> +			goto err_free_dev;
>>   	}
>>
>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   	else
>>   		tun->flags&= ~TUN_VNET_HDR;
>>
>> +	if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
>> +		tun->flags |= TUN_TAP_MQ;
>> +	else
>> +		tun->flags&= ~TUN_TAP_MQ;
>> +
>>   	/* Cache flags from tun device */
>>   	tfile->flags = tun->flags;
>>   	/* Make sure persistent devices do not get stuck in
>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>
>>   err_free_dev:
>>   	free_netdev(dev);
>> -failed:
>>   	return err;
>>   }
>>
>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   				(unsigned int __user*)argp);
>>   	}
>>
>> -	rtnl_lock();
>> -
>> -	tun = __tun_get(tfile);
>> -	if (cmd == TUNSETIFF&&  !tun) {
>> +	ret = 0;
>> +	if (cmd == TUNSETIFF) {
>> +		rtnl_lock();
>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>> -
>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>> -
>> +		rtnl_unlock();
>>   		if (ret)
>> -			goto unlock;
>> -
>> +			return ret;
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>> -			ret = -EFAULT;
>> -		goto unlock;
>> +			return -EFAULT;
>> +		return ret;
>>   	}
>>
>> +	rtnl_lock();
>> +
>> +	rcu_read_lock();
>> +
>>   	ret = -EBADFD;
>> +	tun = rcu_dereference(tfile->tun);
>>   	if (!tun)
>>   		goto unlock;
>> +	else
>> +		ret = 0;
>>
>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>> -
>> -	ret = 0;
>>   	switch (cmd) {
>>   	case TUNGETIFF:
>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>> +		rcu_read_unlock();
>>   		if (ret)
>> -			break;
>> +			goto out;
>>
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>   			ret = -EFAULT;
>> -		break;
>> +		goto out;
>>
>>   	case TUNSETNOCSUM:
>>   		/* Disable/Enable checksum */
>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   		/* Get hw address */
>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>> +		rcu_read_unlock();
>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>   			ret = -EFAULT;
>> -		break;
>> +		goto out;
>>
>>   	case SIOCSIFHWADDR:
>>   		/* Set hw address */
>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   	}
>>
>>   unlock:
>> +	rcu_read_unlock();
>> +out:
>>   	rtnl_unlock();
>> -	if (tun)
>> -		tun_put(tun);
>>   	return ret;
>>   }
>>
>> @@ -1517,6 +1624,11 @@ out:
>>   	return ret;
>>   }
>>
>> +static void tun_sock_destruct(struct sock *sk)
>> +{
>> +	skb_queue_purge(&sk->sk_receive_queue);
>> +}
>> +
>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>   {
>>   	struct net *net = current->nsproxy->net_ns;
>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>
>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>   	file->private_data = tfile;
>>
>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>   {
>>   	struct tun_file *tfile = file->private_data;
>> -	struct tun_struct *tun;
>> -
>> -	tun = __tun_get(tfile);
>> -	if (tun) {
>> -		struct net_device *dev = tun->dev;
>> -
>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>> -
>> -		__tun_detach(tun);
>> -
>> -		/* If desirable, unregister the netdevice. */
>> -		if (!(tun->flags&  TUN_PERSIST)) {
>> -			rtnl_lock();
>> -			if (dev->reg_state == NETREG_REGISTERED)
>> -				unregister_netdevice(dev);
>> -			rtnl_unlock();
>> -		}
>>
>> -		/* drop the reference that netdevice holds */
>> -		sock_put(&tfile->sk);
>> -
>> -	}
>> -
>> -	/* drop the reference that file holds */
>> -	sock_put(&tfile->sk);
>> +	tun_detach(tfile, true);
>>
>>   	return 0;
>>   }
>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>    * holding a reference to the file for as long as the socket is in use. */
>>   struct socket *tun_get_socket(struct file *file)
>>   {
>> -	struct tun_struct *tun;
>> +	struct tun_struct *tun = NULL;
>>   	struct tun_file *tfile = file->private_data;
>>   	if (file->f_op !=&tun_fops)
>>   		return ERR_PTR(-EINVAL);
>> -	tun = tun_get(file);
>> -	if (!tun)
>> +	rcu_read_lock();
>> +	tun = rcu_dereference(tfile->tun);
>> +	if (!tun) {
>> +		rcu_read_unlock();
>>   		return ERR_PTR(-EBADFD);
>> -	tun_put(tun);
>> +	}
>> +	rcu_read_unlock();
>>   	return&tfile->socket;
>>   }
>>   EXPORT_SYMBOL_GPL(tun_get_socket);


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 1/6] tuntap: move socket to tun_file
  2012-06-25  8:27   ` [net-next RFC V3 PATCH 1/6] tuntap: move socket to tun_file Michael S. Tsirkin
@ 2012-06-26  5:55     ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-26  5:55 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, linux-kernel, krkumar2, tahm, akong, davem, shemminger, mashirle

On 06/25/2012 04:27 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 25, 2012 at 02:09:45PM +0800, Jason Wang wrote:
>> This patch moves socket structure from tun_device and to tun_file in order to
>> let it possbile for multiple sockets to be attached to tun/tap device. The
>> reference between tap device and socket was setup during TUNSETIFF as
>> usual.
>>
>> After this patch, we can go further towards multiqueue tun/tap support by
>> storing an array of pointers of tun_file in tun_device.
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> I think this changes visible userspace
> behaviour for persistent devices.
>
> Specifically, with this patch, TUNSETSNDBUF and TUNATTACHFILTER won't
> be effective if you close and reopen the device, right?

Yes, good catch.
> It's possible that no application uses either of these
> ioctls on persistent tun devices at the moment,
> but seems safer to avoid changing such behaviour.

Agreed, I will modify the socket filter and sndbuf to be per-device
instead of per-socket.
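
Roughly (a sketch only; field names are mine): keep the settings in
tun_struct and re-apply them whenever a socket is attached, so they
survive a close/reopen of the fd:

	/* in tun_attach(), after tfile is linked to tun */
	tfile->socket.sk->sk_sndbuf = tun->sndbuf;
	if (tun->filter_attached)
		sk_attach_filter(&tun->fprog, tfile->socket.sk);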
>
>> ---
>>   drivers/net/tun.c |  352 +++++++++++++++++++++++++++--------------------------
>>   1 files changed, 181 insertions(+), 171 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index 987aeef..1f27789 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -108,9 +108,16 @@ struct tap_filter {
>>   };
>>
>>   struct tun_file {
>> +	struct sock sk;
>> +	struct socket socket;
>> +	struct socket_wq wq;
>> +	int vnet_hdr_sz;
>> +	struct tap_filter txflt;
>>   	atomic_t count;
>>   	struct tun_struct *tun;
>>   	struct net *net;
>> +	struct fasync_struct *fasync;
>> +	unsigned int flags;
>>   };
>>
>>   struct tun_sock;
>> @@ -125,29 +132,12 @@ struct tun_struct {
>>   	netdev_features_t	set_features;
>>   #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
>>   			  NETIF_F_TSO6|NETIF_F_UFO)
>> -	struct fasync_struct	*fasync;
>> -
>> -	struct tap_filter       txflt;
>> -	struct socket		socket;
>> -	struct socket_wq	wq;
>> -
>> -	int			vnet_hdr_sz;
>>
>>   #ifdef TUN_DEBUG
>>   	int debug;
>>   #endif
>>   };
>>
>> -struct tun_sock {
>> -	struct sock		sk;
>> -	struct tun_struct	*tun;
>> -};
>> -
>> -static inline struct tun_sock *tun_sk(struct sock *sk)
>> -{
>> -	return container_of(sk, struct tun_sock, sk);
>> -}
>> -
>>   static int tun_attach(struct tun_struct *tun, struct file *file)
>>   {
>>   	struct tun_file *tfile = file->private_data;
>> @@ -168,10 +158,9 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
>>   	err = 0;
>>   	tfile->tun = tun;
>>   	tun->tfile = tfile;
>> -	tun->socket.file = file;
>>   	netif_carrier_on(tun->dev);
>>   	dev_hold(tun->dev);
>> -	sock_hold(tun->socket.sk);
>> +	sock_hold(&tfile->sk);
>>   	atomic_inc(&tfile->count);
>>
>>   out:
>> @@ -181,15 +170,15 @@ out:
>>
>>   static void __tun_detach(struct tun_struct *tun)
>>   {
>> +	struct tun_file *tfile = tun->tfile;
>>   	/* Detach from net device */
>>   	netif_tx_lock_bh(tun->dev);
>>   	netif_carrier_off(tun->dev);
>>   	tun->tfile = NULL;
>> -	tun->socket.file = NULL;
>>   	netif_tx_unlock_bh(tun->dev);
>>
>>   	/* Drop read queue */
>> -	skb_queue_purge(&tun->socket.sk->sk_receive_queue);
>> +	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>
>>   	/* Drop the extra count on the net device */
>>   	dev_put(tun->dev);
>> @@ -348,19 +337,12 @@ static void tun_net_uninit(struct net_device *dev)
>>   	/* Inform the methods they need to stop using the dev.
>>   	 */
>>   	if (tfile) {
>> -		wake_up_all(&tun->wq.wait);
>> +		wake_up_all(&tfile->wq.wait);
>>   		if (atomic_dec_and_test(&tfile->count))
>>   			__tun_detach(tun);
>>   	}
>>   }
>>
>> -static void tun_free_netdev(struct net_device *dev)
>> -{
>> -	struct tun_struct *tun = netdev_priv(dev);
>> -
>> -	sk_release_kernel(tun->socket.sk);
>> -}
>> -
>>   /* Net device open. */
>>   static int tun_net_open(struct net_device *dev)
>>   {
>> @@ -379,24 +361,26 @@ static int tun_net_close(struct net_device *dev)
>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   {
>>   	struct tun_struct *tun = netdev_priv(dev);
>> +	struct tun_file *tfile = tun->tfile;
>>
>>   	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>
>>   	/* Drop packet if interface is not attached */
>> -	if (!tun->tfile)
>> +	if (!tfile)
>>   		goto drop;
>>
>>   	/* Drop if the filter does not like it.
>>   	 * This is a noop if the filter is disabled.
>>   	 * Filter can be enabled only for the TAP devices. */
>> -	if (!check_filter(&tun->txflt, skb))
>> +	if (!check_filter(&tfile->txflt, skb))
>>   		goto drop;
>>
>> -	if (tun->socket.sk->sk_filter&&
>> -	    sk_filter(tun->socket.sk, skb))
>> +	if (tfile->socket.sk->sk_filter&&
>> +	    sk_filter(tfile->socket.sk, skb))
>>   		goto drop;
>>
>> -	if (skb_queue_len(&tun->socket.sk->sk_receive_queue)>= dev->tx_queue_len) {
>> +	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>> +	>= dev->tx_queue_len) {
>>   		if (!(tun->flags&  TUN_ONE_QUEUE)) {
>>   			/* Normal queueing mode. */
>>   			/* Packet scheduler handles dropping of further packets. */
>> @@ -417,12 +401,12 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>   	skb_orphan(skb);
>>
>>   	/* Enqueue packet */
>> -	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
>> +	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
>>
>>   	/* Notify and wake up reader process */
>> -	if (tun->flags&  TUN_FASYNC)
>> -		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
>> -	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
>> +	if (tfile->flags&  TUN_FASYNC)
>> +		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>> +	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>   				   POLLRDNORM | POLLRDBAND);
>>   	return NETDEV_TX_OK;
>>
>> @@ -550,11 +534,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>   	if (!tun)
>>   		return POLLERR;
>>
>> -	sk = tun->socket.sk;
>> +	sk = tfile->socket.sk;
>>
>>   	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>
>> -	poll_wait(file,&tun->wq.wait, wait);
>> +	poll_wait(file,&tfile->wq.wait, wait);
>>
>>   	if (!skb_queue_empty(&sk->sk_receive_queue))
>>   		mask |= POLLIN | POLLRDNORM;
>> @@ -573,11 +557,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>
>>   /* prepad is the amount to reserve at front.  len is length after that.
>>    * linear is a hint as to how much to copy (usually headers). */
>> -static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
>> +static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
>>   				     size_t prepad, size_t len,
>>   				     size_t linear, int noblock)
>>   {
>> -	struct sock *sk = tun->socket.sk;
>> +	struct sock *sk = tfile->socket.sk;
>>   	struct sk_buff *skb;
>>   	int err;
>>
>> @@ -601,7 +585,7 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
>>   }
>>
>>   /* Get packet from user space buffer */
>> -static ssize_t tun_get_user(struct tun_struct *tun,
>> +static ssize_t tun_get_user(struct tun_file *tfile,
>>   			    const struct iovec *iv, size_t count,
>>   			    int noblock)
>>   {
>> @@ -610,8 +594,10 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>   	size_t len = count, align = NET_SKB_PAD;
>>   	struct virtio_net_hdr gso = { 0 };
>>   	int offset = 0;
>> +	struct tun_struct *tun = NULL;
>> +	bool drop = false, error = false;
>>
>> -	if (!(tun->flags&  TUN_NO_PI)) {
>> +	if (!(tfile->flags&  TUN_NO_PI)) {
>>   		if ((len -= sizeof(pi))>  count)
>>   			return -EINVAL;
>>
>> @@ -620,8 +606,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>   		offset += sizeof(pi);
>>   	}
>>
>> -	if (tun->flags&  TUN_VNET_HDR) {
>> -		if ((len -= tun->vnet_hdr_sz)>  count)
>> +	if (tfile->flags&  TUN_VNET_HDR) {
>> +		len -= tfile->vnet_hdr_sz;
>> +		if (len>  count)
>>   			return -EINVAL;
>>
>>   		if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
>> @@ -633,41 +620,43 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>
>>   		if (gso.hdr_len>  len)
>>   			return -EINVAL;
>> -		offset += tun->vnet_hdr_sz;
>> +		offset += tfile->vnet_hdr_sz;
>>   	}
>>
>> -	if ((tun->flags&  TUN_TYPE_MASK) == TUN_TAP_DEV) {
>> +	if ((tfile->flags&  TUN_TYPE_MASK) == TUN_TAP_DEV) {
>>   		align += NET_IP_ALIGN;
>>   		if (unlikely(len<  ETH_HLEN ||
>>   			     (gso.hdr_len&&  gso.hdr_len<  ETH_HLEN)))
>>   			return -EINVAL;
>>   	}
>>
>> -	skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
>> +	skb = tun_alloc_skb(tfile, align, len, gso.hdr_len, noblock);
>> +
>>   	if (IS_ERR(skb)) {
>>   		if (PTR_ERR(skb) != -EAGAIN)
>> -			tun->dev->stats.rx_dropped++;
>> -		return PTR_ERR(skb);
>> +			drop = true;
>> +		count = PTR_ERR(skb);
>> +		goto err;
>>   	}
>>
>>   	if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) {
>> -		tun->dev->stats.rx_dropped++;
>> +		drop = true;
>>   		kfree_skb(skb);
>> -		return -EFAULT;
>> +		count = -EFAULT;
>> +		goto err;
>>   	}
>>
>>   	if (gso.flags&  VIRTIO_NET_HDR_F_NEEDS_CSUM) {
>>   		if (!skb_partial_csum_set(skb, gso.csum_start,
>>   					  gso.csum_offset)) {
>> -			tun->dev->stats.rx_frame_errors++;
>> -			kfree_skb(skb);
>> -			return -EINVAL;
>> +			error = true;
>> +			goto err_free;
>>   		}
>>   	}
>>
>> -	switch (tun->flags&  TUN_TYPE_MASK) {
>> +	switch (tfile->flags&  TUN_TYPE_MASK) {
>>   	case TUN_TUN_DEV:
>> -		if (tun->flags&  TUN_NO_PI) {
>> +		if (tfile->flags&  TUN_NO_PI) {
>>   			switch (skb->data[0]&  0xf0) {
>>   			case 0x40:
>>   				pi.proto = htons(ETH_P_IP);
>> @@ -676,18 +665,15 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>   				pi.proto = htons(ETH_P_IPV6);
>>   				break;
>>   			default:
>> -				tun->dev->stats.rx_dropped++;
>> -				kfree_skb(skb);
>> -				return -EINVAL;
>> +				drop = true;
>> +				goto err_free;
>>   			}
>>   		}
>>
>>   		skb_reset_mac_header(skb);
>>   		skb->protocol = pi.proto;
>> -		skb->dev = tun->dev;
>>   		break;
>>   	case TUN_TAP_DEV:
>> -		skb->protocol = eth_type_trans(skb, tun->dev);
>>   		break;
>>   	}
>>
>> @@ -704,9 +690,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>   			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
>>   			break;
>>   		default:
>> -			tun->dev->stats.rx_frame_errors++;
>> -			kfree_skb(skb);
>> -			return -EINVAL;
>> +			error = true;
>> +			goto err_free;
>>   		}
>>
>>   		if (gso.gso_type&  VIRTIO_NET_HDR_GSO_ECN)
>> @@ -714,9 +699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>
>>   		skb_shinfo(skb)->gso_size = gso.gso_size;
>>   		if (skb_shinfo(skb)->gso_size == 0) {
>> -			tun->dev->stats.rx_frame_errors++;
>> -			kfree_skb(skb);
>> -			return -EINVAL;
>> +			error = true;
>> +			goto err_free;
>>   		}
>>
>>   		/* Header must be checked, and gso_segs computed. */
>> @@ -724,11 +708,38 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>   		skb_shinfo(skb)->gso_segs = 0;
>>   	}
>>
>> -	netif_rx_ni(skb);
>> +	tun = __tun_get(tfile);
>> +	if (!tun)
>> +		return -EBADFD;
>>
>> +	switch (tfile->flags&  TUN_TYPE_MASK) {
>> +	case TUN_TUN_DEV:
>> +		skb->dev = tun->dev;
>> +		break;
>> +	case TUN_TAP_DEV:
>> +		skb->protocol = eth_type_trans(skb, tun->dev);
>> +		break;
>> +	}
>> +
>> +	netif_rx_ni(skb);
>>   	tun->dev->stats.rx_packets++;
>>   	tun->dev->stats.rx_bytes += len;
>> +	tun_put(tun);
>> +	return count;
>> +
>> +err_free:
>> +	count = -EINVAL;
>> +	kfree_skb(skb);
>> +err:
>> +	tun = __tun_get(tfile);
>> +	if (!tun)
>> +		return -EBADFD;
>>
>> +	if (drop)
>> +		tun->dev->stats.rx_dropped++;
>> +	if (error)
>> +		tun->dev->stats.rx_frame_errors++;
>> +	tun_put(tun);
>>   	return count;
>>   }
>>
>> @@ -736,30 +747,25 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
>>   			      unsigned long count, loff_t pos)
>>   {
>>   	struct file *file = iocb->ki_filp;
>> -	struct tun_struct *tun = tun_get(file);
>> +	struct tun_file *tfile = file->private_data;
>>   	ssize_t result;
>>
>> -	if (!tun)
>> -		return -EBADFD;
>> -
>> -	tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
>> -
>> -	result = tun_get_user(tun, iv, iov_length(iv, count),
>> +	result = tun_get_user(tfile, iv, iov_length(iv, count),
>>   			      file->f_flags&  O_NONBLOCK);
>>
>> -	tun_put(tun);
>>   	return result;
>>   }
>>
>>   /* Put packet to the user space buffer */
>> -static ssize_t tun_put_user(struct tun_struct *tun,
>> +static ssize_t tun_put_user(struct tun_file *tfile,
>>   			    struct sk_buff *skb,
>>   			    const struct iovec *iv, int len)
>>   {
>> +	struct tun_struct *tun = NULL;
>>   	struct tun_pi pi = { 0, skb->protocol };
>>   	ssize_t total = 0;
>>
>> -	if (!(tun->flags&  TUN_NO_PI)) {
>> +	if (!(tfile->flags&  TUN_NO_PI)) {
>>   		if ((len -= sizeof(pi))<  0)
>>   			return -EINVAL;
>>
>> @@ -773,9 +779,10 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>   		total += sizeof(pi);
>>   	}
>>
>> -	if (tun->flags&  TUN_VNET_HDR) {
>> +	if (tfile->flags&  TUN_VNET_HDR) {
>>   		struct virtio_net_hdr gso = { 0 }; /* no info leak */
>> -		if ((len -= tun->vnet_hdr_sz)<  0)
>> +		len -= tfile->vnet_hdr_sz;
>> +		if (len<  0)
>>   			return -EINVAL;
>>
>>   		if (skb_is_gso(skb)) {
>> @@ -818,7 +825,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>   		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
>>   					       sizeof(gso))))
>>   			return -EFAULT;
>> -		total += tun->vnet_hdr_sz;
>> +		total += tfile->vnet_hdr_sz;
>>   	}
>>
>>   	len = min_t(int, skb->len, len);
>> @@ -826,29 +833,33 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>   	total += skb->len;
>>
>> -	tun->dev->stats.tx_packets++;
>> -	tun->dev->stats.tx_bytes += len;
>> +	tun = __tun_get(tfile);
>> +	if (tun) {
>> +		tun->dev->stats.tx_packets++;
>> +		tun->dev->stats.tx_bytes += len;
>> +		tun_put(tun);
>> +	}
>>
>>   	return total;
>>   }
>>
>> -static ssize_t tun_do_read(struct tun_struct *tun,
>> +static ssize_t tun_do_read(struct tun_file *tfile,
>>   			   struct kiocb *iocb, const struct iovec *iv,
>>   			   ssize_t len, int noblock)
>>   {
>>   	DECLARE_WAITQUEUE(wait, current);
>>   	struct sk_buff *skb;
>>   	ssize_t ret = 0;
>> -
>> -	tun_debug(KERN_INFO, tun, "tun_chr_read\n");
>> +	struct tun_struct *tun = NULL;
>>
>>   	if (unlikely(!noblock))
>> -		add_wait_queue(&tun->wq.wait,&wait);
>> +		add_wait_queue(&tfile->wq.wait,&wait);
>>   	while (len) {
>>   		current->state = TASK_INTERRUPTIBLE;
>>
>> +		skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue);
>>   		/* Read frames from the queue */
>> -		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
>> +		if (!skb) {
>>   			if (noblock) {
>>   				ret = -EAGAIN;
>>   				break;
>> @@ -857,25 +868,38 @@ static ssize_t tun_do_read(struct tun_struct *tun,
>>   				ret = -ERESTARTSYS;
>>   				break;
>>   			}
>> +
>> +			tun = __tun_get(tfile);
>> +			if (!tun) {
>> +				ret = -EIO;
>> +				break;
>> +			}
>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>   				ret = -EIO;
>> +				tun_put(tun);
>>   				break;
>>   			}
>> +			tun_put(tun);
>>
>>   			/* Nothing to read, let's sleep */
>>   			schedule();
>>   			continue;
>>   		}
>> -		netif_wake_queue(tun->dev);
>>
>> -		ret = tun_put_user(tun, skb, iv, len);
>> +		tun = __tun_get(tfile);
>> +		if (tun) {
>> +			netif_wake_queue(tun->dev);
>> +			tun_put(tun);
>> +		}
>> +
>> +		ret = tun_put_user(tfile, skb, iv, len);
>>   		kfree_skb(skb);
>>   		break;
>>   	}
>>
>>   	current->state = TASK_RUNNING;
>>   	if (unlikely(!noblock))
>> -		remove_wait_queue(&tun->wq.wait,&wait);
>> +		remove_wait_queue(&tfile->wq.wait,&wait);
>>
>>   	return ret;
>>   }
>> @@ -885,21 +909,17 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
>>   {
>>   	struct file *file = iocb->ki_filp;
>>   	struct tun_file *tfile = file->private_data;
>> -	struct tun_struct *tun = __tun_get(tfile);
>>   	ssize_t len, ret;
>>
>> -	if (!tun)
>> -		return -EBADFD;
>>   	len = iov_length(iv, count);
>>   	if (len<  0) {
>>   		ret = -EINVAL;
>>   		goto out;
>>   	}
>>
>> -	ret = tun_do_read(tun, iocb, iv, len, file->f_flags&  O_NONBLOCK);
>> +	ret = tun_do_read(tfile, iocb, iv, len, file->f_flags&  O_NONBLOCK);
>>   	ret = min_t(ssize_t, ret, len);
>>   out:
>> -	tun_put(tun);
>>   	return ret;
>>   }
>>
>> @@ -911,7 +931,7 @@ static void tun_setup(struct net_device *dev)
>>   	tun->group = -1;
>>
>>   	dev->ethtool_ops =&tun_ethtool_ops;
>> -	dev->destructor = tun_free_netdev;
>> +	dev->destructor = free_netdev;
>>   }
>>
>>   /* Trivial set of netlink ops to allow deleting tun or tap
>> @@ -931,7 +951,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
>>
>>   static void tun_sock_write_space(struct sock *sk)
>>   {
>> -	struct tun_struct *tun;
>> +	struct tun_file *tfile = NULL;
>>   	wait_queue_head_t *wqueue;
>>
>>   	if (!sock_writeable(sk))
>> @@ -945,37 +965,38 @@ static void tun_sock_write_space(struct sock *sk)
>>   		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
>>   						POLLWRNORM | POLLWRBAND);
>>
>> -	tun = tun_sk(sk)->tun;
>> -	kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
>> -}
>> -
>> -static void tun_sock_destruct(struct sock *sk)
>> -{
>> -	free_netdev(tun_sk(sk)->tun->dev);
>> +	tfile = container_of(sk, struct tun_file, sk);
>> +	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>>   }
>>
>>   static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
>>   		       struct msghdr *m, size_t total_len)
>>   {
>> -	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
>> -	return tun_get_user(tun, m->msg_iov, total_len,
>> -			    m->msg_flags&  MSG_DONTWAIT);
>> +	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>> +	ssize_t result;
>> +
>> +	result = tun_get_user(tfile, m->msg_iov, total_len,
>> +			      m->msg_flags&  MSG_DONTWAIT);
>> +	return result;
>>   }
>>
>>   static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
>>   		       struct msghdr *m, size_t total_len,
>>   		       int flags)
>>   {
>> -	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
>> +	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>>   	int ret;
>> +
>>   	if (flags&  ~(MSG_DONTWAIT|MSG_TRUNC))
>>   		return -EINVAL;
>> -	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
>> +
>> +	ret = tun_do_read(tfile, iocb, m->msg_iov, total_len,
>>   			  flags&  MSG_DONTWAIT);
>>   	if (ret>  total_len) {
>>   		m->msg_flags |= MSG_TRUNC;
>>   		ret = flags&  MSG_TRUNC ? ret : total_len;
>>   	}
>> +
>>   	return ret;
>>   }
>>
>> @@ -996,7 +1017,7 @@ static const struct proto_ops tun_socket_ops = {
>>   static struct proto tun_proto = {
>>   	.name		= "tun",
>>   	.owner		= THIS_MODULE,
>> -	.obj_size	= sizeof(struct tun_sock),
>> +	.obj_size	= sizeof(struct tun_file),
>>   };
>>
>>   static int tun_flags(struct tun_struct *tun)
>> @@ -1047,8 +1068,8 @@ static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
>>
>>   static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   {
>> -	struct sock *sk;
>>   	struct tun_struct *tun;
>> +	struct tun_file *tfile = file->private_data;
>>   	struct net_device *dev;
>>   	int err;
>>
>> @@ -1069,7 +1090,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		     (tun->group != -1&&  !in_egroup_p(tun->group)))&&
>>   		!capable(CAP_NET_ADMIN))
>>   			return -EPERM;
>> -		err = security_tun_dev_attach(tun->socket.sk);
>> +		err = security_tun_dev_attach(tfile->socket.sk);
>>   		if (err<  0)
>>   			return err;
>>
>> @@ -1113,25 +1134,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   		tun = netdev_priv(dev);
>>   		tun->dev = dev;
>>   		tun->flags = flags;
>> -		tun->txflt.count = 0;
>> -		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
>>
>> -		err = -ENOMEM;
>> -		sk = sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,&tun_proto);
>> -		if (!sk)
>> -			goto err_free_dev;
>> -
>> -		sk_change_net(sk, net);
>> -		tun->socket.wq =&tun->wq;
>> -		init_waitqueue_head(&tun->wq.wait);
>> -		tun->socket.ops =&tun_socket_ops;
>> -		sock_init_data(&tun->socket, sk);
>> -		sk->sk_write_space = tun_sock_write_space;
>> -		sk->sk_sndbuf = INT_MAX;
>> -
>> -		tun_sk(sk)->tun = tun;
>> -
>> -		security_tun_dev_post_create(sk);
>> +		security_tun_dev_post_create(&tfile->sk);
>>
>>   		tun_net_init(dev);
>>
>> @@ -1141,15 +1145,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>
>>   		err = register_netdevice(tun->dev);
>>   		if (err<  0)
>> -			goto err_free_sk;
>> +			goto err_free_dev;
>>
>>   		if (device_create_file(&tun->dev->dev,&dev_attr_tun_flags) ||
>>   		    device_create_file(&tun->dev->dev,&dev_attr_owner) ||
>>   		    device_create_file(&tun->dev->dev,&dev_attr_group))
>>   			pr_err("Failed to create tun sysfs files\n");
>>
>> -		sk->sk_destruct = tun_sock_destruct;
>> -
>>   		err = tun_attach(tun, file);
>>   		if (err<  0)
>>   			goto failed;
>> @@ -1172,6 +1174,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   	else
>>   		tun->flags&= ~TUN_VNET_HDR;
>>
>> +	/* Cache flags from tun device */
>> +	tfile->flags = tun->flags;
>>   	/* Make sure persistent devices do not get stuck in
>>   	 * xoff state.
>>   	 */
>> @@ -1181,11 +1185,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>   	strcpy(ifr->ifr_name, tun->dev->name);
>>   	return 0;
>>
>> - err_free_sk:
>> -	tun_free_netdev(dev);
>> - err_free_dev:
>> +err_free_dev:
>>   	free_netdev(dev);
>> - failed:
>> +failed:
>>   	return err;
>>   }
>>
>> @@ -1357,9 +1359,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   	case TUNSETTXFILTER:
>>   		/* Can be set only for TAPs */
>>   		ret = -EINVAL;
>> -		if ((tun->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>> +		if ((tfile->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>>   			break;
>> -		ret = update_filter(&tun->txflt, (void __user *)arg);
>> +		ret = update_filter(&tfile->txflt, (void __user *)arg);
>>   		break;
>>
>>   	case SIOCGIFHWADDR:
>> @@ -1379,7 +1381,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   		break;
>>
>>   	case TUNGETSNDBUF:
>> -		sndbuf = tun->socket.sk->sk_sndbuf;
>> +		sndbuf = tfile->socket.sk->sk_sndbuf;
>>   		if (copy_to_user(argp,&sndbuf, sizeof(sndbuf)))
>>   			ret = -EFAULT;
>>   		break;
>> @@ -1390,11 +1392,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   			break;
>>   		}
>>
>> -		tun->socket.sk->sk_sndbuf = sndbuf;
>> +		tfile->socket.sk->sk_sndbuf = sndbuf;
>>   		break;
>>
>>   	case TUNGETVNETHDRSZ:
>> -		vnet_hdr_sz = tun->vnet_hdr_sz;
>> +		vnet_hdr_sz = tfile->vnet_hdr_sz;
>>   		if (copy_to_user(argp,&vnet_hdr_sz, sizeof(vnet_hdr_sz)))
>>   			ret = -EFAULT;
>>   		break;
>> @@ -1409,27 +1411,27 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>   			break;
>>   		}
>>
>> -		tun->vnet_hdr_sz = vnet_hdr_sz;
>> +		tfile->vnet_hdr_sz = vnet_hdr_sz;
>>   		break;
>>
>>   	case TUNATTACHFILTER:
>>   		/* Can be set only for TAPs */
>>   		ret = -EINVAL;
>> -		if ((tun->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>> +		if ((tfile->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>>   			break;
>>   		ret = -EFAULT;
>>   		if (copy_from_user(&fprog, argp, sizeof(fprog)))
>>   			break;
>>
>> -		ret = sk_attach_filter(&fprog, tun->socket.sk);
>> +		ret = sk_attach_filter(&fprog, tfile->socket.sk);
>>   		break;
>>
>>   	case TUNDETACHFILTER:
>>   		/* Can be set only for TAPs */
>>   		ret = -EINVAL;
>> -		if ((tun->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>> +		if ((tfile->flags&  TUN_TYPE_MASK) != TUN_TAP_DEV)
>>   			break;
>> -		ret = sk_detach_filter(tun->socket.sk);
>> +		ret = sk_detach_filter(tfile->socket.sk);
>>   		break;
>>
>>   	default:
>> @@ -1481,43 +1483,50 @@ static long tun_chr_compat_ioctl(struct file *file,
>>
>>   static int tun_chr_fasync(int fd, struct file *file, int on)
>>   {
>> -	struct tun_struct *tun = tun_get(file);
>> -	int ret;
>> -
>> -	if (!tun)
>> -		return -EBADFD;
>> -
>> -	tun_debug(KERN_INFO, tun, "tun_chr_fasync %d\n", on);
>> +	struct tun_file *tfile = file->private_data;
>> +	int ret = fasync_helper(fd, file, on,&tfile->fasync);
>>
>> -	if ((ret = fasync_helper(fd, file, on,&tun->fasync))<  0)
>> +	if (ret<  0)
>>   		goto out;
>>
>>   	if (on) {
>>   		ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
>>   		if (ret)
>>   			goto out;
>> -		tun->flags |= TUN_FASYNC;
>> +		tfile->flags |= TUN_FASYNC;
>>   	} else
>> -		tun->flags&= ~TUN_FASYNC;
>> +		tfile->flags&= ~TUN_FASYNC;
>>   	ret = 0;
>>   out:
>> -	tun_put(tun);
>>   	return ret;
>>   }
>>
>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>   {
>> +	struct net *net = current->nsproxy->net_ns;
>>   	struct tun_file *tfile;
>>
>>   	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
>>
>> -	tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
>> +	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
>> +					&tun_proto);
>>   	if (!tfile)
>>   		return -ENOMEM;
>> -	atomic_set(&tfile->count, 0);
>> +
>>   	tfile->tun = NULL;
>> -	tfile->net = get_net(current->nsproxy->net_ns);
>> +	tfile->net = net;
>> +	tfile->txflt.count = 0;
>> +	tfile->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
>> +	tfile->socket.wq =&tfile->wq;
>> +	init_waitqueue_head(&tfile->wq.wait);
>> +	tfile->socket.file = file;
>> +	tfile->socket.ops =&tun_socket_ops;
>> +	sock_init_data(&tfile->socket,&tfile->sk);
>> +
>> +	tfile->sk.sk_write_space = tun_sock_write_space;
>> +	tfile->sk.sk_sndbuf = INT_MAX;
>>   	file->private_data = tfile;
>> +
>>   	return 0;
>>   }
>>
>> @@ -1541,14 +1550,14 @@ static int tun_chr_close(struct inode *inode, struct file *file)
>>   				unregister_netdevice(dev);
>>   			rtnl_unlock();
>>   		}
>> -	}
>>
>> -	tun = tfile->tun;
>> -	if (tun)
>> -		sock_put(tun->socket.sk);
>> +		/* drop the reference that netdevice holds */
>> +		sock_put(&tfile->sk);
>>
>> -	put_net(tfile->net);
>> -	kfree(tfile);
>> +	}
>> +
>> +	/* drop the reference that file holds */
>> +	sock_put(&tfile->sk);
>>
>>   	return 0;
>>   }
>> @@ -1676,13 +1685,14 @@ static void tun_cleanup(void)
>>   struct socket *tun_get_socket(struct file *file)
>>   {
>>   	struct tun_struct *tun;
>> +	struct tun_file *tfile = file->private_data;
>>   	if (file->f_op !=&tun_fops)
>>   		return ERR_PTR(-EINVAL);
>>   	tun = tun_get(file);
>>   	if (!tun)
>>   		return ERR_PTR(-EBADFD);
>>   	tun_put(tun);
>> -	return&tun->socket;
>> +	return&tfile->socket;
>>   }
>>   EXPORT_SYMBOL_GPL(tun_get_socket);
>>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] tuntap: per queue 64 bit stats
  2012-06-25 12:52   ` Eric Dumazet
@ 2012-06-26  6:00     ` Jason Wang
  2012-06-26  6:10       ` Eric Dumazet
  2012-06-26 19:46       ` [PATCH 5/6] tuntap: per queue 64 bit stats Michael S. Tsirkin
  0 siblings, 2 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-26  6:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2, shemminger,
	edumazet

On 06/25/2012 08:52 PM, Eric Dumazet wrote:
> On Mon, 2012-06-25 at 19:59 +0800, Jason Wang wrote:
>> As we've added multiqueue support for tun/tap, this patch converts the statistics
>> to use per-queue 64 bit statistics.
> LLTX means you can have several cpus calling TX path in parallel.
>
> So tx stats are wrong (even before this patch), and racy after this
> patch (if several cpus access the same queue, it seems to be possible)
>
>         u64_stats_update_begin(&tfile->stats.tx_syncp);
>         tfile->stats.tx_packets++;
>         tfile->stats.tx_bytes += total;
>         u64_stats_update_end(&tfile->stats.tx_syncp);
>
> This can break horribly if several cpus run this code using the same 'tfile'
> pointer.
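(For context: u64_stats_update_begin() is just a seqcount writer, and
u64_stats_sync assumes a single writer per syncp. With two concurrent
writers the sequence can be left even while the counters are
half-updated, so a 32-bit reader may see torn values.)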

Yes, it looks like it's hard to use NETIF_F_LLTX without breaking the u64
statistics; it may be worth switching to a tx lock and alloc_netdev_mq().

> I suggest this patch comes before 'tuntap: multiqueue support' in the
> series.

Sure, thanks.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] tuntap: per queue 64 bit stats
  2012-06-26  6:00     ` Jason Wang
@ 2012-06-26  6:10       ` Eric Dumazet
  2012-06-26  6:28         ` Jason Wang
  2012-06-26 19:46       ` [PATCH 5/6] tuntap: per queue 64 bit stats Michael S. Tsirkin
  1 sibling, 1 reply; 28+ messages in thread
From: Eric Dumazet @ 2012-06-26  6:10 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2, shemminger,
	edumazet

On Tue, 2012-06-26 at 14:00 +0800, Jason Wang wrote:

> Yes, it looks like it's hard to use NETIF_F_LLTX without breaking the u64
> statistics; it may be worth switching to a tx lock and alloc_netdev_mq().

Yes, this probably needs percpu storage (if you really want to use 
include/linux/u64_stats_sync.h).

But percpu storage seems a bit overkill with a rising number of cpus
on typical machines.

For the loopback device it's fine, because we only have one lo device per
network namespace, and some workloads really hit this device hard.

But for tuntap, I am not sure?
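For reference, a minimal sketch of the percpu pattern (needs
<linux/percpu.h> and <linux/u64_stats_sync.h>; the struct and function
names here are illustrative, not from any patch):

struct tun_pcpu_stats {
	u64 tx_packets;
	u64 tx_bytes;
	struct u64_stats_sync syncp;
};

/* writer: each cpu owns its own counters, so there is no writer/writer
 * race even with LLTX; callers must not migrate cpus mid-update, e.g.
 * run with bottom halves disabled as in the xmit path */
static void tun_stats_add_tx(struct tun_pcpu_stats __percpu *stats, u64 len)
{
	struct tun_pcpu_stats *s = this_cpu_ptr(stats);

	u64_stats_update_begin(&s->syncp);
	s->tx_packets++;
	s->tx_bytes += len;
	u64_stats_update_end(&s->syncp);
}

/* reader: per-cpu retry loop, then sum */
static void tun_stats_read_tx(struct tun_pcpu_stats __percpu *stats,
			      u64 *packets, u64 *bytes)
{
	int cpu;

	*packets = *bytes = 0;
	for_each_possible_cpu(cpu) {
		const struct tun_pcpu_stats *s = per_cpu_ptr(stats, cpu);
		unsigned int start;
		u64 p, b;

		do {
			start = u64_stats_fetch_begin(&s->syncp);
			p = s->tx_packets;
			b = s->tx_bytes;
		} while (u64_stats_fetch_retry(&s->syncp, start));
		*packets += p;
		*bytes += b;
	}
}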




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] tuntap: per queue 64 bit stats
  2012-06-26  6:10       ` Eric Dumazet
@ 2012-06-26  6:28         ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-26  6:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: mst, akong, habanero, tahm, haixiao, jwhan, ernesto.martin,
	mashirle, davem, netdev, linux-kernel, krkumar2, shemminger,
	edumazet

On 06/26/2012 02:10 PM, Eric Dumazet wrote:
> On Tue, 2012-06-26 at 14:00 +0800, Jason Wang wrote:
>
>> Yes, it looks like it's hard to use NETIF_F_LLTX without breaking the u64
>> statistics; it may be worth switching to a tx lock and alloc_netdev_mq().
> Yes, this probably needs percpu storage (if you really want to use
> include/linux/u64_stats_sync.h).
>
> But percpu storage seems a bit overkill with a rising number of cpus
> on typical machines.
>
> For the loopback device it's fine, because we only have one lo device per
> network namespace, and some workloads really hit this device hard.
>
> But for tuntap, I am not sure?
>

The problem is that we want to collect per-queue statistics. So if we
convert tuntap to use alloc_netdev_mq(), the tx statistics would be
updated under the tx lock, which looks safe.
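Something like this, roughly (untested; the stats fields are
illustrative, and it assumes the device is created with
alloc_netdev_mq() and NETIF_F_LLTX is dropped, so the core serializes
ndo_start_xmit per queue via the _xmit_lock; overrun handling and RCU
dereferencing are omitted):

static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	/* queue chosen by ndo_select_queue, recorded in the skb */
	struct tun_file *tfile = tun->tfiles[skb_get_queue_mapping(skb)];
	unsigned int len = skb->len;

	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);

	/* serialized by this queue's _xmit_lock, so a plain per-queue
	 * u64_stats_sync is enough */
	u64_stats_update_begin(&tfile->stats.tx_syncp);
	tfile->stats.tx_packets++;
	tfile->stats.tx_bytes += len;
	u64_stats_update_end(&tfile->stats.tx_syncp);

	wake_up_interruptible_poll(&tfile->wq.wait,
				   POLLIN | POLLRDNORM | POLLRDBAND);
	return NETDEV_TX_OK;
}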



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-26  3:42     ` Jason Wang
@ 2012-06-26 10:42       ` Michael S. Tsirkin
  2012-06-27  5:16         ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2012-06-26 10:42 UTC (permalink / raw)
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> >On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> >>This patch adds multiqueue support for tap device. This is done by abstracting
> >>each queue as a file/socket and allowing multiple sockets to be attached to the
> >>tuntap device (an array of tun_file were stored in the tun_struct). Userspace
> >>could write and read from those files to do the parallel packet
> >>sending/receiving.
> >>
> >>Unlike the previous single queue implementation, the socket and device were
> >>loosely coupled, each of them were allowed to go away first. In order to let the
> >>tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
> >>synchronize between data path and system call.
> >Don't use LLTX/RCU. It's not worth it.
> >Use something like netif_set_real_num_tx_queues.
> >
> >>The tx queue selecting is first based on the recorded rxq index of an skb; if
> >>there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
> >>
> >>Signed-off-by: Jason Wang<jasowang@redhat.com>
> >Interestingly macvtap switched to hashing first:
> >ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> >(the commit log is corrupted but see what it
> >does in the patch).
> >Any idea why?
> 
> Yes, so tap should be changed to behave the same as macvtap. I remember
> the reason we did that was to make sure the packets of a single flow
> are queued to a fixed socket/virtqueue. 10g cards like ixgbe choose
> the rx queue for a flow based on the last tx queue where the packets
> of that flow came from, so if we used the recorded rx queue in
> macvtap, the queue index of a flow would change as the vhost thread
> moves among processors.

Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might land
on VCPU1 in the guest, which is not good, right?

> But while testing tun/tap, one interesting thing I found is that even
> though ixgbe has recorded the queue index during rx, it seems to be
> lost when tap tries to transmit skbs to userspace.

dev_pick_tx does this I think but ndo_select_queue
should be able to get it without trouble.
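E.g. an untested sketch of such an ndo_select_queue (this just mirrors
the tun_get_queue() logic from the patch, nothing here is final):

static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	struct tun_struct *tun = netdev_priv(dev);
	u32 numqueues = tun->numqueues;
	u32 txq;

	if (numqueues <= 1)
		return 0;

	if (skb_rx_queue_recorded(skb)) {
		/* borrow the queue the flow arrived on */
		txq = skb_get_rx_queue(skb);
		while (unlikely(txq >= numqueues))
			txq -= numqueues;
		return txq;
	}

	/* otherwise hash the flow and scale the u32 hash into
	 * [0, numqueues) */
	txq = skb_get_rxhash(skb);
	return txq ? ((u64)txq * numqueues) >> 32 : 0;
}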


> >>---
> >>  drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
> >>  1 files changed, 232 insertions(+), 139 deletions(-)
> >>
> >>diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>index 8233b0a..5c26757 100644
> >>--- a/drivers/net/tun.c
> >>+++ b/drivers/net/tun.c
> >>@@ -107,6 +107,8 @@ struct tap_filter {
> >>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
> >>  };
> >>
> >>+#define MAX_TAP_QUEUES (NR_CPUS<  16 ? NR_CPUS : 16)
> >Why the limit? I am guessing you copied this from macvtap?
> >This is problematic for a number of reasons:
> >	- will not play well with migration
> >	- will not work well for a large guest
> >
> >Yes, macvtap needs to be fixed too.
> >
> >I am guessing what it is trying to prevent is queueing
> >up a huge number of packets?
> >So just divide the default tx queue limit by the # of queues.
> >
> >And by the way, for MQ applications maybe we can finally
> >ignore tx queue altogether and limit the total number
> >of bytes queued?
> >To avoid regressions we can make it large like 64M/# queues.
> >Could be a separate patch I think, and for a single queue
> >might need a compatible mode though I am not sure.
> >
> >>+
> >>  struct tun_file {
> >>  	struct sock sk;
> >>  	struct socket socket;
> >>@@ -114,16 +116,18 @@ struct tun_file {
> >>  	int vnet_hdr_sz;
> >>  	struct tap_filter txflt;
> >>  	atomic_t count;
> >>-	struct tun_struct *tun;
> >>+	struct tun_struct __rcu *tun;
> >>  	struct net *net;
> >>  	struct fasync_struct *fasync;
> >>  	unsigned int flags;
> >>+	u16 queue_index;
> >>  };
> >>
> >>  struct tun_sock;
> >>
> >>  struct tun_struct {
> >>-	struct tun_file		*tfile;
> >>+	struct tun_file		*tfiles[MAX_TAP_QUEUES];
> >>+	unsigned int            numqueues;
> >>  	unsigned int 		flags;
> >>  	uid_t			owner;
> >>  	gid_t			group;
> >>@@ -138,80 +142,159 @@ struct tun_struct {
> >>  #endif
> >>  };
> >>
> >>-static int tun_attach(struct tun_struct *tun, struct file *file)
> >>+static DEFINE_SPINLOCK(tun_lock);
> >>+
> >>+/*
> >>+ * tun_get_queue(): calculate the queue index
> >>+ *     - if skbs comes from mq nics, we can just borrow
> >>+ *     - if not, calculate from the hash
> >>+ */
> >>+static struct tun_file *tun_get_queue(struct net_device *dev,
> >>+				      struct sk_buff *skb)
> >>  {
> >>-	struct tun_file *tfile = file->private_data;
> >>-	int err;
> >>+	struct tun_struct *tun = netdev_priv(dev);
> >>+	struct tun_file *tfile = NULL;
> >>+	int numqueues = tun->numqueues;
> >>+	__u32 rxq;
> >>
> >>-	ASSERT_RTNL();
> >>+	BUG_ON(!rcu_read_lock_held());
> >>
> >>-	netif_tx_lock_bh(tun->dev);
> >>+	if (!numqueues)
> >>+		goto out;
> >>
> >>-	err = -EINVAL;
> >>-	if (tfile->tun)
> >>+	if (numqueues == 1) {
> >>+		tfile = rcu_dereference(tun->tfiles[0]);
> >Instead of hacks like this, you can ask for an MQ
> >flag to be set in SETIFF. Then you won't need to
> >handle attach/detach at random times.
> >And most of the scary num_queues checks can go away.
> >You can then also ask userspace about the max # of queues
> >to expect if you want to save some memory.
> >
> >
> >>  		goto out;
> >>+	}
> >>
> >>-	err = -EBUSY;
> >>-	if (tun->tfile)
> >>+	if (likely(skb_rx_queue_recorded(skb))) {
> >>+		rxq = skb_get_rx_queue(skb);
> >>+
> >>+		while (unlikely(rxq>= numqueues))
> >>+			rxq -= numqueues;
> >>+
> >>+		tfile = rcu_dereference(tun->tfiles[rxq]);
> >>  		goto out;
> >>+	}
> >>
> >>-	err = 0;
> >>-	tfile->tun = tun;
> >>-	tun->tfile = tfile;
> >>-	netif_carrier_on(tun->dev);
> >>-	dev_hold(tun->dev);
> >>-	sock_hold(&tfile->sk);
> >>-	atomic_inc(&tfile->count);
> >>+	/* Check if we can use flow to select a queue */
> >>+	rxq = skb_get_rxhash(skb);
> >>+	if (rxq) {
> >>+		u32 idx = ((u64)rxq * numqueues)>>  32;
> >This completely confuses me. What's the logic here?
> >How do we even know it's in range?
> >
> >>+		tfile = rcu_dereference(tun->tfiles[idx]);
> >>+		goto out;
> >>+	}
> >>
> >>+	tfile = rcu_dereference(tun->tfiles[0]);
> >>  out:
> >>-	netif_tx_unlock_bh(tun->dev);
> >>-	return err;
> >>+	return tfile;
> >>  }
> >>
> >>-static void __tun_detach(struct tun_struct *tun)
> >>+static int tun_detach(struct tun_file *tfile, bool clean)
> >>  {
> >>-	struct tun_file *tfile = tun->tfile;
> >>-	/* Detach from net device */
> >>-	netif_tx_lock_bh(tun->dev);
> >>-	netif_carrier_off(tun->dev);
> >>-	tun->tfile = NULL;
> >>-	netif_tx_unlock_bh(tun->dev);
> >>-
> >>-	/* Drop read queue */
> >>-	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
> >>-
> >>-	/* Drop the extra count on the net device */
> >>-	dev_put(tun->dev);
> >>-}
> >>+	struct tun_struct *tun;
> >>+	struct net_device *dev = NULL;
> >>+	bool destroy = false;
> >>
> >>-static void tun_detach(struct tun_struct *tun)
> >>-{
> >>-	rtnl_lock();
> >>-	__tun_detach(tun);
> >>-	rtnl_unlock();
> >>-}
> >>+	spin_lock(&tun_lock);
> >>
> >>-static struct tun_struct *__tun_get(struct tun_file *tfile)
> >>-{
> >>-	struct tun_struct *tun = NULL;
> >>+	tun = rcu_dereference_protected(tfile->tun,
> >>+					lockdep_is_held(&tun_lock));
> >>+	if (tun) {
> >>+		u16 index = tfile->queue_index;
> >>+		BUG_ON(index>= tun->numqueues);
> >>+		dev = tun->dev;
> >>+
> >>+		rcu_assign_pointer(tun->tfiles[index],
> >>+				   tun->tfiles[tun->numqueues - 1]);
> >>+		tun->tfiles[index]->queue_index = index;
> >>+		rcu_assign_pointer(tfile->tun, NULL);
> >>+		--tun->numqueues;
> >>+		sock_put(&tfile->sk);
> >>
> >>-	if (atomic_inc_not_zero(&tfile->count))
> >>-		tun = tfile->tun;
> >>+		if (tun->numqueues == 0&&  !(tun->flags&  TUN_PERSIST))
> >>+			destroy = true;
> >Please don't use flags like that. Use dedicated labels and goto there on error.
> >
> >
> >>+	}
> >>
> >>-	return tun;
> >>+	spin_unlock(&tun_lock);
> >>+
> >>+	synchronize_rcu();
> >>+	if (clean)
> >>+		sock_put(&tfile->sk);
> >>+
> >>+	if (destroy) {
> >>+		rtnl_lock();
> >>+		if (dev->reg_state == NETREG_REGISTERED)
> >>+			unregister_netdevice(dev);
> >>+		rtnl_unlock();
> >>+	}
> >>+
> >>+	return 0;
> >>  }
> >>
> >>-static struct tun_struct *tun_get(struct file *file)
> >>+static void tun_detach_all(struct net_device *dev)
> >>  {
> >>-	return __tun_get(file->private_data);
> >>+	struct tun_struct *tun = netdev_priv(dev);
> >>+	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> >>+	int i, j = 0;
> >>+
> >>+	spin_lock(&tun_lock);
> >>+
> >>+	for (i = 0; i<  MAX_TAP_QUEUES&&  tun->numqueues; i++) {
> >>+		tfile = rcu_dereference_protected(tun->tfiles[i],
> >>+						lockdep_is_held(&tun_lock));
> >>+		BUG_ON(!tfile);
> >>+		wake_up_all(&tfile->wq.wait);
> >>+		tfile_list[j++] = tfile;
> >>+		rcu_assign_pointer(tfile->tun, NULL);
> >>+		--tun->numqueues;
> >>+	}
> >>+	BUG_ON(tun->numqueues != 0);
> >>+	/* guarantee that any future tun_attach will fail */
> >>+	tun->numqueues = MAX_TAP_QUEUES;
> >>+	spin_unlock(&tun_lock);
> >>+
> >>+	synchronize_rcu();
> >>+	for (--j; j>= 0; j--)
> >>+		sock_put(&tfile_list[j]->sk);
> >>  }
> >>
> >>-static void tun_put(struct tun_struct *tun)
> >>+static int tun_attach(struct tun_struct *tun, struct file *file)
> >>  {
> >>-	struct tun_file *tfile = tun->tfile;
> >>+	struct tun_file *tfile = file->private_data;
> >>+	int err;
> >>+
> >>+	ASSERT_RTNL();
> >>+
> >>+	spin_lock(&tun_lock);
> >>
> >>-	if (atomic_dec_and_test(&tfile->count))
> >>-		tun_detach(tfile->tun);
> >>+	err = -EINVAL;
> >>+	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
> >>+		goto out;
> >>+
> >>+	err = -EBUSY;
> >>+	if (!(tun->flags&  TUN_TAP_MQ)&&  tun->numqueues == 1)
> >>+		goto out;
> >>+
> >>+	if (tun->numqueues == MAX_TAP_QUEUES)
> >>+		goto out;
> >>+
> >>+	err = 0;
> >>+	tfile->queue_index = tun->numqueues;
> >>+	rcu_assign_pointer(tfile->tun, tun);
> >>+	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >>+	sock_hold(&tfile->sk);
> >>+	tun->numqueues++;
> >>+
> >>+	if (tun->numqueues == 1)
> >>+		netif_carrier_on(tun->dev);
> >>+
> >>+	/* device is allowed to go away first, so no need to hold extra
> >>+	 * refcnt. */
> >>+
> >>+out:
> >>+	spin_unlock(&tun_lock);
> >>+	return err;
> >>  }
> >>
> >>  /* TAP filtering */
> >>@@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
> >>  /* Net device detach from fd. */
> >>  static void tun_net_uninit(struct net_device *dev)
> >>  {
> >>-	struct tun_struct *tun = netdev_priv(dev);
> >>-	struct tun_file *tfile = tun->tfile;
> >>-
> >>-	/* Inform the methods they need to stop using the dev.
> >>-	 */
> >>-	if (tfile) {
> >>-		wake_up_all(&tfile->wq.wait);
> >>-		if (atomic_dec_and_test(&tfile->count))
> >>-			__tun_detach(tun);
> >>-	}
> >>+	tun_detach_all(dev);
> >>  }
> >>
> >>  /* Net device open. */
> >>@@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
> >>  /* Net device start xmit */
> >>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  {
> >>-	struct tun_struct *tun = netdev_priv(dev);
> >>-	struct tun_file *tfile = tun->tfile;
> >>+	struct tun_file *tfile = NULL;
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
> >>+	rcu_read_lock();
> >>+	tfile = tun_get_queue(dev, skb);
> >>
> >>  	/* Drop packet if interface is not attached */
> >>  	if (!tfile)
> >>@@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>
> >>  	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
> >>  	>= dev->tx_queue_len) {
> >>-		if (!(tun->flags&  TUN_ONE_QUEUE)) {
> >>+		if (!(tfile->flags&  TUN_ONE_QUEUE)&&
> >Which patch moved flags from tun to tfile?
> >
> >>+		    !(tfile->flags&  TUN_TAP_MQ)) {
> >>  			/* Normal queueing mode. */
> >>  			/* Packet scheduler handles dropping of further packets. */
> >>  			netif_stop_queue(dev);
> >>@@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  			 * error is more appropriate. */
> >>  			dev->stats.tx_fifo_errors++;
> >>  		} else {
> >>-			/* Single queue mode.
> >>+			/* Single queue mode or multi queue mode.
> >>  			 * Driver handles dropping of all packets itself. */
> >Please don't do this. Stop the queue on overrun as appropriate.
> >ONE_QUEUE is a legacy hack.
> >
> >BTW we really should stop queue before we start dropping packets,
> >but that can be a separate patch.
> >
> >>  			goto drop;
> >>  		}
> >>@@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> >>  	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
> >>  				   POLLRDNORM | POLLRDBAND);
> >>+	rcu_read_unlock();
> >>  	return NETDEV_TX_OK;
> >>
> >>  drop:
> >>+	rcu_read_unlock();
> >>  	dev->stats.tx_dropped++;
> >>  	kfree_skb(skb);
> >>  	return NETDEV_TX_OK;
> >>@@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
> >>  static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>  {
> >>  	struct tun_file *tfile = file->private_data;
> >>-	struct tun_struct *tun = __tun_get(tfile);
> >>+	struct tun_struct *tun = NULL;
> >>  	struct sock *sk;
> >>  	unsigned int mask = 0;
> >>
> >>-	if (!tun)
> >>+	if (!tfile)
> >>  		return POLLERR;
> >>
> >>-	sk = tfile->socket.sk;
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>+		return POLLERR;
> >>+	}
> >>+	rcu_read_unlock();
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
> >>+	sk =&tfile->sk;
> >>
> >>  	poll_wait(file,&tfile->wq.wait, wait);
> >>
> >>@@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>  	     sock_writeable(sk)))
> >>  		mask |= POLLOUT | POLLWRNORM;
> >>
> >>-	if (tun->dev->reg_state != NETREG_REGISTERED)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
> >>  		mask = POLLERR;
> >>+	rcu_read_unlock();
> >>
> >>-	tun_put(tun);
> >>  	return mask;
> >>  }
> >>
> >>@@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>  		skb_shinfo(skb)->gso_segs = 0;
> >>  	}
> >>
> >>-	tun = __tun_get(tfile);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return -EBADFD;
> >>+	}
> >>
> >>  	switch (tfile->flags&  TUN_TYPE_MASK) {
> >>  	case TUN_TUN_DEV:
> >>@@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>  		skb->protocol = eth_type_trans(skb, tun->dev);
> >>  		break;
> >>  	}
> >>-
> >>-	netif_rx_ni(skb);
> >>  	tun->dev->stats.rx_packets++;
> >>  	tun->dev->stats.rx_bytes += len;
> >>-	tun_put(tun);
> >>+	rcu_read_unlock();
> >>+
> >>+	netif_rx_ni(skb);
> >>+
> >>  	return count;
> >>
> >>  err_free:
> >>  	count = -EINVAL;
> >>  	kfree_skb(skb);
> >>  err:
> >>-	tun = __tun_get(tfile);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return -EBADFD;
> >>+	}
> >>
> >>  	if (drop)
> >>  		tun->dev->stats.rx_dropped++;
> >>  	if (error)
> >>  		tun->dev->stats.rx_frame_errors++;
> >>-	tun_put(tun);
> >>+	rcu_read_unlock();
> >>  	return count;
> >>  }
> >>
> >>@@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
> >>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> >>  	total += skb->len;
> >>
> >>-	tun = __tun_get(tfile);
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>  	if (tun) {
> >>  		tun->dev->stats.tx_packets++;
> >>  		tun->dev->stats.tx_bytes += len;
> >>-		tun_put(tun);
> >>  	}
> >>+	rcu_read_unlock();
> >>
> >>  	return total;
> >>  }
> >>@@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
> >>  				break;
> >>  			}
> >>
> >>-			tun = __tun_get(tfile);
> >>+			rcu_read_lock();
> >>+			tun = rcu_dereference(tfile->tun);
> >>  			if (!tun) {
> >>-				ret = -EIO;
> >>+				ret = -EBADFD;
> >BADFD is for when you get passed something like -1 fd.
> >Here fd is OK, it's just in a bad state so you can not do IO.
> >
> >
> >>+				rcu_read_unlock();
> >>  				break;
> >>  			}
> >>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
> >>  				ret = -EIO;
> >>-				tun_put(tun);
> >>+				rcu_read_unlock();
> >>  				break;
> >>  			}
> >>-			tun_put(tun);
> >>+			rcu_read_unlock();
> >>
> >>  			/* Nothing to read, let's sleep */
> >>  			schedule();
> >>  			continue;
> >>  		}
> >>
> >>-		tun = __tun_get(tfile);
> >>+		rcu_read_lock();
> >>+		tun = rcu_dereference(tfile->tun);
> >>  		if (tun) {
> >>  			netif_wake_queue(tun->dev);
> >>-			tun_put(tun);
> >>  		}
> >>+		rcu_read_unlock();
> >>
> >>  		ret = tun_put_user(tfile, skb, iv, len);
> >>  		kfree_skb(skb);
> >>@@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
> >>  	if (tun->flags&  TUN_VNET_HDR)
> >>  		flags |= IFF_VNET_HDR;
> >>
> >>+	if (tun->flags&  TUN_TAP_MQ)
> >>+		flags |= IFF_MULTI_QUEUE;
> >>+
> >>  	return flags;
> >>  }
> >>
> >>@@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  		err = tun_attach(tun, file);
> >>  		if (err<  0)
> >>  			return err;
> >>-	}
> >>-	else {
> >>+	} else {
> >>  		char *name;
> >>  		unsigned long flags = 0;
> >>
> >>@@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> >>  			TUN_USER_FEATURES;
> >>  		dev->features = dev->hw_features;
> >>+		if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
> >>+			dev->features |= NETIF_F_LLTX;
> >>
> >>  		err = register_netdevice(tun->dev);
> >>  		if (err<  0)
> >>@@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>
> >>  		err = tun_attach(tun, file);
> >>  		if (err<  0)
> >>-			goto failed;
> >>+			goto err_free_dev;
> >>  	}
> >>
> >>  	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
> >>@@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  	else
> >>  		tun->flags&= ~TUN_VNET_HDR;
> >>
> >>+	if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
> >>+		tun->flags |= TUN_TAP_MQ;
> >>+	else
> >>+		tun->flags&= ~TUN_TAP_MQ;
> >>+
> >>  	/* Cache flags from tun device */
> >>  	tfile->flags = tun->flags;
> >>  	/* Make sure persistent devices do not get stuck in
> >>@@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>
> >>  err_free_dev:
> >>  	free_netdev(dev);
> >>-failed:
> >>  	return err;
> >>  }
> >>
> >>@@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  				(unsigned int __user*)argp);
> >>  	}
> >>
> >>-	rtnl_lock();
> >>-
> >>-	tun = __tun_get(tfile);
> >>-	if (cmd == TUNSETIFF&&  !tun) {
> >>+	ret = 0;
> >>+	if (cmd == TUNSETIFF) {
> >>+		rtnl_lock();
> >>  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> >>-
> >>  		ret = tun_set_iff(tfile->net, file,&ifr);
> >>-
> >>+		rtnl_unlock();
> >>  		if (ret)
> >>-			goto unlock;
> >>-
> >>+			return ret;
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>-			ret = -EFAULT;
> >>-		goto unlock;
> >>+			return -EFAULT;
> >>+		return ret;
> >>  	}
> >>
> >>+	rtnl_lock();
> >>+
> >>+	rcu_read_lock();
> >>+
> >>  	ret = -EBADFD;
> >>+	tun = rcu_dereference(tfile->tun);
> >>  	if (!tun)
> >>  		goto unlock;
> >>+	else
> >>+		ret = 0;
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
> >>-
> >>-	ret = 0;
> >>  	switch (cmd) {
> >>  	case TUNGETIFF:
> >>  		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
> >>+		rcu_read_unlock();
> >>  		if (ret)
> >>-			break;
> >>+			goto out;
> >>
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>  			ret = -EFAULT;
> >>-		break;
> >>+		goto out;
> >>
> >>  	case TUNSETNOCSUM:
> >>  		/* Disable/Enable checksum */
> >>@@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  		/* Get hw address */
> >>  		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
> >>  		ifr.ifr_hwaddr.sa_family = tun->dev->type;
> >>+		rcu_read_unlock();
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>  			ret = -EFAULT;
> >>-		break;
> >>+		goto out;
> >>
> >>  	case SIOCSIFHWADDR:
> >>  		/* Set hw address */
> >>@@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  	}
> >>
> >>  unlock:
> >>+	rcu_read_unlock();
> >>+out:
> >>  	rtnl_unlock();
> >>-	if (tun)
> >>-		tun_put(tun);
> >>  	return ret;
> >>  }
> >>
> >>@@ -1517,6 +1624,11 @@ out:
> >>  	return ret;
> >>  }
> >>
> >>+static void tun_sock_destruct(struct sock *sk)
> >>+{
> >>+	skb_queue_purge(&sk->sk_receive_queue);
> >>+}
> >>+
> >>  static int tun_chr_open(struct inode *inode, struct file * file)
> >>  {
> >>  	struct net *net = current->nsproxy->net_ns;
> >>@@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>  	sock_init_data(&tfile->socket,&tfile->sk);
> >>
> >>  	tfile->sk.sk_write_space = tun_sock_write_space;
> >>+	tfile->sk.sk_destruct = tun_sock_destruct;
> >>  	tfile->sk.sk_sndbuf = INT_MAX;
> >>  	file->private_data = tfile;
> >>
> >>@@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>  static int tun_chr_close(struct inode *inode, struct file *file)
> >>  {
> >>  	struct tun_file *tfile = file->private_data;
> >>-	struct tun_struct *tun;
> >>-
> >>-	tun = __tun_get(tfile);
> >>-	if (tun) {
> >>-		struct net_device *dev = tun->dev;
> >>-
> >>-		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
> >>-
> >>-		__tun_detach(tun);
> >>-
> >>-		/* If desirable, unregister the netdevice. */
> >>-		if (!(tun->flags&  TUN_PERSIST)) {
> >>-			rtnl_lock();
> >>-			if (dev->reg_state == NETREG_REGISTERED)
> >>-				unregister_netdevice(dev);
> >>-			rtnl_unlock();
> >>-		}
> >>
> >>-		/* drop the reference that netdevice holds */
> >>-		sock_put(&tfile->sk);
> >>-
> >>-	}
> >>-
> >>-	/* drop the reference that file holds */
> >>-	sock_put(&tfile->sk);
> >>+	tun_detach(tfile, true);
> >>
> >>  	return 0;
> >>  }
> >>@@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
> >>   * holding a reference to the file for as long as the socket is in use. */
> >>  struct socket *tun_get_socket(struct file *file)
> >>  {
> >>-	struct tun_struct *tun;
> >>+	struct tun_struct *tun = NULL;
> >>  	struct tun_file *tfile = file->private_data;
> >>  	if (file->f_op !=&tun_fops)
> >>  		return ERR_PTR(-EINVAL);
> >>-	tun = tun_get(file);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return ERR_PTR(-EBADFD);
> >>-	tun_put(tun);
> >>+	}
> >>+	rcu_read_unlock();
> >>  	return&tfile->socket;
> >>  }
> >>  EXPORT_SYMBOL_GPL(tun_get_socket);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-26  5:52     ` Jason Wang
@ 2012-06-26 11:54       ` Michael S. Tsirkin
  2012-06-27  5:59         ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2012-06-26 11:54 UTC (permalink / raw)
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet

On Tue, Jun 26, 2012 at 01:52:57PM +0800, Jason Wang wrote:
> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> >On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> >>This patch adds multiqueue support for tap device. This is done by abstracting
> >>each queue as a file/socket and allowing multiple sockets to be attached to the
> >>tuntap device (an array of tun_file were stored in the tun_struct). Userspace
> >>could write and read from those files to do the parallel packet
> >>sending/receiving.
> >>
> >>Unlike the previous single queue implementation, the socket and device were
> >>loosely coupled, each of them were allowed to go away first. In order to let the
> >>tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
> >>synchronize between data path and system call.
> >Don't use LLTX/RCU. It's not worth it.
> >Use something like netif_set_real_num_tx_queues.
> >
> 
> For LLTX, maybe it's better to convert it to alloc_netdev_mq() to
> let the kernel see all queues and make queue stopping and
> per-queue stats easier.
> RCU is used to handle attaching/detaching while tun/tap is
> sending and receiving packets, which looks reasonable to me.

Yes but do we have to allow this? How about we always ask
userspace to attach to all active queues?

> Not
> sure netif_set_real_num_tx_queues() can help in this situation.

Check it out.

> >>The tx queue selecting is first based on the recorded rxq index of an skb; if
> >>there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
> >>
> >>Signed-off-by: Jason Wang<jasowang@redhat.com>
> >Interestingly macvtap switched to hashing first:
> >ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> >(the commit log is corrupted but see what it
> >does in the patch).
> >Any idea why?
> >
> >>---
> >>  drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
> >>  1 files changed, 232 insertions(+), 139 deletions(-)
> >>
> >>diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>index 8233b0a..5c26757 100644
> >>--- a/drivers/net/tun.c
> >>+++ b/drivers/net/tun.c
> >>@@ -107,6 +107,8 @@ struct tap_filter {
> >>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
> >>  };
> >>
> >>+#define MAX_TAP_QUEUES (NR_CPUS<  16 ? NR_CPUS : 16)
> >Why the limit? I am guessing you copied this from macvtap?
> >This is problematic for a number of reasons:
> >	- will not play well with migration
> >	- will not work well for a large guest
> >
> >Yes, macvtap needs to be fixed too.
> >
> >I am guessing what it is trying to prevent is queueing
> >up a huge number of packets?
> >So just divide the default tx queue limit by the # of queues.
> 
> Not sure,
> other reasons I can guess:
> - to prevent storing a large array of pointers in tun_struct or macvlan_dev.

OK so with the limit of e.g. 1024 we'd allocate at most
2 pages of memory. This doesn't look too bad. 1024 is probably a
high enough limit: modern hypervisors seem to support on the order
of 100-200 CPUs so this leaves us some breathing space
if we want to match a queue per guest CPU.
Of course we need to limit the packets per queue
in such a setup more aggressively. 1000 packets * 1000 queues
* 64K per packet is too much.
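(That works out to roughly 64GB of skb memory pinned in the worst case.)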

> - it may not be suitable to allow the number of virtqueues to be
> greater than the number of physical queues in the card

Maybe for macvtap, here we have no idea which card we
are working with and how many queues it has.

> >
> >And by the way, for MQ applications maybe we can finally
> >ignore tx queue altogether and limit the total number
> >of bytes queued?
> >To avoid regressions we can make it large like 64M/# queues.
> >Could be a separate patch I think, and for a single queue
> >might need a compatible mode though I am not sure.
> 
> Could you explain more about this?
> Did you mean to have a total
> sndbuf for all sockets attached to tun/tap?

Consider that we currently limit the # of
packets queued at tun for xmit to userspace.
Some limit is needed but # of packets sounds
very silly - limiting the total memory
might be more reasonable.

In case of multiqueue, we really care about
total # of packets or total memory, but a simple
approximation could be to divide the allocation
between active queues equally.

qdisc also queues some packets; that logic uses
# of packets anyway. So either make that
1000/# queues, or even set it to 0 as Eric once
suggested.
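A rough sketch of the bytes-based version (the constant and the
charging scheme are made up for illustration; it assumes queued skbs
are charged to the tfile socket, e.g. via skb_set_owner_r()):

#define TUN_QUEUED_BYTES_TOTAL	(64 << 20)	/* 64M overall budget */

/* in tun_net_xmit(), instead of comparing skb_queue_len() against
 * dev->tx_queue_len */
static bool tun_queue_full(struct tun_struct *tun, struct tun_file *tfile)
{
	int budget = TUN_QUEUED_BYTES_TOTAL / tun->numqueues;

	return atomic_read(&tfile->socket.sk->sk_rmem_alloc) >= budget;
}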

> >>+
> >>  struct tun_file {
> >>  	struct sock sk;
> >>  	struct socket socket;
> >>@@ -114,16 +116,18 @@ struct tun_file {
> >>  	int vnet_hdr_sz;
> >>  	struct tap_filter txflt;
> >>  	atomic_t count;
> >>-	struct tun_struct *tun;
> >>+	struct tun_struct __rcu *tun;
> >>  	struct net *net;
> >>  	struct fasync_struct *fasync;
> >>  	unsigned int flags;
> >>+	u16 queue_index;
> >>  };
> >>
> >>  struct tun_sock;
> >>
> >>  struct tun_struct {
> >>-	struct tun_file		*tfile;
> >>+	struct tun_file		*tfiles[MAX_TAP_QUEUES];
> >>+	unsigned int            numqueues;
> >>  	unsigned int 		flags;
> >>  	uid_t			owner;
> >>  	gid_t			group;
> >>@@ -138,80 +142,159 @@ struct tun_struct {
> >>  #endif
> >>  };
> >>
> >>-static int tun_attach(struct tun_struct *tun, struct file *file)
> >>+static DEFINE_SPINLOCK(tun_lock);
> >>+
> >>+/*
> >>+ * tun_get_queue(): calculate the queue index
> >>+ *     - if skbs comes from mq nics, we can just borrow
> >>+ *     - if not, calculate from the hash
> >>+ */
> >>+static struct tun_file *tun_get_queue(struct net_device *dev,
> >>+				      struct sk_buff *skb)
> >>  {
> >>-	struct tun_file *tfile = file->private_data;
> >>-	int err;
> >>+	struct tun_struct *tun = netdev_priv(dev);
> >>+	struct tun_file *tfile = NULL;
> >>+	int numqueues = tun->numqueues;
> >>+	__u32 rxq;
> >>
> >>-	ASSERT_RTNL();
> >>+	BUG_ON(!rcu_read_lock_held());
> >>
> >>-	netif_tx_lock_bh(tun->dev);
> >>+	if (!numqueues)
> >>+		goto out;
> >>
> >>-	err = -EINVAL;
> >>-	if (tfile->tun)
> >>+	if (numqueues == 1) {
> >>+		tfile = rcu_dereference(tun->tfiles[0]);
> >Instead of hacks like this, you can ask for an MQ
> >flag to be set in SETIFF. Then you won't need to
> >handle attach/detach at random times.
> 
> Consider a user switching between an sq guest and an mq guest: qemu
> would attach or detach the fd, which the kernel cannot anticipate.

Can't userspace always keep it attached, and just deactivate MQ?

> >And most of the scary num_queues checks can go away.
> 
> Even if we have an MQ flag, userspace could still attach just one
> queue to the device.

I think we allow too much flexibility if we let
userspace detach a random queue.
Maybe only allow attaching/detaching with MQ off?
If userspace wants to attach/detach, clear MQ first?
Alternatively, attach/detach all queues in one ioctl?

> >You can then also ask userspace about the max # of queues
> >to expect if you want to save some memory.
> >
> 
> Yes, good suggestion.
> >>  		goto out;
> >>+	}
> >>
> >>-	err = -EBUSY;
> >>-	if (tun->tfile)
> >>+	if (likely(skb_rx_queue_recorded(skb))) {
> >>+		rxq = skb_get_rx_queue(skb);
> >>+
> >>+		while (unlikely(rxq>= numqueues))
> >>+			rxq -= numqueues;
> >>+
> >>+		tfile = rcu_dereference(tun->tfiles[rxq]);
> >>  		goto out;
> >>+	}
> >>
> >>-	err = 0;
> >>-	tfile->tun = tun;
> >>-	tun->tfile = tfile;
> >>-	netif_carrier_on(tun->dev);
> >>-	dev_hold(tun->dev);
> >>-	sock_hold(&tfile->sk);
> >>-	atomic_inc(&tfile->count);
> >>+	/* Check if we can use flow to select a queue */
> >>+	rxq = skb_get_rxhash(skb);
> >>+	if (rxq) {
> >>+		u32 idx = ((u64)rxq * numqueues)>>  32;
> >This completely confuses me. What's the logic here?
> >How do we even know it's in range?
> >
> 
> rxq is a u32, so the result should be less than numqueues.

Aha. So the point is to use multiply+shift instead of %?
Please add a comment.
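For the record, the identity: for a u32 hash h and queue count n,
((u64)h * n) >> 32 is floor(h * n / 2^32), and since h <= 2^32 - 1 the
result is always < n. So the comment could be something like:

/* Scale a u32 flow hash into [0, numqueues) without a modulo:
 * (u64)h * n is at most (2^32 - 1) * n, so the high 32 bits (the
 * value left after the shift) are strictly less than n. */
static inline u32 tun_hash_to_queue(u32 h, u32 n)
{
	return ((u64)h * n) >> 32;
}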


> >>+		tfile = rcu_dereference(tun->tfiles[idx]);
> >>+		goto out;
> >>+	}
> >>
> >>+	tfile = rcu_dereference(tun->tfiles[0]);
> >>  out:
> >>-	netif_tx_unlock_bh(tun->dev);
> >>-	return err;
> >>+	return tfile;
> >>  }
> >>
> >>-static void __tun_detach(struct tun_struct *tun)
> >>+static int tun_detach(struct tun_file *tfile, bool clean)
> >>  {
> >>-	struct tun_file *tfile = tun->tfile;
> >>-	/* Detach from net device */
> >>-	netif_tx_lock_bh(tun->dev);
> >>-	netif_carrier_off(tun->dev);
> >>-	tun->tfile = NULL;
> >>-	netif_tx_unlock_bh(tun->dev);
> >>-
> >>-	/* Drop read queue */
> >>-	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
> >>-
> >>-	/* Drop the extra count on the net device */
> >>-	dev_put(tun->dev);
> >>-}
> >>+	struct tun_struct *tun;
> >>+	struct net_device *dev = NULL;
> >>+	bool destroy = false;
> >>
> >>-static void tun_detach(struct tun_struct *tun)
> >>-{
> >>-	rtnl_lock();
> >>-	__tun_detach(tun);
> >>-	rtnl_unlock();
> >>-}
> >>+	spin_lock(&tun_lock);
> >>
> >>-static struct tun_struct *__tun_get(struct tun_file *tfile)
> >>-{
> >>-	struct tun_struct *tun = NULL;
> >>+	tun = rcu_dereference_protected(tfile->tun,
> >>+					lockdep_is_held(&tun_lock));
> >>+	if (tun) {
> >>+		u16 index = tfile->queue_index;
> >>+		BUG_ON(index>= tun->numqueues);
> >>+		dev = tun->dev;
> >>+
> >>+		rcu_assign_pointer(tun->tfiles[index],
> >>+				   tun->tfiles[tun->numqueues - 1]);
> >>+		tun->tfiles[index]->queue_index = index;
> >>+		rcu_assign_pointer(tfile->tun, NULL);
> >>+		--tun->numqueues;
> >>+		sock_put(&tfile->sk);
> >>
> >>-	if (atomic_inc_not_zero(&tfile->count))
> >>-		tun = tfile->tun;
> >>+		if (tun->numqueues == 0&&  !(tun->flags&  TUN_PERSIST))
> >>+			destroy = true;
> >Please don't use flags like that. Use dedicated labels and goto there on error.
> 
> ok.
> >
> >>+	}
> >>
> >>-	return tun;
> >>+	spin_unlock(&tun_lock);
> >>+
> >>+	synchronize_rcu();
> >>+	if (clean)
> >>+		sock_put(&tfile->sk);
> >>+
> >>+	if (destroy) {
> >>+		rtnl_lock();
> >>+		if (dev->reg_state == NETREG_REGISTERED)
> >>+			unregister_netdevice(dev);
> >>+		rtnl_unlock();
> >>+	}
> >>+
> >>+	return 0;
> >>  }
> >>
> >>-static struct tun_struct *tun_get(struct file *file)
> >>+static void tun_detach_all(struct net_device *dev)
> >>  {
> >>-	return __tun_get(file->private_data);
> >>+	struct tun_struct *tun = netdev_priv(dev);
> >>+	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> >>+	int i, j = 0;
> >>+
> >>+	spin_lock(&tun_lock);
> >>+
> >>+	for (i = 0; i<  MAX_TAP_QUEUES&&  tun->numqueues; i++) {
> >>+		tfile = rcu_dereference_protected(tun->tfiles[i],
> >>+						lockdep_is_held(&tun_lock));
> >>+		BUG_ON(!tfile);
> >>+		wake_up_all(&tfile->wq.wait);
> >>+		tfile_list[j++] = tfile;
> >>+		rcu_assign_pointer(tfile->tun, NULL);
> >>+		--tun->numqueues;
> >>+	}
> >>+	BUG_ON(tun->numqueues != 0);
> >>+	/* guarantee that any future tun_attach will fail */
> >>+	tun->numqueues = MAX_TAP_QUEUES;
> >>+	spin_unlock(&tun_lock);
> >>+
> >>+	synchronize_rcu();
> >>+	for (--j; j>= 0; j--)
> >>+		sock_put(&tfile_list[j]->sk);
> >>  }
> >>
> >>-static void tun_put(struct tun_struct *tun)
> >>+static int tun_attach(struct tun_struct *tun, struct file *file)
> >>  {
> >>-	struct tun_file *tfile = tun->tfile;
> >>+	struct tun_file *tfile = file->private_data;
> >>+	int err;
> >>+
> >>+	ASSERT_RTNL();
> >>+
> >>+	spin_lock(&tun_lock);
> >>
> >>-	if (atomic_dec_and_test(&tfile->count))
> >>-		tun_detach(tfile->tun);
> >>+	err = -EINVAL;
> >>+	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
> >>+		goto out;
> >>+
> >>+	err = -EBUSY;
> >>+	if (!(tun->flags&  TUN_TAP_MQ)&&  tun->numqueues == 1)
> >>+		goto out;
> >>+
> >>+	if (tun->numqueues == MAX_TAP_QUEUES)
> >>+		goto out;
> >>+
> >>+	err = 0;
> >>+	tfile->queue_index = tun->numqueues;
> >>+	rcu_assign_pointer(tfile->tun, tun);
> >>+	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >>+	sock_hold(&tfile->sk);
> >>+	tun->numqueues++;
> >>+
> >>+	if (tun->numqueues == 1)
> >>+		netif_carrier_on(tun->dev);
> >>+
> >>+	/* device is allowed to go away first, so no need to hold extra
> >>+	 * refcnt. */
> >>+
> >>+out:
> >>+	spin_unlock(&tun_lock);
> >>+	return err;
> >>  }
> >>
> >>  /* TAP filtering */
> >>@@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
> >>  /* Net device detach from fd. */
> >>  static void tun_net_uninit(struct net_device *dev)
> >>  {
> >>-	struct tun_struct *tun = netdev_priv(dev);
> >>-	struct tun_file *tfile = tun->tfile;
> >>-
> >>-	/* Inform the methods they need to stop using the dev.
> >>-	 */
> >>-	if (tfile) {
> >>-		wake_up_all(&tfile->wq.wait);
> >>-		if (atomic_dec_and_test(&tfile->count))
> >>-			__tun_detach(tun);
> >>-	}
> >>+	tun_detach_all(dev);
> >>  }
> >>
> >>  /* Net device open. */
> >>@@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
> >>  /* Net device start xmit */
> >>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  {
> >>-	struct tun_struct *tun = netdev_priv(dev);
> >>-	struct tun_file *tfile = tun->tfile;
> >>+	struct tun_file *tfile = NULL;
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
> >>+	rcu_read_lock();
> >>+	tfile = tun_get_queue(dev, skb);
> >>
> >>  	/* Drop packet if interface is not attached */
> >>  	if (!tfile)
> >>@@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>
> >>  	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
> >>  	>= dev->tx_queue_len) {
> >>-		if (!(tun->flags&  TUN_ONE_QUEUE)) {
> >>+		if (!(tfile->flags&  TUN_ONE_QUEUE)&&
> >Which patch moved flags from tun to tfile?
> 
> Patch 1 caches tun->flags in tfile, but it seems this may let the
> flags get out of sync. So we'd better use the one in tun_struct.
> >
> >>+		    !(tfile->flags&  TUN_TAP_MQ)) {
> >>  			/* Normal queueing mode. */
> >>  			/* Packet scheduler handles dropping of further packets. */
> >>  			netif_stop_queue(dev);
> >>@@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  			 * error is more appropriate. */
> >>  			dev->stats.tx_fifo_errors++;
> >>  		} else {
> >>-			/* Single queue mode.
> >>+			/* Single queue mode or multi queue mode.
> >>  			 * Driver handles dropping of all packets itself. */
> >Please don't do this. Stop the queue on overrun as appropriate.
> >ONE_QUEUE is a legacy hack.
> >
> >BTW we really should stop queue before we start dropping packets,
> >but that can be a separate patch.
> 
> The problem here is the use of NETIF_F_LLTX. The kernel can only see
> one queue even for a multiqueue tun/tap. If we use
> netif_stop_queue(), all the other queues would be stopped as well.

Another reason not to use LLTX?
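Without LLTX each queue gets its own netdev_queue, so flow control can
be per queue. A rough sketch (assumes alloc_netdev_mq() and the queue
index stored in tfile; "limit" stands in for whatever per-queue cap we
settle on):

	/* xmit side, when this queue's backlog fills up */
	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >= limit)
		netif_stop_subqueue(dev, tfile->queue_index);

	/* read side (tun_do_read), after userspace drains an skb */
	netif_wake_subqueue(tun->dev, tfile->queue_index);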

> >>  			goto drop;
> >>  		}
> >>@@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> >>  	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
> >>  				   POLLRDNORM | POLLRDBAND);
> >>+	rcu_read_unlock();
> >>  	return NETDEV_TX_OK;
> >>
> >>  drop:
> >>+	rcu_read_unlock();
> >>  	dev->stats.tx_dropped++;
> >>  	kfree_skb(skb);
> >>  	return NETDEV_TX_OK;
> >>@@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
> >>  static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>  {
> >>  	struct tun_file *tfile = file->private_data;
> >>-	struct tun_struct *tun = __tun_get(tfile);
> >>+	struct tun_struct *tun = NULL;
> >>  	struct sock *sk;
> >>  	unsigned int mask = 0;
> >>
> >>-	if (!tun)
> >>+	if (!tfile)
> >>  		return POLLERR;
> >>
> >>-	sk = tfile->socket.sk;
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>+		return POLLERR;
> >>+	}
> >>+	rcu_read_unlock();
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
> >>+	sk =&tfile->sk;
> >>
> >>  	poll_wait(file,&tfile->wq.wait, wait);
> >>
> >>@@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>  	     sock_writeable(sk)))
> >>  		mask |= POLLOUT | POLLWRNORM;
> >>
> >>-	if (tun->dev->reg_state != NETREG_REGISTERED)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
> >>  		mask = POLLERR;
> >>+	rcu_read_unlock();
> >>
> >>-	tun_put(tun);
> >>  	return mask;
> >>  }
> >>
> >>@@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>  		skb_shinfo(skb)->gso_segs = 0;
> >>  	}
> >>
> >>-	tun = __tun_get(tfile);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return -EBADFD;
> >>+	}
> >>
> >>  	switch (tfile->flags&  TUN_TYPE_MASK) {
> >>  	case TUN_TUN_DEV:
> >>@@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>  		skb->protocol = eth_type_trans(skb, tun->dev);
> >>  		break;
> >>  	}
> >>-
> >>-	netif_rx_ni(skb);
> >>  	tun->dev->stats.rx_packets++;
> >>  	tun->dev->stats.rx_bytes += len;
> >>-	tun_put(tun);
> >>+	rcu_read_unlock();
> >>+
> >>+	netif_rx_ni(skb);
> >>+
> >>  	return count;
> >>
> >>  err_free:
> >>  	count = -EINVAL;
> >>  	kfree_skb(skb);
> >>  err:
> >>-	tun = __tun_get(tfile);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return -EBADFD;
> >>+	}
> >>
> >>  	if (drop)
> >>  		tun->dev->stats.rx_dropped++;
> >>  	if (error)
> >>  		tun->dev->stats.rx_frame_errors++;
> >>-	tun_put(tun);
> >>+	rcu_read_unlock();
> >>  	return count;
> >>  }
> >>
> >>@@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
> >>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> >>  	total += skb->len;
> >>
> >>-	tun = __tun_get(tfile);
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>  	if (tun) {
> >>  		tun->dev->stats.tx_packets++;
> >>  		tun->dev->stats.tx_bytes += len;
> >>-		tun_put(tun);
> >>  	}
> >>+	rcu_read_unlock();
> >>
> >>  	return total;
> >>  }
> >>@@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
> >>  				break;
> >>  			}
> >>
> >>-			tun = __tun_get(tfile);
> >>+			rcu_read_lock();
> >>+			tun = rcu_dereference(tfile->tun);
> >>  			if (!tun) {
> >>-				ret = -EIO;
> >>+				ret = -EBADFD;
> >BADFD is for when you get passed something like -1 fd.
> >Here fd is OK, it's just in a bad state so you can not do IO.
> >
> 
> Sure.
> >>+				rcu_read_unlock();
> >>  				break;
> >>  			}
> >>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
> >>  				ret = -EIO;
> >>-				tun_put(tun);
> >>+				rcu_read_unlock();
> >>  				break;
> >>  			}
> >>-			tun_put(tun);
> >>+			rcu_read_unlock();
> >>
> >>  			/* Nothing to read, let's sleep */
> >>  			schedule();
> >>  			continue;
> >>  		}
> >>
> >>-		tun = __tun_get(tfile);
> >>+		rcu_read_lock();
> >>+		tun = rcu_dereference(tfile->tun);
> >>  		if (tun) {
> >>  			netif_wake_queue(tun->dev);
> >>-			tun_put(tun);
> >>  		}
> >>+		rcu_read_unlock();
> >>
> >>  		ret = tun_put_user(tfile, skb, iv, len);
> >>  		kfree_skb(skb);
> >>@@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
> >>  	if (tun->flags&  TUN_VNET_HDR)
> >>  		flags |= IFF_VNET_HDR;
> >>
> >>+	if (tun->flags&  TUN_TAP_MQ)
> >>+		flags |= IFF_MULTI_QUEUE;
> >>+
> >>  	return flags;
> >>  }
> >>
> >>@@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  		err = tun_attach(tun, file);
> >>  		if (err<  0)
> >>  			return err;
> >>-	}
> >>-	else {
> >>+	} else {
> >>  		char *name;
> >>  		unsigned long flags = 0;
> >>
> >>@@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> >>  			TUN_USER_FEATURES;
> >>  		dev->features = dev->hw_features;
> >>+		if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
> >>+			dev->features |= NETIF_F_LLTX;
> >>
> >>  		err = register_netdevice(tun->dev);
> >>  		if (err<  0)
> >>@@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>
> >>  		err = tun_attach(tun, file);
> >>  		if (err<  0)
> >>-			goto failed;
> >>+			goto err_free_dev;
> >>  	}
> >>
> >>  	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
> >>@@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>  	else
> >>  		tun->flags&= ~TUN_VNET_HDR;
> >>
> >>+	if (ifr->ifr_flags&  IFF_MULTI_QUEUE)
> >>+		tun->flags |= TUN_TAP_MQ;
> >>+	else
> >>+		tun->flags&= ~TUN_TAP_MQ;
> >>+
> >>  	/* Cache flags from tun device */
> >>  	tfile->flags = tun->flags;
> >>  	/* Make sure persistent devices do not get stuck in
> >>@@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>
> >>  err_free_dev:
> >>  	free_netdev(dev);
> >>-failed:
> >>  	return err;
> >>  }
> >>
> >>@@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  				(unsigned int __user*)argp);
> >>  	}
> >>
> >>-	rtnl_lock();
> >>-
> >>-	tun = __tun_get(tfile);
> >>-	if (cmd == TUNSETIFF&&  !tun) {
> >>+	ret = 0;
> >>+	if (cmd == TUNSETIFF) {
> >>+		rtnl_lock();
> >>  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> >>-
> >>  		ret = tun_set_iff(tfile->net, file,&ifr);
> >>-
> >>+		rtnl_unlock();
> >>  		if (ret)
> >>-			goto unlock;
> >>-
> >>+			return ret;
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>-			ret = -EFAULT;
> >>-		goto unlock;
> >>+			return -EFAULT;
> >>+		return ret;
> >>  	}
> >>
> >>+	rtnl_lock();
> >>+
> >>+	rcu_read_lock();
> >>+
> >>  	ret = -EBADFD;
> >>+	tun = rcu_dereference(tfile->tun);
> >>  	if (!tun)
> >>  		goto unlock;
> >>+	else
> >>+		ret = 0;
> >>
> >>-	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
> >>-
> >>-	ret = 0;
> >>  	switch (cmd) {
> >>  	case TUNGETIFF:
> >>  		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
> >>+		rcu_read_unlock();
> >>  		if (ret)
> >>-			break;
> >>+			goto out;
> >>
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>  			ret = -EFAULT;
> >>-		break;
> >>+		goto out;
> >>
> >>  	case TUNSETNOCSUM:
> >>  		/* Disable/Enable checksum */
> >>@@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  		/* Get hw address */
> >>  		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
> >>  		ifr.ifr_hwaddr.sa_family = tun->dev->type;
> >>+		rcu_read_unlock();
> >>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>  			ret = -EFAULT;
> >>-		break;
> >>+		goto out;
> >>
> >>  	case SIOCSIFHWADDR:
> >>  		/* Set hw address */
> >>@@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>  	}
> >>
> >>  unlock:
> >>+	rcu_read_unlock();
> >>+out:
> >>  	rtnl_unlock();
> >>-	if (tun)
> >>-		tun_put(tun);
> >>  	return ret;
> >>  }
> >>
> >>@@ -1517,6 +1624,11 @@ out:
> >>  	return ret;
> >>  }
> >>
> >>+static void tun_sock_destruct(struct sock *sk)
> >>+{
> >>+	skb_queue_purge(&sk->sk_receive_queue);
> >>+}
> >>+
> >>  static int tun_chr_open(struct inode *inode, struct file * file)
> >>  {
> >>  	struct net *net = current->nsproxy->net_ns;
> >>@@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>  	sock_init_data(&tfile->socket,&tfile->sk);
> >>
> >>  	tfile->sk.sk_write_space = tun_sock_write_space;
> >>+	tfile->sk.sk_destruct = tun_sock_destruct;
> >>  	tfile->sk.sk_sndbuf = INT_MAX;
> >>  	file->private_data = tfile;
> >>
> >>@@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>  static int tun_chr_close(struct inode *inode, struct file *file)
> >>  {
> >>  	struct tun_file *tfile = file->private_data;
> >>-	struct tun_struct *tun;
> >>-
> >>-	tun = __tun_get(tfile);
> >>-	if (tun) {
> >>-		struct net_device *dev = tun->dev;
> >>-
> >>-		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
> >>-
> >>-		__tun_detach(tun);
> >>-
> >>-		/* If desirable, unregister the netdevice. */
> >>-		if (!(tun->flags&  TUN_PERSIST)) {
> >>-			rtnl_lock();
> >>-			if (dev->reg_state == NETREG_REGISTERED)
> >>-				unregister_netdevice(dev);
> >>-			rtnl_unlock();
> >>-		}
> >>
> >>-		/* drop the reference that netdevice holds */
> >>-		sock_put(&tfile->sk);
> >>-
> >>-	}
> >>-
> >>-	/* drop the reference that file holds */
> >>-	sock_put(&tfile->sk);
> >>+	tun_detach(tfile, true);
> >>
> >>  	return 0;
> >>  }
> >>@@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
> >>   * holding a reference to the file for as long as the socket is in use. */
> >>  struct socket *tun_get_socket(struct file *file)
> >>  {
> >>-	struct tun_struct *tun;
> >>+	struct tun_struct *tun = NULL;
> >>  	struct tun_file *tfile = file->private_data;
> >>  	if (file->f_op !=&tun_fops)
> >>  		return ERR_PTR(-EINVAL);
> >>-	tun = tun_get(file);
> >>-	if (!tun)
> >>+	rcu_read_lock();
> >>+	tun = rcu_dereference(tfile->tun);
> >>+	if (!tun) {
> >>+		rcu_read_unlock();
> >>  		return ERR_PTR(-EBADFD);
> >>-	tun_put(tun);
> >>+	}
> >>+	rcu_read_unlock();
> >>  	return&tfile->socket;
> >>  }
> >>  EXPORT_SYMBOL_GPL(tun_get_socket);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/6] tuntap: per queue 64 bit stats
  2012-06-26  6:00     ` Jason Wang
  2012-06-26  6:10       ` Eric Dumazet
@ 2012-06-26 19:46       ` Michael S. Tsirkin
  1 sibling, 0 replies; 28+ messages in thread
From: Michael S. Tsirkin @ 2012-06-26 19:46 UTC (permalink / raw)
  To: Jason Wang
  Cc: Eric Dumazet, akong, habanero, tahm, haixiao, jwhan,
	ernesto.martin, mashirle, davem, netdev, linux-kernel, krkumar2,
	shemminger, edumazet

On Tue, Jun 26, 2012 at 02:00:53PM +0800, Jason Wang wrote:
> On 06/25/2012 08:52 PM, Eric Dumazet wrote:
> >On Mon, 2012-06-25 at 19:59 +0800, Jason Wang wrote:
> >>As we've added multiqueue support for tun/tap, this patch convert the statistics
> >>to use per-queue 64 bit statistics.
> >LLTX means you can have several cpus calling TX path in parallel.
> >
> >So tx stats are wrong (even before this patch), and racy after this
> >patch (if several cpu access same queue, it seems to be possible)
> >
> >        u64_stats_update_begin(&tfile->stats.tx_syncp);
> >        tfile->stats.tx_packets++;
> >        tfile->stats.tx_bytes += total;
> >        u64_stats_update_end(&tfile->stats.tx_syncp);
> >
> >This can break horribly if several cpus run this code using same 'tfile'
> >pointer.
> 
> Yes, it looks like it's hard to use NETIF_F_LLTX without breaking the
> u64 statistics; it may be worth using a tx lock and alloc_netdev_mq().

Or make them per cpu as most everyone did.
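
A rough sketch of the usual pattern (struct and field names are hypothetical
here, not from this patch):

	struct tun_pcpu_stats {
		u64			tx_packets;
		u64			tx_bytes;
		struct u64_stats_sync	syncp;
	};
	/* in tun_struct: struct tun_pcpu_stats __percpu *pcpu_stats; */

	/* writer side -- no tx lock needed, each cpu only touches
	 * its own counters */
	struct tun_pcpu_stats *stats = this_cpu_ptr(tun->pcpu_stats);

	u64_stats_update_begin(&stats->syncp);
	stats->tx_packets++;
	stats->tx_bytes += total;
	u64_stats_update_end(&stats->syncp);

Readers would then sum over for_each_possible_cpu() under
u64_stats_fetch_begin()/u64_stats_fetch_retry().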


> >I suggest this patch comes before 'tuntap: multiqueue support' in the
> >serie.
> 
> Sure, thanks.
> >
> >
> >

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-26 10:42       ` Michael S. Tsirkin
@ 2012-06-27  5:16         ` Jason Wang
  2012-06-27  8:44           ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-27  5:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>> could write and read from those files to do the parallel packet
>>>> sending/receiving.
>>>>
>>>> Unlike the previous single queue implementation, the socket and device were
>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>> tx path lockless, netif_tx_loch_bh() is replaced by RCU/NETIF_F_LLTX to
>>>> synchronize between data path and system call.
>>> Don't use LLTX/RCU. It's not worth it.
>>> Use something like netif_set_real_num_tx_queues.
>>>
>>>> The tx queue selecting is first based on the recorded rxq index of an skb, it
>>>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> Interestingly macvtap switched to hashing first:
>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>> (the commit log is corrupted but see what it
>>> does in the patch).
>>> Any idea why?
>> Yes, so tap should be changed to behave the same as macvtap. I remember
>> the reason we do that is to make sure the packets of a single flow are
>> queued to a fixed socket/virtqueue, since 10g cards like ixgbe
>> choose the rx queue for a flow based on the last tx queue where the
>> packets of that flow were sent. So if we used the recorded rx queue in
>> macvtap, the queue index of a flow would change as the vhost thread
>> moves among processors.
> Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might land
> on VCPU1 in the guest, which is not good, right?

Yes, but that's better than having the rx move between vcpus when we use the
recorded rx queue. Flow steering is needed to make sure tx and rx stay on
the same vcpu.
>> But while testing tun/tap, one interesting thing I found is that even
>> though ixgbe has recorded the queue index during rx, it seems to be lost
>> when tap tries to transmit skbs to userspace.
> dev_pick_tx does this I think but ndo_select_queue
> should be able to get it without trouble.
>
>
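
For illustration, a tun ndo_select_queue reusing the patch's selection logic
might look like this once the kernel sees all queues (a sketch only, built
on the numqueues field this patch adds):

	static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
	{
		struct tun_struct *tun = netdev_priv(dev);
		u32 numqueues = tun->numqueues;
		u32 txq;

		if (!numqueues)
			return 0;

		if (skb_rx_queue_recorded(skb)) {
			/* borrow the rx queue recorded by the mq nic */
			txq = skb_get_rx_queue(skb);
			while (txq >= numqueues)
				txq -= numqueues;
		} else {
			/* otherwise scale the flow hash into range */
			txq = ((u64)skb_get_rxhash(skb) * numqueues) >> 32;
		}
		return txq;
	}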
>>>> ---
>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index 8233b0a..5c26757 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>   };
>>>>
>>>> +#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
>>> Why the limit? I am guessing you copied this from macvtap?
>>> This is problematic for a number of reasons:
>>> 	- will not play well with migration
>>> 	- will not work well for a large guest
>>>
>>> Yes, macvtap needs to be fixed too.
>>>
>>> I am guessing what it is trying to prevent is queueing
>>> up a huge number of packets?
>>> So just divide the default tx queue limit by the # of queues.
>>>
>>> And by the way, for MQ applications maybe we can finally
>>> ignore tx queue altogether and limit the total number
>>> of bytes queued?
>>> To avoid regressions we can make it large like 64M/# queues.
>>> Could be a separate patch I think, and for a single queue
>>> might need a compatible mode though I am not sure.
>>>
>>>> +
>>>>   struct tun_file {
>>>>   	struct sock sk;
>>>>   	struct socket socket;
>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>   	int vnet_hdr_sz;
>>>>   	struct tap_filter txflt;
>>>>   	atomic_t count;
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct __rcu *tun;
>>>>   	struct net *net;
>>>>   	struct fasync_struct *fasync;
>>>>   	unsigned int flags;
>>>> +	u16 queue_index;
>>>>   };
>>>>
>>>>   struct tun_sock;
>>>>
>>>>   struct tun_struct {
>>>> -	struct tun_file		*tfile;
>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>> +	unsigned int            numqueues;
>>>>   	unsigned int 		flags;
>>>>   	uid_t			owner;
>>>>   	gid_t			group;
>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>   #endif
>>>>   };
>>>>
>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>> +
>>>> +/*
>>>> + * tun_get_queue(): calculate the queue index
>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>> + *     - if not, calculate from the hash
>>>> + */
>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>> +				      struct sk_buff *skb)
>>>>   {
>>>> -	struct tun_file *tfile = file->private_data;
>>>> -	int err;
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile = NULL;
>>>> +	int numqueues = tun->numqueues;
>>>> +	__u32 rxq;
>>>>
>>>> -	ASSERT_RTNL();
>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> +	if (!numqueues)
>>>> +		goto out;
>>>>
>>>> -	err = -EINVAL;
>>>> -	if (tfile->tun)
>>>> +	if (numqueues == 1) {
>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>> Instead of hacks like this, you can ask for an MQ
>>> flag to be set in SETIFF. Then you won't need to
>>> handle attach/detach at random times.
>>> And most of the scary num_queues checks can go away.
>>> You can then also ask userspace about the max # of queues
>>> to expect if you want to save some memory.
>>>
>>>
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = -EBUSY;
>>>> -	if (tun->tfile)
>>>> +	if (likely(skb_rx_queue_recorded(skb))) {
>>>> +		rxq = skb_get_rx_queue(skb);
>>>> +
>>>> +		while (unlikely(rxq>= numqueues))
>>>> +			rxq -= numqueues;
>>>> +
>>>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = 0;
>>>> -	tfile->tun = tun;
>>>> -	tun->tfile = tfile;
>>>> -	netif_carrier_on(tun->dev);
>>>> -	dev_hold(tun->dev);
>>>> -	sock_hold(&tfile->sk);
>>>> -	atomic_inc(&tfile->count);
>>>> +	/* Check if we can use flow to select a queue */
>>>> +	rxq = skb_get_rxhash(skb);
>>>> +	if (rxq) {
>>>> +		u32 idx = ((u64)rxq * numqueues)>>   32;
>>> This completely confuses me. What's the logic here?
>>> How do we even know it's in range?
>>>
>>>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>>>> +		goto out;
>>>> +	}
>>>>
>>>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>>>   out:
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -	return err;
>>>> +	return tfile;
>>>>   }
>>>>
>>>> -static void __tun_detach(struct tun_struct *tun)
>>>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -	/* Detach from net device */
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> -	netif_carrier_off(tun->dev);
>>>> -	tun->tfile = NULL;
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -
>>>> -	/* Drop read queue */
>>>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>>> -
>>>> -	/* Drop the extra count on the net device */
>>>> -	dev_put(tun->dev);
>>>> -}
>>>> +	struct tun_struct *tun;
>>>> +	struct net_device *dev = NULL;
>>>> +	bool destroy = false;
>>>>
>>>> -static void tun_detach(struct tun_struct *tun)
>>>> -{
>>>> -	rtnl_lock();
>>>> -	__tun_detach(tun);
>>>> -	rtnl_unlock();
>>>> -}
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>>>> -{
>>>> -	struct tun_struct *tun = NULL;
>>>> +	tun = rcu_dereference_protected(tfile->tun,
>>>> +					lockdep_is_held(&tun_lock));
>>>> +	if (tun) {
>>>> +		u16 index = tfile->queue_index;
>>>> +		BUG_ON(index>= tun->numqueues);
>>>> +		dev = tun->dev;
>>>> +
>>>> +		rcu_assign_pointer(tun->tfiles[index],
>>>> +				   tun->tfiles[tun->numqueues - 1]);
>>>> +		tun->tfiles[index]->queue_index = index;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +		sock_put(&tfile->sk);
>>>>
>>>> -	if (atomic_inc_not_zero(&tfile->count))
>>>> -		tun = tfile->tun;
>>>> +		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
>>>> +			destroy = true;
>>> Please don't use flags like that. Use dedicated labels and goto there on error.
>>>
>>>
>>>> +	}
>>>>
>>>> -	return tun;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	if (clean)
>>>> +		sock_put(&tfile->sk);
>>>> +
>>>> +	if (destroy) {
>>>> +		rtnl_lock();
>>>> +		if (dev->reg_state == NETREG_REGISTERED)
>>>> +			unregister_netdevice(dev);
>>>> +		rtnl_unlock();
>>>> +	}
>>>> +
>>>> +	return 0;
>>>>   }
>>>>
>>>> -static struct tun_struct *tun_get(struct file *file)
>>>> +static void tun_detach_all(struct net_device *dev)
>>>>   {
>>>> -	return __tun_get(file->private_data);
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>>>> +	int i, j = 0;
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>> +
>>>> +	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
>>>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>>>> +						lockdep_is_held(&tun_lock));
>>>> +		BUG_ON(!tfile);
>>>> +		wake_up_all(&tfile->wq.wait);
>>>> +		tfile_list[j++] = tfile;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +	}
>>>> +	BUG_ON(tun->numqueues != 0);
>>>> +	/* guarantee that any future tun_attach will fail */
>>>> +	tun->numqueues = MAX_TAP_QUEUES;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	for (--j; j>= 0; j--)
>>>> +		sock_put(&tfile_list[j]->sk);
>>>>   }
>>>>
>>>> -static void tun_put(struct tun_struct *tun)
>>>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = file->private_data;
>>>> +	int err;
>>>> +
>>>> +	ASSERT_RTNL();
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -	if (atomic_dec_and_test(&tfile->count))
>>>> -		tun_detach(tfile->tun);
>>>> +	err = -EINVAL;
>>>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>>>> +		goto out;
>>>> +
>>>> +	err = -EBUSY;
>>>> +	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
>>>> +		goto out;
>>>> +
>>>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>>>> +		goto out;
>>>> +
>>>> +	err = 0;
>>>> +	tfile->queue_index = tun->numqueues;
>>>> +	rcu_assign_pointer(tfile->tun, tun);
>>>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>> +	sock_hold(&tfile->sk);
>>>> +	tun->numqueues++;
>>>> +
>>>> +	if (tun->numqueues == 1)
>>>> +		netif_carrier_on(tun->dev);
>>>> +
>>>> +	/* device is allowed to go away first, so no need to hold extra
>>>> +	 * refcnt. */
>>>> +
>>>> +out:
>>>> +	spin_unlock(&tun_lock);
>>>> +	return err;
>>>>   }
>>>>
>>>>   /* TAP filtering */
>>>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>>>   /* Net device detach from fd. */
>>>>   static void tun_net_uninit(struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -
>>>> -	/* Inform the methods they need to stop using the dev.
>>>> -	 */
>>>> -	if (tfile) {
>>>> -		wake_up_all(&tfile->wq.wait);
>>>> -		if (atomic_dec_and_test(&tfile->count))
>>>> -			__tun_detach(tun);
>>>> -	}
>>>> +	tun_detach_all(dev);
>>>>   }
>>>>
>>>>   /* Net device open. */
>>>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>>>   /* Net device start xmit */
>>>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = NULL;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>>> +	rcu_read_lock();
>>>> +	tfile = tun_get_queue(dev, skb);
>>>>
>>>>   	/* Drop packet if interface is not attached */
>>>>   	if (!tfile)
>>>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>
>>>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>>>   	>= dev->tx_queue_len) {
>>>> -		if (!(tun->flags&   TUN_ONE_QUEUE)) {
>>>> +		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
>>> Which patch moved flags from tun to tfile?
>>>
>>>> +		    !(tfile->flags&   TUN_TAP_MQ)) {
>>>>   			/* Normal queueing mode. */
>>>>   			/* Packet scheduler handles dropping of further packets. */
>>>>   			netif_stop_queue(dev);
>>>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   			 * error is more appropriate. */
>>>>   			dev->stats.tx_fifo_errors++;
>>>>   		} else {
>>>> -			/* Single queue mode.
>>>> +			/* Single queue mode or multi queue mode.
>>>>   			 * Driver handles dropping of all packets itself. */
>>> Please don't do this. Stop the queue on overrun as appropriate.
>>> ONE_QUEUE is a legacy hack.
>>>
>>> BTW we really should stop queue before we start dropping packets,
>>> but that can be a separate patch.
>>>
>>>>   			goto drop;
>>>>   		}
>>>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>>>   				   POLLRDNORM | POLLRDBAND);
>>>> +	rcu_read_unlock();
>>>>   	return NETDEV_TX_OK;
>>>>
>>>>   drop:
>>>> +	rcu_read_unlock();
>>>>   	dev->stats.tx_dropped++;
>>>>   	kfree_skb(skb);
>>>>   	return NETDEV_TX_OK;
>>>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun = __tun_get(tfile);
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct sock *sk;
>>>>   	unsigned int mask = 0;
>>>>
>>>> -	if (!tun)
>>>> +	if (!tfile)
>>>>   		return POLLERR;
>>>>
>>>> -	sk = tfile->socket.sk;
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>> +		return POLLERR;
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>>> +	sk =&tfile->sk;
>>>>
>>>>   	poll_wait(file,&tfile->wq.wait, wait);
>>>>
>>>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   	     sock_writeable(sk)))
>>>>   		mask |= POLLOUT | POLLWRNORM;
>>>>
>>>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>>>   		mask = POLLERR;
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_put(tun);
>>>>   	return mask;
>>>>   }
>>>>
>>>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb_shinfo(skb)->gso_segs = 0;
>>>>   	}
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	switch (tfile->flags&   TUN_TYPE_MASK) {
>>>>   	case TUN_TUN_DEV:
>>>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>>>   		break;
>>>>   	}
>>>> -
>>>> -	netif_rx_ni(skb);
>>>>   	tun->dev->stats.rx_packets++;
>>>>   	tun->dev->stats.rx_bytes += len;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	netif_rx_ni(skb);
>>>> +
>>>>   	return count;
>>>>
>>>>   err_free:
>>>>   	count = -EINVAL;
>>>>   	kfree_skb(skb);
>>>>   err:
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	if (drop)
>>>>   		tun->dev->stats.rx_dropped++;
>>>>   	if (error)
>>>>   		tun->dev->stats.rx_frame_errors++;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>>   	return count;
>>>>   }
>>>>
>>>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>>>   	total += skb->len;
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (tun) {
>>>>   		tun->dev->stats.tx_packets++;
>>>>   		tun->dev->stats.tx_bytes += len;
>>>> -		tun_put(tun);
>>>>   	}
>>>> +	rcu_read_unlock();
>>>>
>>>>   	return total;
>>>>   }
>>>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>>>   				break;
>>>>   			}
>>>>
>>>> -			tun = __tun_get(tfile);
>>>> +			rcu_read_lock();
>>>> +			tun = rcu_dereference(tfile->tun);
>>>>   			if (!tun) {
>>>> -				ret = -EIO;
>>>> +				ret = -EBADFD;
>>> BADFD is for when you get passed something like -1 fd.
>>> Here fd is OK, it's just in a bad state so you can not do IO.
>>>
>>>
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>>>   				ret = -EIO;
>>>> -				tun_put(tun);
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>> -			tun_put(tun);
>>>> +			rcu_read_unlock();
>>>>
>>>>   			/* Nothing to read, let's sleep */
>>>>   			schedule();
>>>>   			continue;
>>>>   		}
>>>>
>>>> -		tun = __tun_get(tfile);
>>>> +		rcu_read_lock();
>>>> +		tun = rcu_dereference(tfile->tun);
>>>>   		if (tun) {
>>>>   			netif_wake_queue(tun->dev);
>>>> -			tun_put(tun);
>>>>   		}
>>>> +		rcu_read_unlock();
>>>>
>>>>   		ret = tun_put_user(tfile, skb, iv, len);
>>>>   		kfree_skb(skb);
>>>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>>>   	if (tun->flags&   TUN_VNET_HDR)
>>>>   		flags |= IFF_VNET_HDR;
>>>>
>>>> +	if (tun->flags&   TUN_TAP_MQ)
>>>> +		flags |= IFF_MULTI_QUEUE;
>>>> +
>>>>   	return flags;
>>>>   }
>>>>
>>>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>>   			return err;
>>>> -	}
>>>> -	else {
>>>> +	} else {
>>>>   		char *name;
>>>>   		unsigned long flags = 0;
>>>>
>>>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>>>   			TUN_USER_FEATURES;
>>>>   		dev->features = dev->hw_features;
>>>> +		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +			dev->features |= NETIF_F_LLTX;
>>>>
>>>>   		err = register_netdevice(tun->dev);
>>>>   		if (err<   0)
>>>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>> -			goto failed;
>>>> +			goto err_free_dev;
>>>>   	}
>>>>
>>>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>>>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   	else
>>>>   		tun->flags&= ~TUN_VNET_HDR;
>>>>
>>>> +	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +		tun->flags |= TUN_TAP_MQ;
>>>> +	else
>>>> +		tun->flags&= ~TUN_TAP_MQ;
>>>> +
>>>>   	/* Cache flags from tun device */
>>>>   	tfile->flags = tun->flags;
>>>>   	/* Make sure persistent devices do not get stuck in
>>>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   err_free_dev:
>>>>   	free_netdev(dev);
>>>> -failed:
>>>>   	return err;
>>>>   }
>>>>
>>>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   				(unsigned int __user*)argp);
>>>>   	}
>>>>
>>>> -	rtnl_lock();
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (cmd == TUNSETIFF&&   !tun) {
>>>> +	ret = 0;
>>>> +	if (cmd == TUNSETIFF) {
>>>> +		rtnl_lock();
>>>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>>>> -
>>>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>>>> -
>>>> +		rtnl_unlock();
>>>>   		if (ret)
>>>> -			goto unlock;
>>>> -
>>>> +			return ret;
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>> -			ret = -EFAULT;
>>>> -		goto unlock;
>>>> +			return -EFAULT;
>>>> +		return ret;
>>>>   	}
>>>>
>>>> +	rtnl_lock();
>>>> +
>>>> +	rcu_read_lock();
>>>> +
>>>>   	ret = -EBADFD;
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (!tun)
>>>>   		goto unlock;
>>>> +	else
>>>> +		ret = 0;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>>>> -
>>>> -	ret = 0;
>>>>   	switch (cmd) {
>>>>   	case TUNGETIFF:
>>>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>>>> +		rcu_read_unlock();
>>>>   		if (ret)
>>>> -			break;
>>>> +			goto out;
>>>>
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case TUNSETNOCSUM:
>>>>   		/* Disable/Enable checksum */
>>>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   		/* Get hw address */
>>>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>>>> +		rcu_read_unlock();
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case SIOCSIFHWADDR:
>>>>   		/* Set hw address */
>>>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   	}
>>>>
>>>>   unlock:
>>>> +	rcu_read_unlock();
>>>> +out:
>>>>   	rtnl_unlock();
>>>> -	if (tun)
>>>> -		tun_put(tun);
>>>>   	return ret;
>>>>   }
>>>>
>>>> @@ -1517,6 +1624,11 @@ out:
>>>>   	return ret;
>>>>   }
>>>>
>>>> +static void tun_sock_destruct(struct sock *sk)
>>>> +{
>>>> +	skb_queue_purge(&sk->sk_receive_queue);
>>>> +}
>>>> +
>>>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   {
>>>>   	struct net *net = current->nsproxy->net_ns;
>>>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>>>
>>>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>>>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>>>   	file->private_data = tfile;
>>>>
>>>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun;
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (tun) {
>>>> -		struct net_device *dev = tun->dev;
>>>> -
>>>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>>>> -
>>>> -		__tun_detach(tun);
>>>> -
>>>> -		/* If desirable, unregister the netdevice. */
>>>> -		if (!(tun->flags&   TUN_PERSIST)) {
>>>> -			rtnl_lock();
>>>> -			if (dev->reg_state == NETREG_REGISTERED)
>>>> -				unregister_netdevice(dev);
>>>> -			rtnl_unlock();
>>>> -		}
>>>>
>>>> -		/* drop the reference that netdevice holds */
>>>> -		sock_put(&tfile->sk);
>>>> -
>>>> -	}
>>>> -
>>>> -	/* drop the reference that file holds */
>>>> -	sock_put(&tfile->sk);
>>>> +	tun_detach(tfile, true);
>>>>
>>>>   	return 0;
>>>>   }
>>>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>>>    * holding a reference to the file for as long as the socket is in use. */
>>>>   struct socket *tun_get_socket(struct file *file)
>>>>   {
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct tun_file *tfile = file->private_data;
>>>>   	if (file->f_op !=&tun_fops)
>>>>   		return ERR_PTR(-EINVAL);
>>>> -	tun = tun_get(file);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return ERR_PTR(-EBADFD);
>>>> -	tun_put(tun);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>   	return&tfile->socket;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(tun_get_socket);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-26 11:54       ` Michael S. Tsirkin
@ 2012-06-27  5:59         ` Jason Wang
  2012-06-27  8:26           ` Michael S. Tsirkin
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-27  5:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet

On 06/26/2012 07:54 PM, Michael S. Tsirkin wrote:
> On Tue, Jun 26, 2012 at 01:52:57PM +0800, Jason Wang wrote:
>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>> could write and read from those files to do the parallel packet
>>>> sending/receiving.
>>>>
>>>> Unlike the previous single queue implementation, the socket and device were
>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>> tx path lockless, netif_tx_loch_bh() is replaced by RCU/NETIF_F_LLTX to
>>>> synchronize between data path and system call.
>>> Don't use LLTX/RCU. It's not worth it.
>>> Use something like netif_set_real_num_tx_queues.
>>>
>> For LLTX, maybe it's better to convert it to alloc_netdev_mq() to
>> let the kernel see all queues and make the queue stopping and
>> per-queue stats easier.
>> RCU is used to handle the attaching/detaching while tun/tap is
>> sending and receiving packets, which looks reasonable to me.
> Yes but do we have to allow this? How about we always ask
> userspace to attach to all active queues?

Attaching/detaching is the method used to activate/deactivate a queue. If all
queues were kept attached, then we'd need another method or flag to mark a
queue as activated/deactivated, and we'd still need to synchronize with the
data path.
>> Not
>> sure netif_set_real_num_tx_queues() can help in this situation.
> Check it out.
>
>>>> The tx queue selecting is first based on the recorded rxq index of an skb, it
>>>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> Interestingly macvtap switched to hashing first:
>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>> (the commit log is corrupted but see what it
>>> does in the patch).
>>> Any idea why?
>>>
>>>> ---
>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index 8233b0a..5c26757 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>   };
>>>>
>>>> +#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
>>> Why the limit? I am guessing you copied this from macvtap?
>>> This is problematic for a number of reasons:
>>> 	- will not play well with migration
>>> 	- will not work well for a large guest
>>>
>>> Yes, macvtap needs to be fixed too.
>>>
>>> I am guessing what it is trying to prevent is queueing
>>> up a huge number of packets?
>>> So just divide the default tx queue limit by the # of queues.
>> Not sure,
>> other reasons I can guess:
>> - to prevent storing a large array of pointers in tun_struct or macvlan_dev.
> OK so with the limit of e.g. 1024 we'd allocate at most
> 2 pages of memory. This doesn't look too bad. 1024 is probably a
> high enough limit: modern hypervisors seem to support on the order
> of 100-200 CPUs so this leaves us some breathing space
> if we want to match a queue per guest CPU.
> Of course we need to limit the packets per queue
> in such a setup more aggressively. 1000 packets * 1000 queues
> * 64K per packet is too much.
>
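
(For scale: 1024 queue pointers at 8 bytes each is 8KB, i.e. two 4KB pages,
while 1000 packets * 1000 queues * 64KB per packet is about 61GB of
worst-case queued memory.)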
>> - it may not be suitable to allow the number of virtqueues to be greater
>> than the number of physical queues in the card
> Maybe for macvtap, here we have no idea which card we
> are working with and how many queues it has.
>
>>> And by the way, for MQ applications maybe we can finally
>>> ignore tx queue altogether and limit the total number
>>> of bytes queued?
>>> To avoid regressions we can make it large like 64M/# queues.
>>> Could be a separate patch I think, and for a single queue
>>> might need a compatible mode though I am not sure.
>> Could you explain more about this?
>> Did you mean to have a total
>> sndbuf for all sockets that attached to tun/tap?
> Consider that we currently limit the # of
> packets queued at tun for xmit to userspace.
> Some limit is needed but # of packets sounds
> very silly - limiting the total memory
> might be more reasonable.
>
> In case of multiqueue, we really care about
> total # of packets or total memory, but a simple
> approximation could be to divide the allocation
> between active queues equally.

A possible method is to divide TUN_READQ_SIZE by #queues, but make the
result at least equal to the vring size (256).
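
Something along these lines (a sketch; TUN_READQ_SIZE and numqueues are the
existing names, max_t the usual kernel helper):

	/* per-queue budget: share the device limit across queues, but
	 * never go below one vring's worth of buffers */
	u32 budget = max_t(u32, TUN_READQ_SIZE / tun->numqueues, 256);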
>
> qdisc also queues some packets, that logic is
> using # of packets anyway. So either make that
> 1000/# queues, or even set to 0 as Eric once
> suggested.
>
>>>> +
>>>>   struct tun_file {
>>>>   	struct sock sk;
>>>>   	struct socket socket;
>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>   	int vnet_hdr_sz;
>>>>   	struct tap_filter txflt;
>>>>   	atomic_t count;
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct __rcu *tun;
>>>>   	struct net *net;
>>>>   	struct fasync_struct *fasync;
>>>>   	unsigned int flags;
>>>> +	u16 queue_index;
>>>>   };
>>>>
>>>>   struct tun_sock;
>>>>
>>>>   struct tun_struct {
>>>> -	struct tun_file		*tfile;
>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>> +	unsigned int            numqueues;
>>>>   	unsigned int 		flags;
>>>>   	uid_t			owner;
>>>>   	gid_t			group;
>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>   #endif
>>>>   };
>>>>
>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>> +
>>>> +/*
>>>> + * tun_get_queue(): calculate the queue index
>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>> + *     - if not, calculate from the hash
>>>> + */
>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>> +				      struct sk_buff *skb)
>>>>   {
>>>> -	struct tun_file *tfile = file->private_data;
>>>> -	int err;
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile = NULL;
>>>> +	int numqueues = tun->numqueues;
>>>> +	__u32 rxq;
>>>>
>>>> -	ASSERT_RTNL();
>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> +	if (!numqueues)
>>>> +		goto out;
>>>>
>>>> -	err = -EINVAL;
>>>> -	if (tfile->tun)
>>>> +	if (numqueues == 1) {
>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>> Instead of hacks like this, you can ask for an MQ
>>> flag to be set in SETIFF. Then you won't need to
>>> handle attach/detach at random times.
>> Consider a user switching between an sq guest and an mq guest: qemu would
>> attach or detach the fd, which could not be expected by the kernel.
> Can't userspace keep it attached always, just deactivate MQ?
>
>>> And most of the scary num_queues checks can go away.
>> Even if we had an MQ flag, userspace could still attach just one queue
>> to the device.
> I think we allow too much flexibility if we let
> userspace detach a random queue.

The point is to let tun/tap have the same flexibility as macvtap. Macvtap
allows adding/deleting queues at any time, and it's very easy to add
detach/attach to macvtap. So we can easily use almost the same ioctls to
activate/deactivate a queue at any time for both tap and macvtap.
> Maybe only allow attaching/detaching with MQ off?
> If userspace wants to attach/detach, clear MQ first?

Maybe I didn't understand the point here, but I don't see any advantages
except more ioctl() calls.
> Alternatively, attach/detach all queues in one ioctl?

Yes, it can be the same one.
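
For example (TUNSETQUEUE and the attach/detach flags are hypothetical names
here, only to make the shape concrete):

	case TUNSETQUEUE:
		if (ifr.ifr_flags & IFF_ATTACH_QUEUE)
			ret = tun_attach(tun, file);
		else if (ifr.ifr_flags & IFF_DETACH_QUEUE)
			ret = tun_detach(tfile, false);
		else
			ret = -EINVAL;
		break;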
>
>>> You can then also ask userspace about the max # of queues
>>> to expect if you want to save some memory.
>>>
>> Yes, good suggestion.
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = -EBUSY;
>>>> -	if (tun->tfile)
>>>> +	if (likely(skb_rx_queue_recorded(skb))) {
>>>> +		rxq = skb_get_rx_queue(skb);
>>>> +
>>>> +		while (unlikely(rxq>= numqueues))
>>>> +			rxq -= numqueues;
>>>> +
>>>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>>>   		goto out;
>>>> +	}
>>>>
>>>> -	err = 0;
>>>> -	tfile->tun = tun;
>>>> -	tun->tfile = tfile;
>>>> -	netif_carrier_on(tun->dev);
>>>> -	dev_hold(tun->dev);
>>>> -	sock_hold(&tfile->sk);
>>>> -	atomic_inc(&tfile->count);
>>>> +	/* Check if we can use flow to select a queue */
>>>> +	rxq = skb_get_rxhash(skb);
>>>> +	if (rxq) {
>>>> +		u32 idx = ((u64)rxq * numqueues)>>   32;
>>> This completely confuses me. What's the logic here?
>>> How do we even know it's in range?
>>>
>> rxq is a u32, so the result should be less than numqueues.
> Aha. So the point is to use multiply+shift instead of %?
> Please add a comment.
>

Yes sure.
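
(For reference, why the result is always in range: rxq is a u32, so
rxq < 2^32; hence (u64)rxq * numqueues < numqueues << 32, and after the
right shift by 32 the value is strictly below numqueues. A comment along
these lines would do:)

	/* scale the 32-bit hash onto [0, numqueues) without a modulo:
	 * rxq < 2^32  =>  ((u64)rxq * numqueues) >> 32  <  numqueues */
	u32 idx = ((u64)rxq * numqueues) >> 32;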
>>>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>>>> +		goto out;
>>>> +	}
>>>>
>>>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>>>   out:
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -	return err;
>>>> +	return tfile;
>>>>   }
>>>>
>>>> -static void __tun_detach(struct tun_struct *tun)
>>>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -	/* Detach from net device */
>>>> -	netif_tx_lock_bh(tun->dev);
>>>> -	netif_carrier_off(tun->dev);
>>>> -	tun->tfile = NULL;
>>>> -	netif_tx_unlock_bh(tun->dev);
>>>> -
>>>> -	/* Drop read queue */
>>>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>>> -
>>>> -	/* Drop the extra count on the net device */
>>>> -	dev_put(tun->dev);
>>>> -}
>>>> +	struct tun_struct *tun;
>>>> +	struct net_device *dev = NULL;
>>>> +	bool destroy = false;
>>>>
>>>> -static void tun_detach(struct tun_struct *tun)
>>>> -{
>>>> -	rtnl_lock();
>>>> -	__tun_detach(tun);
>>>> -	rtnl_unlock();
>>>> -}
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>>>> -{
>>>> -	struct tun_struct *tun = NULL;
>>>> +	tun = rcu_dereference_protected(tfile->tun,
>>>> +					lockdep_is_held(&tun_lock));
>>>> +	if (tun) {
>>>> +		u16 index = tfile->queue_index;
>>>> +		BUG_ON(index>= tun->numqueues);
>>>> +		dev = tun->dev;
>>>> +
>>>> +		rcu_assign_pointer(tun->tfiles[index],
>>>> +				   tun->tfiles[tun->numqueues - 1]);
>>>> +		tun->tfiles[index]->queue_index = index;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +		sock_put(&tfile->sk);
>>>>
>>>> -	if (atomic_inc_not_zero(&tfile->count))
>>>> -		tun = tfile->tun;
>>>> +		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
>>>> +			destroy = true;
>>> Please don't use flags like that. Use dedicated labels and goto there on error.
>> ok.
>>>> +	}
>>>>
>>>> -	return tun;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	if (clean)
>>>> +		sock_put(&tfile->sk);
>>>> +
>>>> +	if (destroy) {
>>>> +		rtnl_lock();
>>>> +		if (dev->reg_state == NETREG_REGISTERED)
>>>> +			unregister_netdevice(dev);
>>>> +		rtnl_unlock();
>>>> +	}
>>>> +
>>>> +	return 0;
>>>>   }
>>>>
>>>> -static struct tun_struct *tun_get(struct file *file)
>>>> +static void tun_detach_all(struct net_device *dev)
>>>>   {
>>>> -	return __tun_get(file->private_data);
>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>>>> +	int i, j = 0;
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>> +
>>>> +	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
>>>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>>>> +						lockdep_is_held(&tun_lock));
>>>> +		BUG_ON(!tfile);
>>>> +		wake_up_all(&tfile->wq.wait);
>>>> +		tfile_list[j++] = tfile;
>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>> +		--tun->numqueues;
>>>> +	}
>>>> +	BUG_ON(tun->numqueues != 0);
>>>> +	/* guarantee that any future tun_attach will fail */
>>>> +	tun->numqueues = MAX_TAP_QUEUES;
>>>> +	spin_unlock(&tun_lock);
>>>> +
>>>> +	synchronize_rcu();
>>>> +	for (--j; j>= 0; j--)
>>>> +		sock_put(&tfile_list[j]->sk);
>>>>   }
>>>>
>>>> -static void tun_put(struct tun_struct *tun)
>>>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>   {
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = file->private_data;
>>>> +	int err;
>>>> +
>>>> +	ASSERT_RTNL();
>>>> +
>>>> +	spin_lock(&tun_lock);
>>>>
>>>> -	if (atomic_dec_and_test(&tfile->count))
>>>> -		tun_detach(tfile->tun);
>>>> +	err = -EINVAL;
>>>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>>>> +		goto out;
>>>> +
>>>> +	err = -EBUSY;
>>>> +	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
>>>> +		goto out;
>>>> +
>>>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>>>> +		goto out;
>>>> +
>>>> +	err = 0;
>>>> +	tfile->queue_index = tun->numqueues;
>>>> +	rcu_assign_pointer(tfile->tun, tun);
>>>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>> +	sock_hold(&tfile->sk);
>>>> +	tun->numqueues++;
>>>> +
>>>> +	if (tun->numqueues == 1)
>>>> +		netif_carrier_on(tun->dev);
>>>> +
>>>> +	/* device is allowed to go away first, so no need to hold extra
>>>> +	 * refcnt. */
>>>> +
>>>> +out:
>>>> +	spin_unlock(&tun_lock);
>>>> +	return err;
>>>>   }
>>>>
>>>>   /* TAP filtering */
>>>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>>>   /* Net device detach from fd. */
>>>>   static void tun_net_uninit(struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> -
>>>> -	/* Inform the methods they need to stop using the dev.
>>>> -	 */
>>>> -	if (tfile) {
>>>> -		wake_up_all(&tfile->wq.wait);
>>>> -		if (atomic_dec_and_test(&tfile->count))
>>>> -			__tun_detach(tun);
>>>> -	}
>>>> +	tun_detach_all(dev);
>>>>   }
>>>>
>>>>   /* Net device open. */
>>>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>>>   /* Net device start xmit */
>>>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   {
>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>> -	struct tun_file *tfile = tun->tfile;
>>>> +	struct tun_file *tfile = NULL;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>>> +	rcu_read_lock();
>>>> +	tfile = tun_get_queue(dev, skb);
>>>>
>>>>   	/* Drop packet if interface is not attached */
>>>>   	if (!tfile)
>>>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>
>>>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>>>   	>= dev->tx_queue_len) {
>>>> -		if (!(tun->flags&   TUN_ONE_QUEUE)) {
>>>> +		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
>>> Which patch moved flags from tun to tfile?
>> Patch 1 caches tun->flags in tfile, but it seems this may let the
>> flags get out of sync. So we'd better use the one in tun_struct.
>>>> +		    !(tfile->flags&   TUN_TAP_MQ)) {
>>>>   			/* Normal queueing mode. */
>>>>   			/* Packet scheduler handles dropping of further packets. */
>>>>   			netif_stop_queue(dev);
>>>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   			 * error is more appropriate. */
>>>>   			dev->stats.tx_fifo_errors++;
>>>>   		} else {
>>>> -			/* Single queue mode.
>>>> +			/* Single queue mode or multi queue mode.
>>>>   			 * Driver handles dropping of all packets itself. */
>>> Please don't do this. Stop the queue on overrun as appropriate.
>>> ONE_QUEUE is a legacy hack.
>>>
>>> BTW we really should stop queue before we start dropping packets,
>>> but that can be a separate patch.
>> The problem here is the use of NETIF_F_LLTX. The kernel can only see
>> one queue even for a multiqueue tun/tap. If we used
>> netif_stop_queue(), all the other queues would be stopped as well.
> Another reason not to use LLTX?

Yes.
>>>>   			goto drop;
>>>>   		}
>>>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>>>   				   POLLRDNORM | POLLRDBAND);
>>>> +	rcu_read_unlock();
>>>>   	return NETDEV_TX_OK;
>>>>
>>>>   drop:
>>>> +	rcu_read_unlock();
>>>>   	dev->stats.tx_dropped++;
>>>>   	kfree_skb(skb);
>>>>   	return NETDEV_TX_OK;
>>>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun = __tun_get(tfile);
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct sock *sk;
>>>>   	unsigned int mask = 0;
>>>>
>>>> -	if (!tun)
>>>> +	if (!tfile)
>>>>   		return POLLERR;
>>>>
>>>> -	sk = tfile->socket.sk;
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>> +		return POLLERR;
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>>> +	sk =&tfile->sk;
>>>>
>>>>   	poll_wait(file,&tfile->wq.wait, wait);
>>>>
>>>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>   	     sock_writeable(sk)))
>>>>   		mask |= POLLOUT | POLLWRNORM;
>>>>
>>>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>>>   		mask = POLLERR;
>>>> +	rcu_read_unlock();
>>>>
>>>> -	tun_put(tun);
>>>>   	return mask;
>>>>   }
>>>>
>>>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb_shinfo(skb)->gso_segs = 0;
>>>>   	}
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	switch (tfile->flags&   TUN_TYPE_MASK) {
>>>>   	case TUN_TUN_DEV:
>>>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>>>   		break;
>>>>   	}
>>>> -
>>>> -	netif_rx_ni(skb);
>>>>   	tun->dev->stats.rx_packets++;
>>>>   	tun->dev->stats.rx_bytes += len;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	netif_rx_ni(skb);
>>>> +
>>>>   	return count;
>>>>
>>>>   err_free:
>>>>   	count = -EINVAL;
>>>>   	kfree_skb(skb);
>>>>   err:
>>>> -	tun = __tun_get(tfile);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return -EBADFD;
>>>> +	}
>>>>
>>>>   	if (drop)
>>>>   		tun->dev->stats.rx_dropped++;
>>>>   	if (error)
>>>>   		tun->dev->stats.rx_frame_errors++;
>>>> -	tun_put(tun);
>>>> +	rcu_read_unlock();
>>>>   	return count;
>>>>   }
>>>>
>>>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>>>   	total += skb->len;
>>>>
>>>> -	tun = __tun_get(tfile);
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (tun) {
>>>>   		tun->dev->stats.tx_packets++;
>>>>   		tun->dev->stats.tx_bytes += len;
>>>> -		tun_put(tun);
>>>>   	}
>>>> +	rcu_read_unlock();
>>>>
>>>>   	return total;
>>>>   }
>>>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>>>   				break;
>>>>   			}
>>>>
>>>> -			tun = __tun_get(tfile);
>>>> +			rcu_read_lock();
>>>> +			tun = rcu_dereference(tfile->tun);
>>>>   			if (!tun) {
>>>> -				ret = -EIO;
>>>> +				ret = -EBADFD;
>>> BADFD is for when you get passed something like -1 fd.
>>> Here fd is OK, it's just in a bad state so you can not do IO.
>>>
>> Sure.
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>>>   				ret = -EIO;
>>>> -				tun_put(tun);
>>>> +				rcu_read_unlock();
>>>>   				break;
>>>>   			}
>>>> -			tun_put(tun);
>>>> +			rcu_read_unlock();
>>>>
>>>>   			/* Nothing to read, let's sleep */
>>>>   			schedule();
>>>>   			continue;
>>>>   		}
>>>>
>>>> -		tun = __tun_get(tfile);
>>>> +		rcu_read_lock();
>>>> +		tun = rcu_dereference(tfile->tun);
>>>>   		if (tun) {
>>>>   			netif_wake_queue(tun->dev);
>>>> -			tun_put(tun);
>>>>   		}
>>>> +		rcu_read_unlock();
>>>>
>>>>   		ret = tun_put_user(tfile, skb, iv, len);
>>>>   		kfree_skb(skb);
>>>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>>>   	if (tun->flags&   TUN_VNET_HDR)
>>>>   		flags |= IFF_VNET_HDR;
>>>>
>>>> +	if (tun->flags&   TUN_TAP_MQ)
>>>> +		flags |= IFF_MULTI_QUEUE;
>>>> +
>>>>   	return flags;
>>>>   }
>>>>
>>>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>>   			return err;
>>>> -	}
>>>> -	else {
>>>> +	} else {
>>>>   		char *name;
>>>>   		unsigned long flags = 0;
>>>>
>>>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>>>   			TUN_USER_FEATURES;
>>>>   		dev->features = dev->hw_features;
>>>> +		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +			dev->features |= NETIF_F_LLTX;
>>>>
>>>>   		err = register_netdevice(tun->dev);
>>>>   		if (err<   0)
>>>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   		err = tun_attach(tun, file);
>>>>   		if (err<   0)
>>>> -			goto failed;
>>>> +			goto err_free_dev;
>>>>   	}
>>>>
>>>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>>>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>   	else
>>>>   		tun->flags&= ~TUN_VNET_HDR;
>>>>
>>>> +	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
>>>> +		tun->flags |= TUN_TAP_MQ;
>>>> +	else
>>>> +		tun->flags&= ~TUN_TAP_MQ;
>>>> +
>>>>   	/* Cache flags from tun device */
>>>>   	tfile->flags = tun->flags;
>>>>   	/* Make sure persistent devices do not get stuck in
>>>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>
>>>>   err_free_dev:
>>>>   	free_netdev(dev);
>>>> -failed:
>>>>   	return err;
>>>>   }
>>>>
>>>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   				(unsigned int __user*)argp);
>>>>   	}
>>>>
>>>> -	rtnl_lock();
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (cmd == TUNSETIFF&&   !tun) {
>>>> +	ret = 0;
>>>> +	if (cmd == TUNSETIFF) {
>>>> +		rtnl_lock();
>>>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>>>> -
>>>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>>>> -
>>>> +		rtnl_unlock();
>>>>   		if (ret)
>>>> -			goto unlock;
>>>> -
>>>> +			return ret;
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>> -			ret = -EFAULT;
>>>> -		goto unlock;
>>>> +			return -EFAULT;
>>>> +		return ret;
>>>>   	}
>>>>
>>>> +	rtnl_lock();
>>>> +
>>>> +	rcu_read_lock();
>>>> +
>>>>   	ret = -EBADFD;
>>>> +	tun = rcu_dereference(tfile->tun);
>>>>   	if (!tun)
>>>>   		goto unlock;
>>>> +	else
>>>> +		ret = 0;
>>>>
>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>>>> -
>>>> -	ret = 0;
>>>>   	switch (cmd) {
>>>>   	case TUNGETIFF:
>>>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>>>> +		rcu_read_unlock();
>>>>   		if (ret)
>>>> -			break;
>>>> +			goto out;
>>>>
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case TUNSETNOCSUM:
>>>>   		/* Disable/Enable checksum */
>>>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   		/* Get hw address */
>>>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>>>> +		rcu_read_unlock();
>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>   			ret = -EFAULT;
>>>> -		break;
>>>> +		goto out;
>>>>
>>>>   	case SIOCSIFHWADDR:
>>>>   		/* Set hw address */
>>>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>   	}
>>>>
>>>>   unlock:
>>>> +	rcu_read_unlock();
>>>> +out:
>>>>   	rtnl_unlock();
>>>> -	if (tun)
>>>> -		tun_put(tun);
>>>>   	return ret;
>>>>   }
>>>>
>>>> @@ -1517,6 +1624,11 @@ out:
>>>>   	return ret;
>>>>   }
>>>>
>>>> +static void tun_sock_destruct(struct sock *sk)
>>>> +{
>>>> +	skb_queue_purge(&sk->sk_receive_queue);
>>>> +}
>>>> +
>>>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   {
>>>>   	struct net *net = current->nsproxy->net_ns;
>>>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>>>
>>>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>>>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>>>   	file->private_data = tfile;
>>>>
>>>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>>>   {
>>>>   	struct tun_file *tfile = file->private_data;
>>>> -	struct tun_struct *tun;
>>>> -
>>>> -	tun = __tun_get(tfile);
>>>> -	if (tun) {
>>>> -		struct net_device *dev = tun->dev;
>>>> -
>>>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>>>> -
>>>> -		__tun_detach(tun);
>>>> -
>>>> -		/* If desirable, unregister the netdevice. */
>>>> -		if (!(tun->flags&   TUN_PERSIST)) {
>>>> -			rtnl_lock();
>>>> -			if (dev->reg_state == NETREG_REGISTERED)
>>>> -				unregister_netdevice(dev);
>>>> -			rtnl_unlock();
>>>> -		}
>>>>
>>>> -		/* drop the reference that netdevice holds */
>>>> -		sock_put(&tfile->sk);
>>>> -
>>>> -	}
>>>> -
>>>> -	/* drop the reference that file holds */
>>>> -	sock_put(&tfile->sk);
>>>> +	tun_detach(tfile, true);
>>>>
>>>>   	return 0;
>>>>   }
>>>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>>>    * holding a reference to the file for as long as the socket is in use. */
>>>>   struct socket *tun_get_socket(struct file *file)
>>>>   {
>>>> -	struct tun_struct *tun;
>>>> +	struct tun_struct *tun = NULL;
>>>>   	struct tun_file *tfile = file->private_data;
>>>>   	if (file->f_op !=&tun_fops)
>>>>   		return ERR_PTR(-EINVAL);
>>>> -	tun = tun_get(file);
>>>> -	if (!tun)
>>>> +	rcu_read_lock();
>>>> +	tun = rcu_dereference(tfile->tun);
>>>> +	if (!tun) {
>>>> +		rcu_read_unlock();
>>>>   		return ERR_PTR(-EBADFD);
>>>> -	tun_put(tun);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>>   	return&tfile->socket;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(tun_get_socket);



* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-27  5:59         ` Jason Wang
@ 2012-06-27  8:26           ` Michael S. Tsirkin
  2012-06-28  3:15             ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2012-06-27  8:26 UTC (permalink / raw)
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet

On Wed, Jun 27, 2012 at 01:59:37PM +0800, Jason Wang wrote:
> On 06/26/2012 07:54 PM, Michael S. Tsirkin wrote:
> >On Tue, Jun 26, 2012 at 01:52:57PM +0800, Jason Wang wrote:
> >>On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> >>>On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> >>>>This patch adds multiqueue support for tap device. This is done by abstracting
> >>>>each queue as a file/socket and allowing multiple sockets to be attached to the
> >>>>tuntap device (an array of tun_file were stored in the tun_struct). Userspace
> >>>>could write and read from those files to do the parallel packet
> >>>>sending/receiving.
> >>>>
> >>>>Unlike the previous single queue implementation, the socket and device were
> >>>>loosely coupled, each of them were allowed to go away first. In order to let the
> >>>>tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
> >>>>synchronize between data path and system call.
> >>>Don't use LLTX/RCU. It's not worth it.
> >>>Use something like netif_set_real_num_tx_queues.
> >>>
> >>For LLTX, maybe it's better to convert it to alloc_netdev_mq() to
> >>let the kernel see all queues and make queue stopping and
> >>per-queue stats easier.
> >>RCU is used to handle attaching/detaching while tun/tap is
> >>sending and receiving packets, which looks reasonable to me.
> >Yes but do we have to allow this? How about we always ask
> >userspace to attach to all active queues?
> 
> Attaching/detaching is a method to activate/deactivate a queue. If all
> queues were kept attached, then we would need another method or flag to
> mark a queue as activated/deactivated, and we would still need to
> synchronize with the data path.

This is what I am trying to say: use an interface flag for
multiqueue. When it is set, activate all attached queues.
When unset, deactivate all queues except the default one.
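
Roughly, a minimal sketch of what I mean (hypothetical; nr_active and
tun_set_multiqueue are invented names, not part of this patch):

	/* All queues stay attached for the lifetime of the device;
	 * the MQ flag only decides how many of them are active, so
	 * the data path never races with a disappearing queue.
	 * nr_active is a made-up field, purely for illustration.
	 */
	static void tun_set_multiqueue(struct tun_struct *tun, bool enable)
	{
		tun->nr_active = enable ? tun->numqueues : 1;
		netif_set_real_num_tx_queues(tun->dev, tun->nr_active);
	}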


> >>Not
> >>sure netif_set_real_num_tx_queues() can help in this situation.
> >Check it out.
> >
> >>>>The tx queue selecting is first based on the recorded rxq index of an skb; if
> >>>>there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
> >>>>
> >>>>Signed-off-by: Jason Wang<jasowang@redhat.com>
> >>>Interestingly macvtap switched to hashing first:
> >>>ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> >>>(the commit log is corrupted but see what it
> >>>does in the patch).
> >>>Any idea why?
> >>>
> >>>>---
> >>>>  drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
> >>>>  1 files changed, 232 insertions(+), 139 deletions(-)
> >>>>
> >>>>diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>>>index 8233b0a..5c26757 100644
> >>>>--- a/drivers/net/tun.c
> >>>>+++ b/drivers/net/tun.c
> >>>>@@ -107,6 +107,8 @@ struct tap_filter {
> >>>>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
> >>>>  };
> >>>>
> >>>>+#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
> >>>Why the limit? I am guessing you copied this from macvtap?
> >>>This is problematic for a number of reasons:
> >>>	- will not play well with migration
> >>>	- will not work well for a large guest
> >>>
> >>>Yes, macvtap needs to be fixed too.
> >>>
> >>>I am guessing what it is trying to prevent is queueing
> >>>up a huge number of packets?
> >>>So just divide the default tx queue limit by the # of queues.
> >>Not sure;
> >>other reasons I can guess:
> >>- to prevent storing a large array of pointers in tun_struct or macvlan_dev.
> >OK so with the limit of e.g. 1024 we'd allocate at most
> >2 pages of memory. This doesn't look too bad. 1024 is probably a
> >high enough limit: modern hypervisors seem to support on the order
> >of 100-200 CPUs so this leaves us some breathing space
> >if we want to match a queue per guest CPU.
> >Of course we need to limit the packets per queue
> >in such a setup more aggressively. 1000 packets * 1000 queues
> >* 64K per packet is too much.
> >
> >>- it may not be suitable to allow the number of virtqueues to be
> >>greater than the number of physical queues in the card
> >Maybe for macvtap; here we have no idea which card we
> >are working with or how many queues it has.
> >
> >>>And by the way, for MQ applications maybe we can finally
> >>>ignore tx queue altogether and limit the total number
> >>>of bytes queued?
> >>>To avoid regressions we can make it large like 64M/# queues.
> >>>Could be a separate patch I think, and for a single queue
> >>>might need a compatible mode though I am not sure.
> >>Could you explain more about this?
> >>Did you mean to have a total
> >>sndbuf for all sockets that are attached to tun/tap?
> >Consider that we currently limit the # of
> >packets queued at tun for xmit to userspace.
> >Some limit is needed but # of packets sounds
> >very silly - limiting the total memory
> >might be more reasonable.
> >
> >In case of multiqueue, we really care about
> >total # of packets or total memory, but a simple
> >approximation could be to divide the allocation
> >between active queues equally.
> 
> A possible method is to divide TUN_READQ_SIZE by #queues, but
> make it at least equal to the vring size (256).

I would not enforce any limit actually.
Simply divide by # of queues, and
fail if userspace tries to attach > queue size packets.

With 1000 queues this is a 64 Mbyte worst case as is.
If someone wants to allow userspace to consume 256 times
as much, that is 16 Gbytes per single device; let the
user tweak the tx queue len.
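
As a concrete sketch of the division (hypothetical and untested, just
to illustrate the idea; tun/tfile are the names used in this patch):

	/* Share the device tx queue length equally between the
	 * currently attached queues instead of giving every queue
	 * the full tx_queue_len on its own.
	 */
	u32 limit = dev->tx_queue_len / max_t(u32, tun->numqueues, 1);

	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) >= limit)
		goto drop;	/* or stop the queue, as discussed above */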



> >
> >qdisc also queues some packets; that logic is
> >using # of packets anyway. So either make that
> >1000/# queues, or even set to 0 as Eric once
> >suggested.
> >
> >>>>+
> >>>>  struct tun_file {
> >>>>  	struct sock sk;
> >>>>  	struct socket socket;
> >>>>@@ -114,16 +116,18 @@ struct tun_file {
> >>>>  	int vnet_hdr_sz;
> >>>>  	struct tap_filter txflt;
> >>>>  	atomic_t count;
> >>>>-	struct tun_struct *tun;
> >>>>+	struct tun_struct __rcu *tun;
> >>>>  	struct net *net;
> >>>>  	struct fasync_struct *fasync;
> >>>>  	unsigned int flags;
> >>>>+	u16 queue_index;
> >>>>  };
> >>>>
> >>>>  struct tun_sock;
> >>>>
> >>>>  struct tun_struct {
> >>>>-	struct tun_file		*tfile;
> >>>>+	struct tun_file		*tfiles[MAX_TAP_QUEUES];
> >>>>+	unsigned int            numqueues;
> >>>>  	unsigned int 		flags;
> >>>>  	uid_t			owner;
> >>>>  	gid_t			group;
> >>>>@@ -138,80 +142,159 @@ struct tun_struct {
> >>>>  #endif
> >>>>  };
> >>>>
> >>>>-static int tun_attach(struct tun_struct *tun, struct file *file)
> >>>>+static DEFINE_SPINLOCK(tun_lock);
> >>>>+
> >>>>+/*
> >>>>+ * tun_get_queue(): calculate the queue index
> >>>>+ *     - if skbs comes from mq nics, we can just borrow
> >>>>+ *     - if not, calculate from the hash
> >>>>+ */
> >>>>+static struct tun_file *tun_get_queue(struct net_device *dev,
> >>>>+				      struct sk_buff *skb)
> >>>>  {
> >>>>-	struct tun_file *tfile = file->private_data;
> >>>>-	int err;
> >>>>+	struct tun_struct *tun = netdev_priv(dev);
> >>>>+	struct tun_file *tfile = NULL;
> >>>>+	int numqueues = tun->numqueues;
> >>>>+	__u32 rxq;
> >>>>
> >>>>-	ASSERT_RTNL();
> >>>>+	BUG_ON(!rcu_read_lock_held());
> >>>>
> >>>>-	netif_tx_lock_bh(tun->dev);
> >>>>+	if (!numqueues)
> >>>>+		goto out;
> >>>>
> >>>>-	err = -EINVAL;
> >>>>-	if (tfile->tun)
> >>>>+	if (numqueues == 1) {
> >>>>+		tfile = rcu_dereference(tun->tfiles[0]);
> >>>Instead of hacks like this, you can ask for an MQ
> >>>flag to be set in SETIFF. Then you won't need to
> >>>handle attach/detach at random times.
> >>Consider a user switching between a sq guest and an mq guest; qemu
> >>would attach or detach the fds, which could not be expected by the kernel.
> >Can't userspace keep it attached always, just deactivate MQ?
> >
> >>>And most of the scary num_queues checks can go away.
> >>Even if we have an MQ flag, userspace could still just attach one
> >>queue to the device.
> >I think we allow too much flexibility if we let
> >userspace detach a random queue.
> 
> The point is to let tun/tap have the same flexibility as macvtap.
> Macvtap allows adding/deleting queues at any time, and it's very easy
> to add detach/attach to macvtap. So we can easily use almost the same
> ioctls to activate/deactivate a queue at any time for both tap and
> macvtap.

Yes but userspace does not do this in practice:
it decides how many queues and just activates them all.

> >Maybe only allow attaching/detaching with MQ off?
> >If userspace wants to attach/detach, clear MQ first?
> 
> Maybe I didn't understand the point here, but I don't see any
> advantages except more ioctl() calls.

Way simpler to implement.

> >Alternatively, attach/detach all queues in one ioctl?
> 
> Yes, it can be the same one.
> >
> >>>You can then also ask userspace about the max # of queues
> >>>to expect if you want to save some memory.
> >>>
> >>Yes, good suggestion.
> >>>>  		goto out;
> >>>>+	}
> >>>>
> >>>>-	err = -EBUSY;
> >>>>-	if (tun->tfile)
> >>>>+	if (likely(skb_rx_queue_recorded(skb))) {
> >>>>+		rxq = skb_get_rx_queue(skb);
> >>>>+
> >>>>+		while (unlikely(rxq>= numqueues))
> >>>>+			rxq -= numqueues;
> >>>>+
> >>>>+		tfile = rcu_dereference(tun->tfiles[rxq]);
> >>>>  		goto out;
> >>>>+	}
> >>>>
> >>>>-	err = 0;
> >>>>-	tfile->tun = tun;
> >>>>-	tun->tfile = tfile;
> >>>>-	netif_carrier_on(tun->dev);
> >>>>-	dev_hold(tun->dev);
> >>>>-	sock_hold(&tfile->sk);
> >>>>-	atomic_inc(&tfile->count);
> >>>>+	/* Check if we can use flow to select a queue */
> >>>>+	rxq = skb_get_rxhash(skb);
> >>>>+	if (rxq) {
> >>>>+		u32 idx = ((u64)rxq * numqueues)>>   32;
> >>>This completely confuses me. What's the logic here?
> >>>How do we even know it's in range?
> >>>
> >>rxq is a u32, so the result should be less than numqueues.
> >Aha. So the point is to use multiply+shift instead of %?
> >Please add a comment.
> >
> 
> Yes sure.

Not just about this trick, but generally explaining why we use
rxhash for transmit.
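
For reference, a self-contained illustration of the multiply+shift
trick being discussed (not the patch itself):

	/* Map a 32-bit hash to [0, numqueues) without a division:
	 * since hash < 2^32, ((u64)hash * numqueues) >> 32 is always
	 * strictly less than numqueues, and it spreads values about
	 * as evenly as hash % numqueues while being cheaper on most
	 * CPUs.
	 */
	static inline u32 hash_to_index(u32 hash, u32 numqueues)
	{
		return (u32)(((u64)hash * numqueues) >> 32);
	}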

> >>>>+		tfile = rcu_dereference(tun->tfiles[idx]);
> >>>>+		goto out;
> >>>>+	}
> >>>>
> >>>>+	tfile = rcu_dereference(tun->tfiles[0]);
> >>>>  out:
> >>>>-	netif_tx_unlock_bh(tun->dev);
> >>>>-	return err;
> >>>>+	return tfile;
> >>>>  }
> >>>>
> >>>>-static void __tun_detach(struct tun_struct *tun)
> >>>>+static int tun_detach(struct tun_file *tfile, bool clean)
> >>>>  {
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>-	/* Detach from net device */
> >>>>-	netif_tx_lock_bh(tun->dev);
> >>>>-	netif_carrier_off(tun->dev);
> >>>>-	tun->tfile = NULL;
> >>>>-	netif_tx_unlock_bh(tun->dev);
> >>>>-
> >>>>-	/* Drop read queue */
> >>>>-	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
> >>>>-
> >>>>-	/* Drop the extra count on the net device */
> >>>>-	dev_put(tun->dev);
> >>>>-}
> >>>>+	struct tun_struct *tun;
> >>>>+	struct net_device *dev = NULL;
> >>>>+	bool destroy = false;
> >>>>
> >>>>-static void tun_detach(struct tun_struct *tun)
> >>>>-{
> >>>>-	rtnl_lock();
> >>>>-	__tun_detach(tun);
> >>>>-	rtnl_unlock();
> >>>>-}
> >>>>+	spin_lock(&tun_lock);
> >>>>
> >>>>-static struct tun_struct *__tun_get(struct tun_file *tfile)
> >>>>-{
> >>>>-	struct tun_struct *tun = NULL;
> >>>>+	tun = rcu_dereference_protected(tfile->tun,
> >>>>+					lockdep_is_held(&tun_lock));
> >>>>+	if (tun) {
> >>>>+		u16 index = tfile->queue_index;
> >>>>+		BUG_ON(index>= tun->numqueues);
> >>>>+		dev = tun->dev;
> >>>>+
> >>>>+		rcu_assign_pointer(tun->tfiles[index],
> >>>>+				   tun->tfiles[tun->numqueues - 1]);
> >>>>+		tun->tfiles[index]->queue_index = index;
> >>>>+		rcu_assign_pointer(tfile->tun, NULL);
> >>>>+		--tun->numqueues;
> >>>>+		sock_put(&tfile->sk);
> >>>>
> >>>>-	if (atomic_inc_not_zero(&tfile->count))
> >>>>-		tun = tfile->tun;
> >>>>+		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
> >>>>+			destroy = true;
> >>>Please don't use flags like that. Use dedicated labels and goto there on error.
> >>ok.
> >>>>+	}
> >>>>
> >>>>-	return tun;
> >>>>+	spin_unlock(&tun_lock);
> >>>>+
> >>>>+	synchronize_rcu();
> >>>>+	if (clean)
> >>>>+		sock_put(&tfile->sk);
> >>>>+
> >>>>+	if (destroy) {
> >>>>+		rtnl_lock();
> >>>>+		if (dev->reg_state == NETREG_REGISTERED)
> >>>>+			unregister_netdevice(dev);
> >>>>+		rtnl_unlock();
> >>>>+	}
> >>>>+
> >>>>+	return 0;
> >>>>  }
> >>>>
> >>>>-static struct tun_struct *tun_get(struct file *file)
> >>>>+static void tun_detach_all(struct net_device *dev)
> >>>>  {
> >>>>-	return __tun_get(file->private_data);
> >>>>+	struct tun_struct *tun = netdev_priv(dev);
> >>>>+	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> >>>>+	int i, j = 0;
> >>>>+
> >>>>+	spin_lock(&tun_lock);
> >>>>+
> >>>>+	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
> >>>>+		tfile = rcu_dereference_protected(tun->tfiles[i],
> >>>>+						lockdep_is_held(&tun_lock));
> >>>>+		BUG_ON(!tfile);
> >>>>+		wake_up_all(&tfile->wq.wait);
> >>>>+		tfile_list[j++] = tfile;
> >>>>+		rcu_assign_pointer(tfile->tun, NULL);
> >>>>+		--tun->numqueues;
> >>>>+	}
> >>>>+	BUG_ON(tun->numqueues != 0);
> >>>>+	/* guarantee that any future tun_attach will fail */
> >>>>+	tun->numqueues = MAX_TAP_QUEUES;
> >>>>+	spin_unlock(&tun_lock);
> >>>>+
> >>>>+	synchronize_rcu();
> >>>>+	for (--j; j>= 0; j--)
> >>>>+		sock_put(&tfile_list[j]->sk);
> >>>>  }
> >>>>
> >>>>-static void tun_put(struct tun_struct *tun)
> >>>>+static int tun_attach(struct tun_struct *tun, struct file *file)
> >>>>  {
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>+	struct tun_file *tfile = file->private_data;
> >>>>+	int err;
> >>>>+
> >>>>+	ASSERT_RTNL();
> >>>>+
> >>>>+	spin_lock(&tun_lock);
> >>>>
> >>>>-	if (atomic_dec_and_test(&tfile->count))
> >>>>-		tun_detach(tfile->tun);
> >>>>+	err = -EINVAL;
> >>>>+	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
> >>>>+		goto out;
> >>>>+
> >>>>+	err = -EBUSY;
> >>>>+	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
> >>>>+		goto out;
> >>>>+
> >>>>+	if (tun->numqueues == MAX_TAP_QUEUES)
> >>>>+		goto out;
> >>>>+
> >>>>+	err = 0;
> >>>>+	tfile->queue_index = tun->numqueues;
> >>>>+	rcu_assign_pointer(tfile->tun, tun);
> >>>>+	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >>>>+	sock_hold(&tfile->sk);
> >>>>+	tun->numqueues++;
> >>>>+
> >>>>+	if (tun->numqueues == 1)
> >>>>+		netif_carrier_on(tun->dev);
> >>>>+
> >>>>+	/* device is allowed to go away first, so no need to hold extra
> >>>>+	 * refcnt. */
> >>>>+
> >>>>+out:
> >>>>+	spin_unlock(&tun_lock);
> >>>>+	return err;
> >>>>  }
> >>>>
> >>>>  /* TAP filtering */
> >>>>@@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
> >>>>  /* Net device detach from fd. */
> >>>>  static void tun_net_uninit(struct net_device *dev)
> >>>>  {
> >>>>-	struct tun_struct *tun = netdev_priv(dev);
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>-
> >>>>-	/* Inform the methods they need to stop using the dev.
> >>>>-	 */
> >>>>-	if (tfile) {
> >>>>-		wake_up_all(&tfile->wq.wait);
> >>>>-		if (atomic_dec_and_test(&tfile->count))
> >>>>-			__tun_detach(tun);
> >>>>-	}
> >>>>+	tun_detach_all(dev);
> >>>>  }
> >>>>
> >>>>  /* Net device open. */
> >>>>@@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
> >>>>  /* Net device start xmit */
> >>>>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  {
> >>>>-	struct tun_struct *tun = netdev_priv(dev);
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>+	struct tun_file *tfile = NULL;
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
> >>>>+	rcu_read_lock();
> >>>>+	tfile = tun_get_queue(dev, skb);
> >>>>
> >>>>  	/* Drop packet if interface is not attached */
> >>>>  	if (!tfile)
> >>>>@@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>
> >>>>  	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
> >>>>  	>= dev->tx_queue_len) {
> >>>>-		if (!(tun->flags&   TUN_ONE_QUEUE)) {
> >>>>+		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
> >>>Which patch moved flags from tun to tfile?
> >>Patch 1 cache the tun->flags in tfile, but it seems this may let the
> >>flags out of sync. So we'd better to use the one in tun_struct.
> >>>>+		    !(tfile->flags&   TUN_TAP_MQ)) {
> >>>>  			/* Normal queueing mode. */
> >>>>  			/* Packet scheduler handles dropping of further packets. */
> >>>>  			netif_stop_queue(dev);
> >>>>@@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  			 * error is more appropriate. */
> >>>>  			dev->stats.tx_fifo_errors++;
> >>>>  		} else {
> >>>>-			/* Single queue mode.
> >>>>+			/* Single queue mode or multi queue mode.
> >>>>  			 * Driver handles dropping of all packets itself. */
> >>>Please don't do this. Stop the queue on overrun as appropriate.
> >>>ONE_QUEUE is a legacy hack.
> >>>
> >>>BTW we really should stop queue before we start dropping packets,
> >>>but that can be a separate patch.
> >>The problem here is the using of NETIF_F_LLTX. Kernel could only see
> >>one queue even for a multiqueue tun/tap. If we use
> >>netif_stop_queue(), all other queues would be stopped also.
> >Another reason not to use LLTX?
> 
> Yes.
> >>>>  			goto drop;
> >>>>  		}
> >>>>@@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> >>>>  	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
> >>>>  				   POLLRDNORM | POLLRDBAND);
> >>>>+	rcu_read_unlock();
> >>>>  	return NETDEV_TX_OK;
> >>>>
> >>>>  drop:
> >>>>+	rcu_read_unlock();
> >>>>  	dev->stats.tx_dropped++;
> >>>>  	kfree_skb(skb);
> >>>>  	return NETDEV_TX_OK;
> >>>>@@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
> >>>>  static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>>>  {
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>-	struct tun_struct *tun = __tun_get(tfile);
> >>>>+	struct tun_struct *tun = NULL;
> >>>>  	struct sock *sk;
> >>>>  	unsigned int mask = 0;
> >>>>
> >>>>-	if (!tun)
> >>>>+	if (!tfile)
> >>>>  		return POLLERR;
> >>>>
> >>>>-	sk = tfile->socket.sk;
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>+		return POLLERR;
> >>>>+	}
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
> >>>>+	sk =&tfile->sk;
> >>>>
> >>>>  	poll_wait(file,&tfile->wq.wait, wait);
> >>>>
> >>>>@@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>>>  	     sock_writeable(sk)))
> >>>>  		mask |= POLLOUT | POLLWRNORM;
> >>>>
> >>>>-	if (tun->dev->reg_state != NETREG_REGISTERED)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
> >>>>  		mask = POLLERR;
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>-	tun_put(tun);
> >>>>  	return mask;
> >>>>  }
> >>>>
> >>>>@@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>>>  		skb_shinfo(skb)->gso_segs = 0;
> >>>>  	}
> >>>>
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return -EBADFD;
> >>>>+	}
> >>>>
> >>>>  	switch (tfile->flags&   TUN_TYPE_MASK) {
> >>>>  	case TUN_TUN_DEV:
> >>>>@@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>>>  		skb->protocol = eth_type_trans(skb, tun->dev);
> >>>>  		break;
> >>>>  	}
> >>>>-
> >>>>-	netif_rx_ni(skb);
> >>>>  	tun->dev->stats.rx_packets++;
> >>>>  	tun->dev->stats.rx_bytes += len;
> >>>>-	tun_put(tun);
> >>>>+	rcu_read_unlock();
> >>>>+
> >>>>+	netif_rx_ni(skb);
> >>>>+
> >>>>  	return count;
> >>>>
> >>>>  err_free:
> >>>>  	count = -EINVAL;
> >>>>  	kfree_skb(skb);
> >>>>  err:
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return -EBADFD;
> >>>>+	}
> >>>>
> >>>>  	if (drop)
> >>>>  		tun->dev->stats.rx_dropped++;
> >>>>  	if (error)
> >>>>  		tun->dev->stats.rx_frame_errors++;
> >>>>-	tun_put(tun);
> >>>>+	rcu_read_unlock();
> >>>>  	return count;
> >>>>  }
> >>>>
> >>>>@@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
> >>>>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> >>>>  	total += skb->len;
> >>>>
> >>>>-	tun = __tun_get(tfile);
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>  	if (tun) {
> >>>>  		tun->dev->stats.tx_packets++;
> >>>>  		tun->dev->stats.tx_bytes += len;
> >>>>-		tun_put(tun);
> >>>>  	}
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>  	return total;
> >>>>  }
> >>>>@@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
> >>>>  				break;
> >>>>  			}
> >>>>
> >>>>-			tun = __tun_get(tfile);
> >>>>+			rcu_read_lock();
> >>>>+			tun = rcu_dereference(tfile->tun);
> >>>>  			if (!tun) {
> >>>>-				ret = -EIO;
> >>>>+				ret = -EBADFD;
> >>>BADFD is for when you get passed something like -1 fd.
> >>>Here fd is OK, it's just in a bad state so you can not do IO.
> >>>
> >>Sure.
> >>>>+				rcu_read_unlock();
> >>>>  				break;
> >>>>  			}
> >>>>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
> >>>>  				ret = -EIO;
> >>>>-				tun_put(tun);
> >>>>+				rcu_read_unlock();
> >>>>  				break;
> >>>>  			}
> >>>>-			tun_put(tun);
> >>>>+			rcu_read_unlock();
> >>>>
> >>>>  			/* Nothing to read, let's sleep */
> >>>>  			schedule();
> >>>>  			continue;
> >>>>  		}
> >>>>
> >>>>-		tun = __tun_get(tfile);
> >>>>+		rcu_read_lock();
> >>>>+		tun = rcu_dereference(tfile->tun);
> >>>>  		if (tun) {
> >>>>  			netif_wake_queue(tun->dev);
> >>>>-			tun_put(tun);
> >>>>  		}
> >>>>+		rcu_read_unlock();
> >>>>
> >>>>  		ret = tun_put_user(tfile, skb, iv, len);
> >>>>  		kfree_skb(skb);
> >>>>@@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
> >>>>  	if (tun->flags&   TUN_VNET_HDR)
> >>>>  		flags |= IFF_VNET_HDR;
> >>>>
> >>>>+	if (tun->flags&   TUN_TAP_MQ)
> >>>>+		flags |= IFF_MULTI_QUEUE;
> >>>>+
> >>>>  	return flags;
> >>>>  }
> >>>>
> >>>>@@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  		err = tun_attach(tun, file);
> >>>>  		if (err<   0)
> >>>>  			return err;
> >>>>-	}
> >>>>-	else {
> >>>>+	} else {
> >>>>  		char *name;
> >>>>  		unsigned long flags = 0;
> >>>>
> >>>>@@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> >>>>  			TUN_USER_FEATURES;
> >>>>  		dev->features = dev->hw_features;
> >>>>+		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
> >>>>+			dev->features |= NETIF_F_LLTX;
> >>>>
> >>>>  		err = register_netdevice(tun->dev);
> >>>>  		if (err<   0)
> >>>>@@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>
> >>>>  		err = tun_attach(tun, file);
> >>>>  		if (err<   0)
> >>>>-			goto failed;
> >>>>+			goto err_free_dev;
> >>>>  	}
> >>>>
> >>>>  	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
> >>>>@@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  	else
> >>>>  		tun->flags&= ~TUN_VNET_HDR;
> >>>>
> >>>>+	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
> >>>>+		tun->flags |= TUN_TAP_MQ;
> >>>>+	else
> >>>>+		tun->flags&= ~TUN_TAP_MQ;
> >>>>+
> >>>>  	/* Cache flags from tun device */
> >>>>  	tfile->flags = tun->flags;
> >>>>  	/* Make sure persistent devices do not get stuck in
> >>>>@@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>
> >>>>  err_free_dev:
> >>>>  	free_netdev(dev);
> >>>>-failed:
> >>>>  	return err;
> >>>>  }
> >>>>
> >>>>@@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  				(unsigned int __user*)argp);
> >>>>  	}
> >>>>
> >>>>-	rtnl_lock();
> >>>>-
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (cmd == TUNSETIFF&&   !tun) {
> >>>>+	ret = 0;
> >>>>+	if (cmd == TUNSETIFF) {
> >>>>+		rtnl_lock();
> >>>>  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> >>>>-
> >>>>  		ret = tun_set_iff(tfile->net, file,&ifr);
> >>>>-
> >>>>+		rtnl_unlock();
> >>>>  		if (ret)
> >>>>-			goto unlock;
> >>>>-
> >>>>+			return ret;
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>-			ret = -EFAULT;
> >>>>-		goto unlock;
> >>>>+			return -EFAULT;
> >>>>+		return ret;
> >>>>  	}
> >>>>
> >>>>+	rtnl_lock();
> >>>>+
> >>>>+	rcu_read_lock();
> >>>>+
> >>>>  	ret = -EBADFD;
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>  	if (!tun)
> >>>>  		goto unlock;
> >>>>+	else
> >>>>+		ret = 0;
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
> >>>>-
> >>>>-	ret = 0;
> >>>>  	switch (cmd) {
> >>>>  	case TUNGETIFF:
> >>>>  		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
> >>>>+		rcu_read_unlock();
> >>>>  		if (ret)
> >>>>-			break;
> >>>>+			goto out;
> >>>>
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>  			ret = -EFAULT;
> >>>>-		break;
> >>>>+		goto out;
> >>>>
> >>>>  	case TUNSETNOCSUM:
> >>>>  		/* Disable/Enable checksum */
> >>>>@@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  		/* Get hw address */
> >>>>  		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
> >>>>  		ifr.ifr_hwaddr.sa_family = tun->dev->type;
> >>>>+		rcu_read_unlock();
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>  			ret = -EFAULT;
> >>>>-		break;
> >>>>+		goto out;
> >>>>
> >>>>  	case SIOCSIFHWADDR:
> >>>>  		/* Set hw address */
> >>>>@@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  	}
> >>>>
> >>>>  unlock:
> >>>>+	rcu_read_unlock();
> >>>>+out:
> >>>>  	rtnl_unlock();
> >>>>-	if (tun)
> >>>>-		tun_put(tun);
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>>@@ -1517,6 +1624,11 @@ out:
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>>+static void tun_sock_destruct(struct sock *sk)
> >>>>+{
> >>>>+	skb_queue_purge(&sk->sk_receive_queue);
> >>>>+}
> >>>>+
> >>>>  static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  {
> >>>>  	struct net *net = current->nsproxy->net_ns;
> >>>>@@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  	sock_init_data(&tfile->socket,&tfile->sk);
> >>>>
> >>>>  	tfile->sk.sk_write_space = tun_sock_write_space;
> >>>>+	tfile->sk.sk_destruct = tun_sock_destruct;
> >>>>  	tfile->sk.sk_sndbuf = INT_MAX;
> >>>>  	file->private_data = tfile;
> >>>>
> >>>>@@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  static int tun_chr_close(struct inode *inode, struct file *file)
> >>>>  {
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>-	struct tun_struct *tun;
> >>>>-
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (tun) {
> >>>>-		struct net_device *dev = tun->dev;
> >>>>-
> >>>>-		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
> >>>>-
> >>>>-		__tun_detach(tun);
> >>>>-
> >>>>-		/* If desirable, unregister the netdevice. */
> >>>>-		if (!(tun->flags&   TUN_PERSIST)) {
> >>>>-			rtnl_lock();
> >>>>-			if (dev->reg_state == NETREG_REGISTERED)
> >>>>-				unregister_netdevice(dev);
> >>>>-			rtnl_unlock();
> >>>>-		}
> >>>>
> >>>>-		/* drop the reference that netdevice holds */
> >>>>-		sock_put(&tfile->sk);
> >>>>-
> >>>>-	}
> >>>>-
> >>>>-	/* drop the reference that file holds */
> >>>>-	sock_put(&tfile->sk);
> >>>>+	tun_detach(tfile, true);
> >>>>
> >>>>  	return 0;
> >>>>  }
> >>>>@@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
> >>>>   * holding a reference to the file for as long as the socket is in use. */
> >>>>  struct socket *tun_get_socket(struct file *file)
> >>>>  {
> >>>>-	struct tun_struct *tun;
> >>>>+	struct tun_struct *tun = NULL;
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>  	if (file->f_op !=&tun_fops)
> >>>>  		return ERR_PTR(-EINVAL);
> >>>>-	tun = tun_get(file);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return ERR_PTR(-EBADFD);
> >>>>-	tun_put(tun);
> >>>>+	}
> >>>>+	rcu_read_unlock();
> >>>>  	return&tfile->socket;
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(tun_get_socket);


* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-27  5:16         ` Jason Wang
@ 2012-06-27  8:44           ` Michael S. Tsirkin
  2012-06-28  3:02             ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Michael S. Tsirkin @ 2012-06-27  8:44 UTC (permalink / raw)
  To: Jason Wang
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On Wed, Jun 27, 2012 at 01:16:30PM +0800, Jason Wang wrote:
> On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
> >On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
> >>On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
> >>>On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
> >>>>This patch adds multiqueue support for tap device. This is done by abstracting
> >>>>each queue as a file/socket and allowing multiple sockets to be attached to the
> >>>>tuntap device (an array of tun_file were stored in the tun_struct). Userspace
> >>>>could write and read from those files to do the parallel packet
> >>>>sending/receiving.
> >>>>
> >>>>Unlike the previous single queue implementation, the socket and device were
> >>>>loosely coupled, each of them were allowed to go away first. In order to let the
> >>>>tx path lockless, netif_tx_lock_bh() is replaced by RCU/NETIF_F_LLTX to
> >>>>synchronize between data path and system call.
> >>>Don't use LLTX/RCU. It's not worth it.
> >>>Use something like netif_set_real_num_tx_queues.
> >>>
> >>>>The tx queue selecting is first based on the recorded rxq index of an skb; if
> >>>>there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
> >>>>
> >>>>Signed-off-by: Jason Wang<jasowang@redhat.com>
> >>>Interestingly macvtap switched to hashing first:
> >>>ef0002b577b52941fb147128f30bd1ecfdd3ff6d
> >>>(the commit log is corrupted but see what it
> >>>does in the patch).
> >>>Any idea why?
> >>Yes, so tap should be changed to behave the same as macvtap. I remember
> >>the reason we did that is to make sure the packets of a single flow
> >>are queued to a fixed socket/virtqueue, since 10g cards like ixgbe
> >>choose the rx queue for a flow based on the last tx queue where the
> >>packets of that flow came from. So if we used the recorded rx queue
> >>in macvtap, the queue index of a flow would change as the vhost
> >>thread moves among processors.
> >Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might land
> >on VCPU1 in the guest, which is not good, right?
> 
> Yes, but better than having the rx move between vcpus when we use
> the recorded rx queue.

Why isn't this a problem with native TCP?
I think what happens is one of the following:
- moving between CPUs is more expensive with tun
  because it can queue so much data on xmit
- scheduler makes very bad decisions about VCPUs
  bouncing them around all the time

Could we isolate which it is? Does the problem
still happen if you pin VCPUs to host cpus?
If not, it's the queue depth.

> Flow steering is needed to make sure the tx and
> rx happen on the same vcpu.

That involves IPIs between processes, so it might be
very expensive for kvm.

> >>But while testing tun/tap, one interesting thing I found is that
> >>even though ixgbe has recorded the queue index during rx, it seems
> >>to be lost when tap tries to transmit skbs to userspace.
> >dev_pick_tx does this I think but ndo_select_queue
> >should be able to get it without trouble.
> >
> >
> >>>>---
> >>>>  drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
> >>>>  1 files changed, 232 insertions(+), 139 deletions(-)
> >>>>
> >>>>diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>>>index 8233b0a..5c26757 100644
> >>>>--- a/drivers/net/tun.c
> >>>>+++ b/drivers/net/tun.c
> >>>>@@ -107,6 +107,8 @@ struct tap_filter {
> >>>>  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
> >>>>  };
> >>>>
> >>>>+#define MAX_TAP_QUEUES (NR_CPUS<   16 ? NR_CPUS : 16)
> >>>Why the limit? I am guessing you copied this from macvtap?
> >>>This is problematic for a number of reasons:
> >>>	- will not play well with migration
> >>>	- will not work well for a large guest
> >>>
> >>>Yes, macvtap needs to be fixed too.
> >>>
> >>>I am guessing what it is trying to prevent is queueing
> >>>up a huge number of packets?
> >>>So just divide the default tx queue limit by the # of queues.
> >>>
> >>>And by the way, for MQ applications maybe we can finally
> >>>ignore tx queue altogether and limit the total number
> >>>of bytes queued?
> >>>To avoid regressions we can make it large like 64M/# queues.
> >>>Could be a separate patch I think, and for a single queue
> >>>might need a compatible mode though I am not sure.
> >>>
> >>>>+
> >>>>  struct tun_file {
> >>>>  	struct sock sk;
> >>>>  	struct socket socket;
> >>>>@@ -114,16 +116,18 @@ struct tun_file {
> >>>>  	int vnet_hdr_sz;
> >>>>  	struct tap_filter txflt;
> >>>>  	atomic_t count;
> >>>>-	struct tun_struct *tun;
> >>>>+	struct tun_struct __rcu *tun;
> >>>>  	struct net *net;
> >>>>  	struct fasync_struct *fasync;
> >>>>  	unsigned int flags;
> >>>>+	u16 queue_index;
> >>>>  };
> >>>>
> >>>>  struct tun_sock;
> >>>>
> >>>>  struct tun_struct {
> >>>>-	struct tun_file		*tfile;
> >>>>+	struct tun_file		*tfiles[MAX_TAP_QUEUES];
> >>>>+	unsigned int            numqueues;
> >>>>  	unsigned int 		flags;
> >>>>  	uid_t			owner;
> >>>>  	gid_t			group;
> >>>>@@ -138,80 +142,159 @@ struct tun_struct {
> >>>>  #endif
> >>>>  };
> >>>>
> >>>>-static int tun_attach(struct tun_struct *tun, struct file *file)
> >>>>+static DEFINE_SPINLOCK(tun_lock);
> >>>>+
> >>>>+/*
> >>>>+ * tun_get_queue(): calculate the queue index
> >>>>+ *     - if skbs comes from mq nics, we can just borrow
> >>>>+ *     - if not, calculate from the hash
> >>>>+ */
> >>>>+static struct tun_file *tun_get_queue(struct net_device *dev,
> >>>>+				      struct sk_buff *skb)
> >>>>  {
> >>>>-	struct tun_file *tfile = file->private_data;
> >>>>-	int err;
> >>>>+	struct tun_struct *tun = netdev_priv(dev);
> >>>>+	struct tun_file *tfile = NULL;
> >>>>+	int numqueues = tun->numqueues;
> >>>>+	__u32 rxq;
> >>>>
> >>>>-	ASSERT_RTNL();
> >>>>+	BUG_ON(!rcu_read_lock_held());
> >>>>
> >>>>-	netif_tx_lock_bh(tun->dev);
> >>>>+	if (!numqueues)
> >>>>+		goto out;
> >>>>
> >>>>-	err = -EINVAL;
> >>>>-	if (tfile->tun)
> >>>>+	if (numqueues == 1) {
> >>>>+		tfile = rcu_dereference(tun->tfiles[0]);
> >>>Instead of hacks like this, you can ask for an MQ
> >>>flag to be set in SETIFF. Then you won't need to
> >>>handle attach/detach at random times.
> >>>And most of the scary num_queues checks can go away.
> >>>You can then also ask userspace about the max # of queues
> >>>to expect if you want to save some memory.
> >>>
> >>>
> >>>>  		goto out;
> >>>>+	}
> >>>>
> >>>>-	err = -EBUSY;
> >>>>-	if (tun->tfile)
> >>>>+	if (likely(skb_rx_queue_recorded(skb))) {
> >>>>+		rxq = skb_get_rx_queue(skb);
> >>>>+
> >>>>+		while (unlikely(rxq>= numqueues))
> >>>>+			rxq -= numqueues;
> >>>>+
> >>>>+		tfile = rcu_dereference(tun->tfiles[rxq]);
> >>>>  		goto out;
> >>>>+	}
> >>>>
> >>>>-	err = 0;
> >>>>-	tfile->tun = tun;
> >>>>-	tun->tfile = tfile;
> >>>>-	netif_carrier_on(tun->dev);
> >>>>-	dev_hold(tun->dev);
> >>>>-	sock_hold(&tfile->sk);
> >>>>-	atomic_inc(&tfile->count);
> >>>>+	/* Check if we can use flow to select a queue */
> >>>>+	rxq = skb_get_rxhash(skb);
> >>>>+	if (rxq) {
> >>>>+		u32 idx = ((u64)rxq * numqueues)>>   32;
> >>>This completely confuses me. What's the logic here?
> >>>How do we even know it's in range?
> >>>
> >>>>+		tfile = rcu_dereference(tun->tfiles[idx]);
> >>>>+		goto out;
> >>>>+	}
> >>>>
> >>>>+	tfile = rcu_dereference(tun->tfiles[0]);
> >>>>  out:
> >>>>-	netif_tx_unlock_bh(tun->dev);
> >>>>-	return err;
> >>>>+	return tfile;
> >>>>  }
> >>>>
> >>>>-static void __tun_detach(struct tun_struct *tun)
> >>>>+static int tun_detach(struct tun_file *tfile, bool clean)
> >>>>  {
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>-	/* Detach from net device */
> >>>>-	netif_tx_lock_bh(tun->dev);
> >>>>-	netif_carrier_off(tun->dev);
> >>>>-	tun->tfile = NULL;
> >>>>-	netif_tx_unlock_bh(tun->dev);
> >>>>-
> >>>>-	/* Drop read queue */
> >>>>-	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
> >>>>-
> >>>>-	/* Drop the extra count on the net device */
> >>>>-	dev_put(tun->dev);
> >>>>-}
> >>>>+	struct tun_struct *tun;
> >>>>+	struct net_device *dev = NULL;
> >>>>+	bool destroy = false;
> >>>>
> >>>>-static void tun_detach(struct tun_struct *tun)
> >>>>-{
> >>>>-	rtnl_lock();
> >>>>-	__tun_detach(tun);
> >>>>-	rtnl_unlock();
> >>>>-}
> >>>>+	spin_lock(&tun_lock);
> >>>>
> >>>>-static struct tun_struct *__tun_get(struct tun_file *tfile)
> >>>>-{
> >>>>-	struct tun_struct *tun = NULL;
> >>>>+	tun = rcu_dereference_protected(tfile->tun,
> >>>>+					lockdep_is_held(&tun_lock));
> >>>>+	if (tun) {
> >>>>+		u16 index = tfile->queue_index;
> >>>>+		BUG_ON(index>= tun->numqueues);
> >>>>+		dev = tun->dev;
> >>>>+
> >>>>+		rcu_assign_pointer(tun->tfiles[index],
> >>>>+				   tun->tfiles[tun->numqueues - 1]);
> >>>>+		tun->tfiles[index]->queue_index = index;
> >>>>+		rcu_assign_pointer(tfile->tun, NULL);
> >>>>+		--tun->numqueues;
> >>>>+		sock_put(&tfile->sk);
> >>>>
> >>>>-	if (atomic_inc_not_zero(&tfile->count))
> >>>>-		tun = tfile->tun;
> >>>>+		if (tun->numqueues == 0&&   !(tun->flags&   TUN_PERSIST))
> >>>>+			destroy = true;
> >>>Please don't use flags like that. Use dedicated labels and goto there on error.
> >>>
> >>>
> >>>>+	}
> >>>>
> >>>>-	return tun;
> >>>>+	spin_unlock(&tun_lock);
> >>>>+
> >>>>+	synchronize_rcu();
> >>>>+	if (clean)
> >>>>+		sock_put(&tfile->sk);
> >>>>+
> >>>>+	if (destroy) {
> >>>>+		rtnl_lock();
> >>>>+		if (dev->reg_state == NETREG_REGISTERED)
> >>>>+			unregister_netdevice(dev);
> >>>>+		rtnl_unlock();
> >>>>+	}
> >>>>+
> >>>>+	return 0;
> >>>>  }
> >>>>
> >>>>-static struct tun_struct *tun_get(struct file *file)
> >>>>+static void tun_detach_all(struct net_device *dev)
> >>>>  {
> >>>>-	return __tun_get(file->private_data);
> >>>>+	struct tun_struct *tun = netdev_priv(dev);
> >>>>+	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
> >>>>+	int i, j = 0;
> >>>>+
> >>>>+	spin_lock(&tun_lock);
> >>>>+
> >>>>+	for (i = 0; i<   MAX_TAP_QUEUES&&   tun->numqueues; i++) {
> >>>>+		tfile = rcu_dereference_protected(tun->tfiles[i],
> >>>>+						lockdep_is_held(&tun_lock));
> >>>>+		BUG_ON(!tfile);
> >>>>+		wake_up_all(&tfile->wq.wait);
> >>>>+		tfile_list[j++] = tfile;
> >>>>+		rcu_assign_pointer(tfile->tun, NULL);
> >>>>+		--tun->numqueues;
> >>>>+	}
> >>>>+	BUG_ON(tun->numqueues != 0);
> >>>>+	/* guarantee that any future tun_attach will fail */
> >>>>+	tun->numqueues = MAX_TAP_QUEUES;
> >>>>+	spin_unlock(&tun_lock);
> >>>>+
> >>>>+	synchronize_rcu();
> >>>>+	for (--j; j>= 0; j--)
> >>>>+		sock_put(&tfile_list[j]->sk);
> >>>>  }
> >>>>
> >>>>-static void tun_put(struct tun_struct *tun)
> >>>>+static int tun_attach(struct tun_struct *tun, struct file *file)
> >>>>  {
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>+	struct tun_file *tfile = file->private_data;
> >>>>+	int err;
> >>>>+
> >>>>+	ASSERT_RTNL();
> >>>>+
> >>>>+	spin_lock(&tun_lock);
> >>>>
> >>>>-	if (atomic_dec_and_test(&tfile->count))
> >>>>-		tun_detach(tfile->tun);
> >>>>+	err = -EINVAL;
> >>>>+	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
> >>>>+		goto out;
> >>>>+
> >>>>+	err = -EBUSY;
> >>>>+	if (!(tun->flags&   TUN_TAP_MQ)&&   tun->numqueues == 1)
> >>>>+		goto out;
> >>>>+
> >>>>+	if (tun->numqueues == MAX_TAP_QUEUES)
> >>>>+		goto out;
> >>>>+
> >>>>+	err = 0;
> >>>>+	tfile->queue_index = tun->numqueues;
> >>>>+	rcu_assign_pointer(tfile->tun, tun);
> >>>>+	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >>>>+	sock_hold(&tfile->sk);
> >>>>+	tun->numqueues++;
> >>>>+
> >>>>+	if (tun->numqueues == 1)
> >>>>+		netif_carrier_on(tun->dev);
> >>>>+
> >>>>+	/* device is allowed to go away first, so no need to hold extra
> >>>>+	 * refcnt. */
> >>>>+
> >>>>+out:
> >>>>+	spin_unlock(&tun_lock);
> >>>>+	return err;
> >>>>  }
> >>>>
> >>>>  /* TAP filtering */
> >>>>@@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
> >>>>  /* Net device detach from fd. */
> >>>>  static void tun_net_uninit(struct net_device *dev)
> >>>>  {
> >>>>-	struct tun_struct *tun = netdev_priv(dev);
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>-
> >>>>-	/* Inform the methods they need to stop using the dev.
> >>>>-	 */
> >>>>-	if (tfile) {
> >>>>-		wake_up_all(&tfile->wq.wait);
> >>>>-		if (atomic_dec_and_test(&tfile->count))
> >>>>-			__tun_detach(tun);
> >>>>-	}
> >>>>+	tun_detach_all(dev);
> >>>>  }
> >>>>
> >>>>  /* Net device open. */
> >>>>@@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
> >>>>  /* Net device start xmit */
> >>>>  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  {
> >>>>-	struct tun_struct *tun = netdev_priv(dev);
> >>>>-	struct tun_file *tfile = tun->tfile;
> >>>>+	struct tun_file *tfile = NULL;
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
> >>>>+	rcu_read_lock();
> >>>>+	tfile = tun_get_queue(dev, skb);
> >>>>
> >>>>  	/* Drop packet if interface is not attached */
> >>>>  	if (!tfile)
> >>>>@@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>
> >>>>  	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
> >>>>  	>= dev->tx_queue_len) {
> >>>>-		if (!(tun->flags&   TUN_ONE_QUEUE)) {
> >>>>+		if (!(tfile->flags&   TUN_ONE_QUEUE)&&
> >>>Which patch moved flags from tun to tfile?
> >>>
> >>>>+		    !(tfile->flags&   TUN_TAP_MQ)) {
> >>>>  			/* Normal queueing mode. */
> >>>>  			/* Packet scheduler handles dropping of further packets. */
> >>>>  			netif_stop_queue(dev);
> >>>>@@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  			 * error is more appropriate. */
> >>>>  			dev->stats.tx_fifo_errors++;
> >>>>  		} else {
> >>>>-			/* Single queue mode.
> >>>>+			/* Single queue mode or multi queue mode.
> >>>>  			 * Driver handles dropping of all packets itself. */
> >>>Please don't do this. Stop the queue on overrun as appropriate.
> >>>ONE_QUEUE is a legacy hack.
> >>>
> >>>BTW we really should stop queue before we start dropping packets,
> >>>but that can be a separate patch.
> >>>
> >>>>  			goto drop;
> >>>>  		}
> >>>>@@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> >>>>  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
> >>>>  	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
> >>>>  				   POLLRDNORM | POLLRDBAND);
> >>>>+	rcu_read_unlock();
> >>>>  	return NETDEV_TX_OK;
> >>>>
> >>>>  drop:
> >>>>+	rcu_read_unlock();
> >>>>  	dev->stats.tx_dropped++;
> >>>>  	kfree_skb(skb);
> >>>>  	return NETDEV_TX_OK;
> >>>>@@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
> >>>>  static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>>>  {
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>-	struct tun_struct *tun = __tun_get(tfile);
> >>>>+	struct tun_struct *tun = NULL;
> >>>>  	struct sock *sk;
> >>>>  	unsigned int mask = 0;
> >>>>
> >>>>-	if (!tun)
> >>>>+	if (!tfile)
> >>>>  		return POLLERR;
> >>>>
> >>>>-	sk = tfile->socket.sk;
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>+		return POLLERR;
> >>>>+	}
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
> >>>>+	sk =&tfile->sk;
> >>>>
> >>>>  	poll_wait(file,&tfile->wq.wait, wait);
> >>>>
> >>>>@@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
> >>>>  	     sock_writeable(sk)))
> >>>>  		mask |= POLLOUT | POLLWRNORM;
> >>>>
> >>>>-	if (tun->dev->reg_state != NETREG_REGISTERED)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
> >>>>  		mask = POLLERR;
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>-	tun_put(tun);
> >>>>  	return mask;
> >>>>  }
> >>>>
> >>>>@@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>>>  		skb_shinfo(skb)->gso_segs = 0;
> >>>>  	}
> >>>>
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return -EBADFD;
> >>>>+	}
> >>>>
> >>>>  	switch (tfile->flags&   TUN_TYPE_MASK) {
> >>>>  	case TUN_TUN_DEV:
> >>>>@@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
> >>>>  		skb->protocol = eth_type_trans(skb, tun->dev);
> >>>>  		break;
> >>>>  	}
> >>>>-
> >>>>-	netif_rx_ni(skb);
> >>>>  	tun->dev->stats.rx_packets++;
> >>>>  	tun->dev->stats.rx_bytes += len;
> >>>>-	tun_put(tun);
> >>>>+	rcu_read_unlock();
> >>>>+
> >>>>+	netif_rx_ni(skb);
> >>>>+
> >>>>  	return count;
> >>>>
> >>>>  err_free:
> >>>>  	count = -EINVAL;
> >>>>  	kfree_skb(skb);
> >>>>  err:
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return -EBADFD;
> >>>>+	}
> >>>>
> >>>>  	if (drop)
> >>>>  		tun->dev->stats.rx_dropped++;
> >>>>  	if (error)
> >>>>  		tun->dev->stats.rx_frame_errors++;
> >>>>-	tun_put(tun);
> >>>>+	rcu_read_unlock();
> >>>>  	return count;
> >>>>  }
> >>>>
> >>>>@@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
> >>>>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> >>>>  	total += skb->len;
> >>>>
> >>>>-	tun = __tun_get(tfile);
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>  	if (tun) {
> >>>>  		tun->dev->stats.tx_packets++;
> >>>>  		tun->dev->stats.tx_bytes += len;
> >>>>-		tun_put(tun);
> >>>>  	}
> >>>>+	rcu_read_unlock();
> >>>>
> >>>>  	return total;
> >>>>  }
> >>>>@@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
> >>>>  				break;
> >>>>  			}
> >>>>
> >>>>-			tun = __tun_get(tfile);
> >>>>+			rcu_read_lock();
> >>>>+			tun = rcu_dereference(tfile->tun);
> >>>>  			if (!tun) {
> >>>>-				ret = -EIO;
> >>>>+				ret = -EBADFD;
> >>>BADFD is for when you get passed something like -1 fd.
> >>>Here fd is OK, it's just in a bad state so you can not do IO.
> >>>
> >>>
> >>>>+				rcu_read_unlock();
> >>>>  				break;
> >>>>  			}
> >>>>  			if (tun->dev->reg_state != NETREG_REGISTERED) {
> >>>>  				ret = -EIO;
> >>>>-				tun_put(tun);
> >>>>+				rcu_read_unlock();
> >>>>  				break;
> >>>>  			}
> >>>>-			tun_put(tun);
> >>>>+			rcu_read_unlock();
> >>>>
> >>>>  			/* Nothing to read, let's sleep */
> >>>>  			schedule();
> >>>>  			continue;
> >>>>  		}
> >>>>
> >>>>-		tun = __tun_get(tfile);
> >>>>+		rcu_read_lock();
> >>>>+		tun = rcu_dereference(tfile->tun);
> >>>>  		if (tun) {
> >>>>  			netif_wake_queue(tun->dev);
> >>>>-			tun_put(tun);
> >>>>  		}
> >>>>+		rcu_read_unlock();
> >>>>
> >>>>  		ret = tun_put_user(tfile, skb, iv, len);
> >>>>  		kfree_skb(skb);
> >>>>@@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
> >>>>  	if (tun->flags&   TUN_VNET_HDR)
> >>>>  		flags |= IFF_VNET_HDR;
> >>>>
> >>>>+	if (tun->flags&   TUN_TAP_MQ)
> >>>>+		flags |= IFF_MULTI_QUEUE;
> >>>>+
> >>>>  	return flags;
> >>>>  }
> >>>>
> >>>>@@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  		err = tun_attach(tun, file);
> >>>>  		if (err<   0)
> >>>>  			return err;
> >>>>-	}
> >>>>-	else {
> >>>>+	} else {
> >>>>  		char *name;
> >>>>  		unsigned long flags = 0;
> >>>>
> >>>>@@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> >>>>  			TUN_USER_FEATURES;
> >>>>  		dev->features = dev->hw_features;
> >>>>+		if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
> >>>>+			dev->features |= NETIF_F_LLTX;
> >>>>
> >>>>  		err = register_netdevice(tun->dev);
> >>>>  		if (err<   0)
> >>>>@@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>
> >>>>  		err = tun_attach(tun, file);
> >>>>  		if (err<   0)
> >>>>-			goto failed;
> >>>>+			goto err_free_dev;
> >>>>  	}
> >>>>
> >>>>  	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
> >>>>@@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>  	else
> >>>>  		tun->flags&= ~TUN_VNET_HDR;
> >>>>
> >>>>+	if (ifr->ifr_flags&   IFF_MULTI_QUEUE)
> >>>>+		tun->flags |= TUN_TAP_MQ;
> >>>>+	else
> >>>>+		tun->flags&= ~TUN_TAP_MQ;
> >>>>+
> >>>>  	/* Cache flags from tun device */
> >>>>  	tfile->flags = tun->flags;
> >>>>  	/* Make sure persistent devices do not get stuck in
> >>>>@@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >>>>
> >>>>  err_free_dev:
> >>>>  	free_netdev(dev);
> >>>>-failed:
> >>>>  	return err;
> >>>>  }
> >>>>
> >>>>@@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  				(unsigned int __user*)argp);
> >>>>  	}
> >>>>
> >>>>-	rtnl_lock();
> >>>>-
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (cmd == TUNSETIFF&&   !tun) {
> >>>>+	ret = 0;
> >>>>+	if (cmd == TUNSETIFF) {
> >>>>+		rtnl_lock();
> >>>>  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
> >>>>-
> >>>>  		ret = tun_set_iff(tfile->net, file,&ifr);
> >>>>-
> >>>>+		rtnl_unlock();
> >>>>  		if (ret)
> >>>>-			goto unlock;
> >>>>-
> >>>>+			return ret;
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>-			ret = -EFAULT;
> >>>>-		goto unlock;
> >>>>+			return -EFAULT;
> >>>>+		return ret;
> >>>>  	}
> >>>>
> >>>>+	rtnl_lock();
> >>>>+
> >>>>+	rcu_read_lock();
> >>>>+
> >>>>  	ret = -EBADFD;
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>  	if (!tun)
> >>>>  		goto unlock;
> >>>>+	else
> >>>>+		ret = 0;
> >>>>
> >>>>-	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
> >>>>-
> >>>>-	ret = 0;
> >>>>  	switch (cmd) {
> >>>>  	case TUNGETIFF:
> >>>>  		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
> >>>>+		rcu_read_unlock();
> >>>>  		if (ret)
> >>>>-			break;
> >>>>+			goto out;
> >>>>
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>  			ret = -EFAULT;
> >>>>-		break;
> >>>>+		goto out;
> >>>>
> >>>>  	case TUNSETNOCSUM:
> >>>>  		/* Disable/Enable checksum */
> >>>>@@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  		/* Get hw address */
> >>>>  		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
> >>>>  		ifr.ifr_hwaddr.sa_family = tun->dev->type;
> >>>>+		rcu_read_unlock();
> >>>>  		if (copy_to_user(argp,&ifr, ifreq_len))
> >>>>  			ret = -EFAULT;
> >>>>-		break;
> >>>>+		goto out;
> >>>>
> >>>>  	case SIOCSIFHWADDR:
> >>>>  		/* Set hw address */
> >>>>@@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> >>>>  	}
> >>>>
> >>>>  unlock:
> >>>>+	rcu_read_unlock();
> >>>>+out:
> >>>>  	rtnl_unlock();
> >>>>-	if (tun)
> >>>>-		tun_put(tun);
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>>@@ -1517,6 +1624,11 @@ out:
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>>+static void tun_sock_destruct(struct sock *sk)
> >>>>+{
> >>>>+	skb_queue_purge(&sk->sk_receive_queue);
> >>>>+}
> >>>>+
> >>>>  static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  {
> >>>>  	struct net *net = current->nsproxy->net_ns;
> >>>>@@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  	sock_init_data(&tfile->socket,&tfile->sk);
> >>>>
> >>>>  	tfile->sk.sk_write_space = tun_sock_write_space;
> >>>>+	tfile->sk.sk_destruct = tun_sock_destruct;
> >>>>  	tfile->sk.sk_sndbuf = INT_MAX;
> >>>>  	file->private_data = tfile;
> >>>>
> >>>>@@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> >>>>  static int tun_chr_close(struct inode *inode, struct file *file)
> >>>>  {
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>-	struct tun_struct *tun;
> >>>>-
> >>>>-	tun = __tun_get(tfile);
> >>>>-	if (tun) {
> >>>>-		struct net_device *dev = tun->dev;
> >>>>-
> >>>>-		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
> >>>>-
> >>>>-		__tun_detach(tun);
> >>>>-
> >>>>-		/* If desirable, unregister the netdevice. */
> >>>>-		if (!(tun->flags&   TUN_PERSIST)) {
> >>>>-			rtnl_lock();
> >>>>-			if (dev->reg_state == NETREG_REGISTERED)
> >>>>-				unregister_netdevice(dev);
> >>>>-			rtnl_unlock();
> >>>>-		}
> >>>>
> >>>>-		/* drop the reference that netdevice holds */
> >>>>-		sock_put(&tfile->sk);
> >>>>-
> >>>>-	}
> >>>>-
> >>>>-	/* drop the reference that file holds */
> >>>>-	sock_put(&tfile->sk);
> >>>>+	tun_detach(tfile, true);
> >>>>
> >>>>  	return 0;
> >>>>  }
> >>>>@@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
> >>>>   * holding a reference to the file for as long as the socket is in use. */
> >>>>  struct socket *tun_get_socket(struct file *file)
> >>>>  {
> >>>>-	struct tun_struct *tun;
> >>>>+	struct tun_struct *tun = NULL;
> >>>>  	struct tun_file *tfile = file->private_data;
> >>>>  	if (file->f_op !=&tun_fops)
> >>>>  		return ERR_PTR(-EINVAL);
> >>>>-	tun = tun_get(file);
> >>>>-	if (!tun)
> >>>>+	rcu_read_lock();
> >>>>+	tun = rcu_dereference(tfile->tun);
> >>>>+	if (!tun) {
> >>>>+		rcu_read_unlock();
> >>>>  		return ERR_PTR(-EBADFD);
> >>>>-	tun_put(tun);
> >>>>+	}
> >>>>+	rcu_read_unlock();
> >>>>  	return&tfile->socket;
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(tun_get_socket);

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-27  8:44           ` Michael S. Tsirkin
@ 2012-06-28  3:02             ` Jason Wang
  2012-06-28  4:52               ` Sridhar Samudrala
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Wang @ 2012-06-28  3:02 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle

On 06/27/2012 04:44 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2012 at 01:16:30PM +0800, Jason Wang wrote:
>> On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
>>> On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
>>>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>>>> could write and read from those files to do the parallel packet
>>>>>> sending/receiving.
>>>>>>
>>>>>> Unlike the previous single queue implementation, the socket and device were
>>>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>>>> tx path lockless, netif_tx_loch_bh() is replaced by RCU/NETIF_F_LLTX to
>>>>>> synchronize between data path and system call.
>>>>> Don't use LLTX/RCU. It's not worth it.
>>>>> Use something like netif_set_real_num_tx_queues.
>>>>>
>>>>>> The tx queue selecting is first based on the recorded rxq index of an skb, it
>>>>>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>>>>>
>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>> Interestingly macvtap switched to hashing first:
>>>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>>>> (the commit log is corrupted but see what it
>>>>> does in the patch).
>>>>> Any idea why?
>>>> Yes, so tap should be changed to behave same as macvtap. I remember
>>>> the reason we do that is to make sure the packet of a single flow to
>>>> be queued to a fixed socket/virtqueues. As 10g cards like ixgbe
>>>> choose the rx queue for a flow based on the last tx queue where the
>>>> packets of that flow comes. So if we are using recored rx queue in
>>>> macvtap, the queue index of a flow would change as vhost thread
>>>> moves amongs processors.
>>> Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might land
>>> on VCPU1 in the guest, which is not good, right?
>> Yes, but better than making the rx moves between vcpus when we use
>> recorded rx queue.
> Why isn't this a problem with native TCP?
> I think what happens is one of the following:
> - moving between CPUs is more expensive with tun
>    because it can queue so much data on xmit
> - scheduler makes very bad decisions about VCPUs
>    bouncing them around all the time

For a usual native TCP/host process, since it reads and writes TCP 
sockets, it makes sense to move rx to the processor where the process 
moves. But vhost does not do TCP work, so ixgbe would still move rx 
when the vhost process moves, and we can't even make sure that the 
vhost process handling rx runs on the processor that handles the rx 
interrupt.
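
(To make the trade-off concrete: hashing first, as macvtap does, pins a 
flow to a stable queue no matter which host processor the vhost thread 
happens to run on. A minimal sketch of such a selection, following the 
logic of the quoted patch rather than any final code -- note that the 
multiply+shift also answers the earlier in-range question, since for a 
32-bit hash the product shifted right by 32 is always below numqueues:

static struct tun_file *tun_pick_queue_by_hash(struct tun_struct *tun,
					       struct sk_buff *skb)
{
	u32 rxhash = skb_get_rxhash(skb);
	u32 idx;

	if (!rxhash)
		return rcu_dereference(tun->tfiles[0]);

	/* rxhash < 2^32, so ((u64)rxhash * numqueues) >> 32 < numqueues:
	 * a cheap, evenly distributed mapping into [0, numqueues). */
	idx = ((u64)rxhash * tun->numqueues) >> 32;
	return rcu_dereference(tun->tfiles[idx]);
}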

> Could we isolate which it is? Does the problem
> still happen if you pin VCPUs to host cpus?
> If not it's the queue depth.

It may not help, as tun does not record the vcpu/queue that sent the 
stream, so it can't transmit the packets back to the same vcpu/queue.
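
(One way to close that gap -- purely a sketch, not something in this 
series -- would be a small per-device flow table that remembers which 
queue last transmitted each flow hash, so rx can be steered back to the 
same vcpu/queue. All names and sizes below are hypothetical:

#define TUN_FLOW_SLOTS 256	/* hypothetical, power of two */

struct tun_flow_slot {
	u32 rxhash;		/* flow hash of the last sender */
	u16 queue_index;	/* queue that last sent this flow */
};

/* on the userspace->kernel path (e.g. from tun_get_user()),
 * remember which queue this flow was sent from */
static void tun_flow_note(struct tun_flow_slot *table,
			  struct sk_buff *skb, u16 queue_index)
{
	u32 hash = skb_get_rxhash(skb);

	if (hash) {
		struct tun_flow_slot *s = &table[hash & (TUN_FLOW_SLOTS - 1)];

		s->rxhash = hash;
		s->queue_index = queue_index;
	}
}

/* on the kernel->userspace path, prefer the remembered queue and let
 * the caller fall back to plain hashing on a miss */
static int tun_flow_lookup(struct tun_flow_slot *table,
			   struct sk_buff *skb, unsigned int numqueues)
{
	u32 hash = skb_get_rxhash(skb);
	struct tun_flow_slot *s = &table[hash & (TUN_FLOW_SLOTS - 1)];

	if (hash && s->rxhash == hash && s->queue_index < numqueues)
		return s->queue_index;
	return -1;
}
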
>> Flow steering is needed to make sure the tx and
>> rx on the same vcpu.
> That involves IPI between processes, so it might be
> very expensive for kvm.
>
>>>> But during test tun/tap, one interesting thing I find is that even
>>>> ixgbe has recorded the queue index during rx, it seems be lost when
>>>> tap tries to transmit skbs to userspace.
>>> dev_pick_tx does this I think but ndo_select_queue
>>> should be able to get it without trouble.
>>>
>>>
>>>>>> ---
>>>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>> index 8233b0a..5c26757 100644
>>>>>> --- a/drivers/net/tun.c
>>>>>> +++ b/drivers/net/tun.c
>>>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>>>   };
>>>>>>
>>>>>> +#define MAX_TAP_QUEUES (NR_CPUS<    16 ? NR_CPUS : 16)
>>>>> Why the limit? I am guessing you copied this from macvtap?
>>>>> This is problematic for a number of reasons:
>>>>> 	- will not play well with migration
>>>>> 	- will not work well for a large guest
>>>>>
>>>>> Yes, macvtap needs to be fixed too.
>>>>>
>>>>> I am guessing what it is trying to prevent is queueing
>>>>> up a huge number of packets?
>>>>> So just divide the default tx queue limit by the # of queues.
>>>>>
>>>>> And by the way, for MQ applications maybe we can finally
>>>>> ignore tx queue altogether and limit the total number
>>>>> of bytes queued?
>>>>> To avoid regressions we can make it large like 64M/# queues.
>>>>> Could be a separate patch I think, and for a single queue
>>>>> might need a compatible mode though I am not sure.
>>>>>
>>>>>> +
>>>>>>   struct tun_file {
>>>>>>   	struct sock sk;
>>>>>>   	struct socket socket;
>>>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>>>   	int vnet_hdr_sz;
>>>>>>   	struct tap_filter txflt;
>>>>>>   	atomic_t count;
>>>>>> -	struct tun_struct *tun;
>>>>>> +	struct tun_struct __rcu *tun;
>>>>>>   	struct net *net;
>>>>>>   	struct fasync_struct *fasync;
>>>>>>   	unsigned int flags;
>>>>>> +	u16 queue_index;
>>>>>>   };
>>>>>>
>>>>>>   struct tun_sock;
>>>>>>
>>>>>>   struct tun_struct {
>>>>>> -	struct tun_file		*tfile;
>>>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>>>> +	unsigned int            numqueues;
>>>>>>   	unsigned int 		flags;
>>>>>>   	uid_t			owner;
>>>>>>   	gid_t			group;
>>>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>>>   #endif
>>>>>>   };
>>>>>>
>>>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>>>> +
>>>>>> +/*
>>>>>> + * tun_get_queue(): calculate the queue index
>>>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>>>> + *     - if not, calculate from the hash
>>>>>> + */
>>>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>>>> +				      struct sk_buff *skb)
>>>>>>   {
>>>>>> -	struct tun_file *tfile = file->private_data;
>>>>>> -	int err;
>>>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>>>> +	struct tun_file *tfile = NULL;
>>>>>> +	int numqueues = tun->numqueues;
>>>>>> +	__u32 rxq;
>>>>>>
>>>>>> -	ASSERT_RTNL();
>>>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>>>
>>>>>> -	netif_tx_lock_bh(tun->dev);
>>>>>> +	if (!numqueues)
>>>>>> +		goto out;
>>>>>>
>>>>>> -	err = -EINVAL;
>>>>>> -	if (tfile->tun)
>>>>>> +	if (numqueues == 1) {
>>>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>>>> Instead of hacks like this, you can ask for an MQ
>>>>> flag to be set in SETIFF. Then you won't need to
>>>>> handle attach/detach at random times.
>>>>> And most of the scary num_queues checks can go away.
>>>>> You can then also ask userspace about the max # of queues
>>>>> to expect if you want to save some memory.
>>>>>
>>>>>
>>>>>>   		goto out;
>>>>>> +	}
>>>>>>
>>>>>> -	err = -EBUSY;
>>>>>> -	if (tun->tfile)
>>>>>> +	if (likely(skb_rx_queue_recorded(skb))) {
>>>>>> +		rxq = skb_get_rx_queue(skb);
>>>>>> +
>>>>>> +		while (unlikely(rxq>= numqueues))
>>>>>> +			rxq -= numqueues;
>>>>>> +
>>>>>> +		tfile = rcu_dereference(tun->tfiles[rxq]);
>>>>>>   		goto out;
>>>>>> +	}
>>>>>>
>>>>>> -	err = 0;
>>>>>> -	tfile->tun = tun;
>>>>>> -	tun->tfile = tfile;
>>>>>> -	netif_carrier_on(tun->dev);
>>>>>> -	dev_hold(tun->dev);
>>>>>> -	sock_hold(&tfile->sk);
>>>>>> -	atomic_inc(&tfile->count);
>>>>>> +	/* Check if we can use flow to select a queue */
>>>>>> +	rxq = skb_get_rxhash(skb);
>>>>>> +	if (rxq) {
>>>>>> +		u32 idx = ((u64)rxq * numqueues)>>    32;
>>>>> This completely confuses me. What's the logic here?
>>>>> How do we even know it's in range?
>>>>>
>>>>>> +		tfile = rcu_dereference(tun->tfiles[idx]);
>>>>>> +		goto out;
>>>>>> +	}
>>>>>>
>>>>>> +	tfile = rcu_dereference(tun->tfiles[0]);
>>>>>>   out:
>>>>>> -	netif_tx_unlock_bh(tun->dev);
>>>>>> -	return err;
>>>>>> +	return tfile;
>>>>>>   }
>>>>>>
>>>>>> -static void __tun_detach(struct tun_struct *tun)
>>>>>> +static int tun_detach(struct tun_file *tfile, bool clean)
>>>>>>   {
>>>>>> -	struct tun_file *tfile = tun->tfile;
>>>>>> -	/* Detach from net device */
>>>>>> -	netif_tx_lock_bh(tun->dev);
>>>>>> -	netif_carrier_off(tun->dev);
>>>>>> -	tun->tfile = NULL;
>>>>>> -	netif_tx_unlock_bh(tun->dev);
>>>>>> -
>>>>>> -	/* Drop read queue */
>>>>>> -	skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
>>>>>> -
>>>>>> -	/* Drop the extra count on the net device */
>>>>>> -	dev_put(tun->dev);
>>>>>> -}
>>>>>> +	struct tun_struct *tun;
>>>>>> +	struct net_device *dev = NULL;
>>>>>> +	bool destroy = false;
>>>>>>
>>>>>> -static void tun_detach(struct tun_struct *tun)
>>>>>> -{
>>>>>> -	rtnl_lock();
>>>>>> -	__tun_detach(tun);
>>>>>> -	rtnl_unlock();
>>>>>> -}
>>>>>> +	spin_lock(&tun_lock);
>>>>>>
>>>>>> -static struct tun_struct *__tun_get(struct tun_file *tfile)
>>>>>> -{
>>>>>> -	struct tun_struct *tun = NULL;
>>>>>> +	tun = rcu_dereference_protected(tfile->tun,
>>>>>> +					lockdep_is_held(&tun_lock));
>>>>>> +	if (tun) {
>>>>>> +		u16 index = tfile->queue_index;
>>>>>> +		BUG_ON(index>= tun->numqueues);
>>>>>> +		dev = tun->dev;
>>>>>> +
>>>>>> +		rcu_assign_pointer(tun->tfiles[index],
>>>>>> +				   tun->tfiles[tun->numqueues - 1]);
>>>>>> +		tun->tfiles[index]->queue_index = index;
>>>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>>>> +		--tun->numqueues;
>>>>>> +		sock_put(&tfile->sk);
>>>>>>
>>>>>> -	if (atomic_inc_not_zero(&tfile->count))
>>>>>> -		tun = tfile->tun;
>>>>>> +		if (tun->numqueues == 0&&    !(tun->flags&    TUN_PERSIST))
>>>>>> +			destroy = true;
>>>>> Please don't use flags like that. Use dedicated labels and goto there on error.
>>>>>
>>>>>
>>>>>> +	}
>>>>>>
>>>>>> -	return tun;
>>>>>> +	spin_unlock(&tun_lock);
>>>>>> +
>>>>>> +	synchronize_rcu();
>>>>>> +	if (clean)
>>>>>> +		sock_put(&tfile->sk);
>>>>>> +
>>>>>> +	if (destroy) {
>>>>>> +		rtnl_lock();
>>>>>> +		if (dev->reg_state == NETREG_REGISTERED)
>>>>>> +			unregister_netdevice(dev);
>>>>>> +		rtnl_unlock();
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>>   }
>>>>>>
>>>>>> -static struct tun_struct *tun_get(struct file *file)
>>>>>> +static void tun_detach_all(struct net_device *dev)
>>>>>>   {
>>>>>> -	return __tun_get(file->private_data);
>>>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>>>> +	struct tun_file *tfile, *tfile_list[MAX_TAP_QUEUES];
>>>>>> +	int i, j = 0;
>>>>>> +
>>>>>> +	spin_lock(&tun_lock);
>>>>>> +
>>>>>> +	for (i = 0; i<    MAX_TAP_QUEUES&&    tun->numqueues; i++) {
>>>>>> +		tfile = rcu_dereference_protected(tun->tfiles[i],
>>>>>> +						lockdep_is_held(&tun_lock));
>>>>>> +		BUG_ON(!tfile);
>>>>>> +		wake_up_all(&tfile->wq.wait);
>>>>>> +		tfile_list[j++] = tfile;
>>>>>> +		rcu_assign_pointer(tfile->tun, NULL);
>>>>>> +		--tun->numqueues;
>>>>>> +	}
>>>>>> +	BUG_ON(tun->numqueues != 0);
>>>>>> +	/* guarantee that any future tun_attach will fail */
>>>>>> +	tun->numqueues = MAX_TAP_QUEUES;
>>>>>> +	spin_unlock(&tun_lock);
>>>>>> +
>>>>>> +	synchronize_rcu();
>>>>>> +	for (--j; j>= 0; j--)
>>>>>> +		sock_put(&tfile_list[j]->sk);
>>>>>>   }
>>>>>>
>>>>>> -static void tun_put(struct tun_struct *tun)
>>>>>> +static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>>>   {
>>>>>> -	struct tun_file *tfile = tun->tfile;
>>>>>> +	struct tun_file *tfile = file->private_data;
>>>>>> +	int err;
>>>>>> +
>>>>>> +	ASSERT_RTNL();
>>>>>> +
>>>>>> +	spin_lock(&tun_lock);
>>>>>>
>>>>>> -	if (atomic_dec_and_test(&tfile->count))
>>>>>> -		tun_detach(tfile->tun);
>>>>>> +	err = -EINVAL;
>>>>>> +	if (rcu_dereference_protected(tfile->tun, lockdep_is_held(&tun_lock)))
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	err = -EBUSY;
>>>>>> +	if (!(tun->flags&    TUN_TAP_MQ)&&    tun->numqueues == 1)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	if (tun->numqueues == MAX_TAP_QUEUES)
>>>>>> +		goto out;
>>>>>> +
>>>>>> +	err = 0;
>>>>>> +	tfile->queue_index = tun->numqueues;
>>>>>> +	rcu_assign_pointer(tfile->tun, tun);
>>>>>> +	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>>>> +	sock_hold(&tfile->sk);
>>>>>> +	tun->numqueues++;
>>>>>> +
>>>>>> +	if (tun->numqueues == 1)
>>>>>> +		netif_carrier_on(tun->dev);
>>>>>> +
>>>>>> +	/* device is allowed to go away first, so no need to hold extra
>>>>>> +	 * refcnt. */
>>>>>> +
>>>>>> +out:
>>>>>> +	spin_unlock(&tun_lock);
>>>>>> +	return err;
>>>>>>   }
>>>>>>
>>>>>>   /* TAP filtering */
>>>>>> @@ -331,16 +414,7 @@ static const struct ethtool_ops tun_ethtool_ops;
>>>>>>   /* Net device detach from fd. */
>>>>>>   static void tun_net_uninit(struct net_device *dev)
>>>>>>   {
>>>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>>>> -	struct tun_file *tfile = tun->tfile;
>>>>>> -
>>>>>> -	/* Inform the methods they need to stop using the dev.
>>>>>> -	 */
>>>>>> -	if (tfile) {
>>>>>> -		wake_up_all(&tfile->wq.wait);
>>>>>> -		if (atomic_dec_and_test(&tfile->count))
>>>>>> -			__tun_detach(tun);
>>>>>> -	}
>>>>>> +	tun_detach_all(dev);
>>>>>>   }
>>>>>>
>>>>>>   /* Net device open. */
>>>>>> @@ -360,10 +434,10 @@ static int tun_net_close(struct net_device *dev)
>>>>>>   /* Net device start xmit */
>>>>>>   static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>>>   {
>>>>>> -	struct tun_struct *tun = netdev_priv(dev);
>>>>>> -	struct tun_file *tfile = tun->tfile;
>>>>>> +	struct tun_file *tfile = NULL;
>>>>>>
>>>>>> -	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
>>>>>> +	rcu_read_lock();
>>>>>> +	tfile = tun_get_queue(dev, skb);
>>>>>>
>>>>>>   	/* Drop packet if interface is not attached */
>>>>>>   	if (!tfile)
>>>>>> @@ -381,7 +455,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>>>
>>>>>>   	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
>>>>>>   	>= dev->tx_queue_len) {
>>>>>> -		if (!(tun->flags&    TUN_ONE_QUEUE)) {
>>>>>> +		if (!(tfile->flags&    TUN_ONE_QUEUE)&&
>>>>> Which patch moved flags from tun to tfile?
>>>>>
>>>>>> +		    !(tfile->flags&    TUN_TAP_MQ)) {
>>>>>>   			/* Normal queueing mode. */
>>>>>>   			/* Packet scheduler handles dropping of further packets. */
>>>>>>   			netif_stop_queue(dev);
>>>>>> @@ -390,7 +465,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>>>   			 * error is more appropriate. */
>>>>>>   			dev->stats.tx_fifo_errors++;
>>>>>>   		} else {
>>>>>> -			/* Single queue mode.
>>>>>> +			/* Single queue mode or multi queue mode.
>>>>>>   			 * Driver handles dropping of all packets itself. */
>>>>> Please don't do this. Stop the queue on overrun as appropriate.
>>>>> ONE_QUEUE is a legacy hack.
>>>>>
>>>>> BTW we really should stop queue before we start dropping packets,
>>>>> but that can be a separate patch.
>>>>>
>>>>>>   			goto drop;
>>>>>>   		}
>>>>>> @@ -408,9 +483,11 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>>>>>   		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
>>>>>>   	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
>>>>>>   				   POLLRDNORM | POLLRDBAND);
>>>>>> +	rcu_read_unlock();
>>>>>>   	return NETDEV_TX_OK;
>>>>>>
>>>>>>   drop:
>>>>>> +	rcu_read_unlock();
>>>>>>   	dev->stats.tx_dropped++;
>>>>>>   	kfree_skb(skb);
>>>>>>   	return NETDEV_TX_OK;
>>>>>> @@ -527,16 +604,22 @@ static void tun_net_init(struct net_device *dev)
>>>>>>   static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>>>   {
>>>>>>   	struct tun_file *tfile = file->private_data;
>>>>>> -	struct tun_struct *tun = __tun_get(tfile);
>>>>>> +	struct tun_struct *tun = NULL;
>>>>>>   	struct sock *sk;
>>>>>>   	unsigned int mask = 0;
>>>>>>
>>>>>> -	if (!tun)
>>>>>> +	if (!tfile)
>>>>>>   		return POLLERR;
>>>>>>
>>>>>> -	sk = tfile->socket.sk;
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>> +	if (!tun) {
>>>>>> +		rcu_read_unlock();
>>>>>> +		return POLLERR;
>>>>>> +	}
>>>>>> +	rcu_read_unlock();
>>>>>>
>>>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
>>>>>> +	sk =&tfile->sk;
>>>>>>
>>>>>>   	poll_wait(file,&tfile->wq.wait, wait);
>>>>>>
>>>>>> @@ -548,10 +631,12 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
>>>>>>   	     sock_writeable(sk)))
>>>>>>   		mask |= POLLOUT | POLLWRNORM;
>>>>>>
>>>>>> -	if (tun->dev->reg_state != NETREG_REGISTERED)
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>> +	if (!tun || tun->dev->reg_state != NETREG_REGISTERED)
>>>>>>   		mask = POLLERR;
>>>>>> +	rcu_read_unlock();
>>>>>>
>>>>>> -	tun_put(tun);
>>>>>>   	return mask;
>>>>>>   }
>>>>>>
>>>>>> @@ -708,9 +793,12 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>>>   		skb_shinfo(skb)->gso_segs = 0;
>>>>>>   	}
>>>>>>
>>>>>> -	tun = __tun_get(tfile);
>>>>>> -	if (!tun)
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>> +	if (!tun) {
>>>>>> +		rcu_read_unlock();
>>>>>>   		return -EBADFD;
>>>>>> +	}
>>>>>>
>>>>>>   	switch (tfile->flags&    TUN_TYPE_MASK) {
>>>>>>   	case TUN_TUN_DEV:
>>>>>> @@ -720,26 +808,30 @@ static ssize_t tun_get_user(struct tun_file *tfile,
>>>>>>   		skb->protocol = eth_type_trans(skb, tun->dev);
>>>>>>   		break;
>>>>>>   	}
>>>>>> -
>>>>>> -	netif_rx_ni(skb);
>>>>>>   	tun->dev->stats.rx_packets++;
>>>>>>   	tun->dev->stats.rx_bytes += len;
>>>>>> -	tun_put(tun);
>>>>>> +	rcu_read_unlock();
>>>>>> +
>>>>>> +	netif_rx_ni(skb);
>>>>>> +
>>>>>>   	return count;
>>>>>>
>>>>>>   err_free:
>>>>>>   	count = -EINVAL;
>>>>>>   	kfree_skb(skb);
>>>>>>   err:
>>>>>> -	tun = __tun_get(tfile);
>>>>>> -	if (!tun)
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>> +	if (!tun) {
>>>>>> +		rcu_read_unlock();
>>>>>>   		return -EBADFD;
>>>>>> +	}
>>>>>>
>>>>>>   	if (drop)
>>>>>>   		tun->dev->stats.rx_dropped++;
>>>>>>   	if (error)
>>>>>>   		tun->dev->stats.rx_frame_errors++;
>>>>>> -	tun_put(tun);
>>>>>> +	rcu_read_unlock();
>>>>>>   	return count;
>>>>>>   }
>>>>>>
>>>>>> @@ -833,12 +925,13 @@ static ssize_t tun_put_user(struct tun_file *tfile,
>>>>>>   	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
>>>>>>   	total += skb->len;
>>>>>>
>>>>>> -	tun = __tun_get(tfile);
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>>   	if (tun) {
>>>>>>   		tun->dev->stats.tx_packets++;
>>>>>>   		tun->dev->stats.tx_bytes += len;
>>>>>> -		tun_put(tun);
>>>>>>   	}
>>>>>> +	rcu_read_unlock();
>>>>>>
>>>>>>   	return total;
>>>>>>   }
>>>>>> @@ -869,28 +962,31 @@ static ssize_t tun_do_read(struct tun_file *tfile,
>>>>>>   				break;
>>>>>>   			}
>>>>>>
>>>>>> -			tun = __tun_get(tfile);
>>>>>> +			rcu_read_lock();
>>>>>> +			tun = rcu_dereference(tfile->tun);
>>>>>>   			if (!tun) {
>>>>>> -				ret = -EIO;
>>>>>> +				ret = -EBADFD;
>>>>> BADFD is for when you get passed something like -1 fd.
>>>>> Here fd is OK, it's just in a bad state so you can not do IO.
>>>>>
>>>>>
>>>>>> +				rcu_read_unlock();
>>>>>>   				break;
>>>>>>   			}
>>>>>>   			if (tun->dev->reg_state != NETREG_REGISTERED) {
>>>>>>   				ret = -EIO;
>>>>>> -				tun_put(tun);
>>>>>> +				rcu_read_unlock();
>>>>>>   				break;
>>>>>>   			}
>>>>>> -			tun_put(tun);
>>>>>> +			rcu_read_unlock();
>>>>>>
>>>>>>   			/* Nothing to read, let's sleep */
>>>>>>   			schedule();
>>>>>>   			continue;
>>>>>>   		}
>>>>>>
>>>>>> -		tun = __tun_get(tfile);
>>>>>> +		rcu_read_lock();
>>>>>> +		tun = rcu_dereference(tfile->tun);
>>>>>>   		if (tun) {
>>>>>>   			netif_wake_queue(tun->dev);
>>>>>> -			tun_put(tun);
>>>>>>   		}
>>>>>> +		rcu_read_unlock();
>>>>>>
>>>>>>   		ret = tun_put_user(tfile, skb, iv, len);
>>>>>>   		kfree_skb(skb);
>>>>>> @@ -1038,6 +1134,9 @@ static int tun_flags(struct tun_struct *tun)
>>>>>>   	if (tun->flags&    TUN_VNET_HDR)
>>>>>>   		flags |= IFF_VNET_HDR;
>>>>>>
>>>>>> +	if (tun->flags&    TUN_TAP_MQ)
>>>>>> +		flags |= IFF_MULTI_QUEUE;
>>>>>> +
>>>>>>   	return flags;
>>>>>>   }
>>>>>>
>>>>>> @@ -1097,8 +1196,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>>>   		err = tun_attach(tun, file);
>>>>>>   		if (err<    0)
>>>>>>   			return err;
>>>>>> -	}
>>>>>> -	else {
>>>>>> +	} else {
>>>>>>   		char *name;
>>>>>>   		unsigned long flags = 0;
>>>>>>
>>>>>> @@ -1142,6 +1240,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>>>   		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
>>>>>>   			TUN_USER_FEATURES;
>>>>>>   		dev->features = dev->hw_features;
>>>>>> +		if (ifr->ifr_flags&    IFF_MULTI_QUEUE)
>>>>>> +			dev->features |= NETIF_F_LLTX;
>>>>>>
>>>>>>   		err = register_netdevice(tun->dev);
>>>>>>   		if (err<    0)
>>>>>> @@ -1154,7 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>>>
>>>>>>   		err = tun_attach(tun, file);
>>>>>>   		if (err<    0)
>>>>>> -			goto failed;
>>>>>> +			goto err_free_dev;
>>>>>>   	}
>>>>>>
>>>>>>   	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
>>>>>> @@ -1174,6 +1274,11 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>>>   	else
>>>>>>   		tun->flags&= ~TUN_VNET_HDR;
>>>>>>
>>>>>> +	if (ifr->ifr_flags&    IFF_MULTI_QUEUE)
>>>>>> +		tun->flags |= TUN_TAP_MQ;
>>>>>> +	else
>>>>>> +		tun->flags&= ~TUN_TAP_MQ;
>>>>>> +
>>>>>>   	/* Cache flags from tun device */
>>>>>>   	tfile->flags = tun->flags;
>>>>>>   	/* Make sure persistent devices do not get stuck in
>>>>>> @@ -1187,7 +1292,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>>>
>>>>>>   err_free_dev:
>>>>>>   	free_netdev(dev);
>>>>>> -failed:
>>>>>>   	return err;
>>>>>>   }
>>>>>>
>>>>>> @@ -1264,38 +1368,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>>>   				(unsigned int __user*)argp);
>>>>>>   	}
>>>>>>
>>>>>> -	rtnl_lock();
>>>>>> -
>>>>>> -	tun = __tun_get(tfile);
>>>>>> -	if (cmd == TUNSETIFF&&    !tun) {
>>>>>> +	ret = 0;
>>>>>> +	if (cmd == TUNSETIFF) {
>>>>>> +		rtnl_lock();
>>>>>>   		ifr.ifr_name[IFNAMSIZ-1] = '\0';
>>>>>> -
>>>>>>   		ret = tun_set_iff(tfile->net, file,&ifr);
>>>>>> -
>>>>>> +		rtnl_unlock();
>>>>>>   		if (ret)
>>>>>> -			goto unlock;
>>>>>> -
>>>>>> +			return ret;
>>>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>>> -			ret = -EFAULT;
>>>>>> -		goto unlock;
>>>>>> +			return -EFAULT;
>>>>>> +		return ret;
>>>>>>   	}
>>>>>>
>>>>>> +	rtnl_lock();
>>>>>> +
>>>>>> +	rcu_read_lock();
>>>>>> +
>>>>>>   	ret = -EBADFD;
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>>   	if (!tun)
>>>>>>   		goto unlock;
>>>>>> +	else
>>>>>> +		ret = 0;
>>>>>>
>>>>>> -	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %d\n", cmd);
>>>>>> -
>>>>>> -	ret = 0;
>>>>>>   	switch (cmd) {
>>>>>>   	case TUNGETIFF:
>>>>>>   		ret = tun_get_iff(current->nsproxy->net_ns, tun,&ifr);
>>>>>> +		rcu_read_unlock();
>>>>>>   		if (ret)
>>>>>> -			break;
>>>>>> +			goto out;
>>>>>>
>>>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>>>   			ret = -EFAULT;
>>>>>> -		break;
>>>>>> +		goto out;
>>>>>>
>>>>>>   	case TUNSETNOCSUM:
>>>>>>   		/* Disable/Enable checksum */
>>>>>> @@ -1357,9 +1463,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>>>   		/* Get hw address */
>>>>>>   		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
>>>>>>   		ifr.ifr_hwaddr.sa_family = tun->dev->type;
>>>>>> +		rcu_read_unlock();
>>>>>>   		if (copy_to_user(argp,&ifr, ifreq_len))
>>>>>>   			ret = -EFAULT;
>>>>>> -		break;
>>>>>> +		goto out;
>>>>>>
>>>>>>   	case SIOCSIFHWADDR:
>>>>>>   		/* Set hw address */
>>>>>> @@ -1375,9 +1482,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>>>>>>   	}
>>>>>>
>>>>>>   unlock:
>>>>>> +	rcu_read_unlock();
>>>>>> +out:
>>>>>>   	rtnl_unlock();
>>>>>> -	if (tun)
>>>>>> -		tun_put(tun);
>>>>>>   	return ret;
>>>>>>   }
>>>>>>
>>>>>> @@ -1517,6 +1624,11 @@ out:
>>>>>>   	return ret;
>>>>>>   }
>>>>>>
>>>>>> +static void tun_sock_destruct(struct sock *sk)
>>>>>> +{
>>>>>> +	skb_queue_purge(&sk->sk_receive_queue);
>>>>>> +}
>>>>>> +
>>>>>>   static int tun_chr_open(struct inode *inode, struct file * file)
>>>>>>   {
>>>>>>   	struct net *net = current->nsproxy->net_ns;
>>>>>> @@ -1540,6 +1652,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>>>   	sock_init_data(&tfile->socket,&tfile->sk);
>>>>>>
>>>>>>   	tfile->sk.sk_write_space = tun_sock_write_space;
>>>>>> +	tfile->sk.sk_destruct = tun_sock_destruct;
>>>>>>   	tfile->sk.sk_sndbuf = INT_MAX;
>>>>>>   	file->private_data = tfile;
>>>>>>
>>>>>> @@ -1549,31 +1662,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>>>>>>   static int tun_chr_close(struct inode *inode, struct file *file)
>>>>>>   {
>>>>>>   	struct tun_file *tfile = file->private_data;
>>>>>> -	struct tun_struct *tun;
>>>>>> -
>>>>>> -	tun = __tun_get(tfile);
>>>>>> -	if (tun) {
>>>>>> -		struct net_device *dev = tun->dev;
>>>>>> -
>>>>>> -		tun_debug(KERN_INFO, tun, "tun_chr_close\n");
>>>>>> -
>>>>>> -		__tun_detach(tun);
>>>>>> -
>>>>>> -		/* If desirable, unregister the netdevice. */
>>>>>> -		if (!(tun->flags&    TUN_PERSIST)) {
>>>>>> -			rtnl_lock();
>>>>>> -			if (dev->reg_state == NETREG_REGISTERED)
>>>>>> -				unregister_netdevice(dev);
>>>>>> -			rtnl_unlock();
>>>>>> -		}
>>>>>>
>>>>>> -		/* drop the reference that netdevice holds */
>>>>>> -		sock_put(&tfile->sk);
>>>>>> -
>>>>>> -	}
>>>>>> -
>>>>>> -	/* drop the reference that file holds */
>>>>>> -	sock_put(&tfile->sk);
>>>>>> +	tun_detach(tfile, true);
>>>>>>
>>>>>>   	return 0;
>>>>>>   }
>>>>>> @@ -1700,14 +1790,17 @@ static void tun_cleanup(void)
>>>>>>    * holding a reference to the file for as long as the socket is in use. */
>>>>>>   struct socket *tun_get_socket(struct file *file)
>>>>>>   {
>>>>>> -	struct tun_struct *tun;
>>>>>> +	struct tun_struct *tun = NULL;
>>>>>>   	struct tun_file *tfile = file->private_data;
>>>>>>   	if (file->f_op !=&tun_fops)
>>>>>>   		return ERR_PTR(-EINVAL);
>>>>>> -	tun = tun_get(file);
>>>>>> -	if (!tun)
>>>>>> +	rcu_read_lock();
>>>>>> +	tun = rcu_dereference(tfile->tun);
>>>>>> +	if (!tun) {
>>>>>> +		rcu_read_unlock();
>>>>>>   		return ERR_PTR(-EBADFD);
>>>>>> -	tun_put(tun);
>>>>>> +	}
>>>>>> +	rcu_read_unlock();
>>>>>>   	return&tfile->socket;
>>>>>>   }
>>>>>>   EXPORT_SYMBOL_GPL(tun_get_socket);

* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-27  8:26           ` Michael S. Tsirkin
@ 2012-06-28  3:15             ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-28  3:15 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: habanero, netdev, linux-kernel, krkumar2, tahm, akong, davem,
	shemminger, mashirle, Eric Dumazet

On 06/27/2012 04:26 PM, Michael S. Tsirkin wrote:
> On Wed, Jun 27, 2012 at 01:59:37PM +0800, Jason Wang wrote:
>> On 06/26/2012 07:54 PM, Michael S. Tsirkin wrote:
>>> On Tue, Jun 26, 2012 at 01:52:57PM +0800, Jason Wang wrote:
>>>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>>>> This patch adds multiqueue support for tap device. This is done by abstracting
>>>>>> each queue as a file/socket and allowing multiple sockets to be attached to the
>>>>>> tuntap device (an array of tun_file were stored in the tun_struct). Userspace
>>>>>> could write and read from those files to do the parallel packet
>>>>>> sending/receiving.
>>>>>>
>>>>>> Unlike the previous single queue implementation, the socket and device were
>>>>>> loosely coupled, each of them were allowed to go away first. In order to let the
>>>>>> tx path lockless, netif_tx_loch_bh() is replaced by RCU/NETIF_F_LLTX to
>>>>>> synchronize between data path and system call.
>>>>> Don't use LLTX/RCU. It's not worth it.
>>>>> Use something like netif_set_real_num_tx_queues.
>>>>>
>>>> For LLTX, maybe it's better to convert it to alloc_netdev_mq() to
>>>> let the kernel see all queues and make the queue stopping and
>>>> per-queue stats eaiser.
>>>> RCU is used to handle the attaching/detaching when tun/tap is
>>>> sending and receiving packets which looks reasonalbe for me.
>>> Yes but do we have to allow this? How about we always ask
>>> userspace to attach to all active queues?
>> Attaching/detaching is a method to active/deactive a queue, if all
>> queues were kept attached, then we need other method or flag to mark
>> the queue as activateddeactived and still need to synchronize with
>> data path.
> This is what I am trying to say: use an interface flag for
> multiqueue. When it is set activate all queues attached.
> When unset deactivate all queues except the default one.
>
>
>>>> Not
>>>> sure netif_set_real_num_tx_queues() can help in this situation.
>>> Check it out.
>>>
>>>>>> The tx queue selecting is first based on the recorded rxq index of an skb, it
>>>>>> there's no such one, then choosing based on rx hashing (skb_get_rxhash()).
>>>>>>
>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>> Interestingly macvtap switched to hashing first:
>>>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>>>> (the commit log is corrupted but see what it
>>>>> does in the patch).
>>>>> Any idea why?
>>>>>
>>>>>> ---
>>>>>>   drivers/net/tun.c |  371 +++++++++++++++++++++++++++++++++--------------------
>>>>>>   1 files changed, 232 insertions(+), 139 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>> index 8233b0a..5c26757 100644
>>>>>> --- a/drivers/net/tun.c
>>>>>> +++ b/drivers/net/tun.c
>>>>>> @@ -107,6 +107,8 @@ struct tap_filter {
>>>>>>   	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
>>>>>>   };
>>>>>>
>>>>>> +#define MAX_TAP_QUEUES (NR_CPUS<    16 ? NR_CPUS : 16)
>>>>> Why the limit? I am guessing you copied this from macvtap?
>>>>> This is problematic for a number of reasons:
>>>>> 	- will not play well with migration
>>>>> 	- will not work well for a large guest
>>>>>
>>>>> Yes, macvtap needs to be fixed too.
>>>>>
>>>>> I am guessing what it is trying to prevent is queueing
>>>>> up a huge number of packets?
>>>>> So just divide the default tx queue limit by the # of queues.
>>>> Not sure,
>>>> another reasons I can guess:
>>>> - to prevent storing a large array of pointers in tun_struct or macvlan_dev.
>>> OK so with the limit of e.g. 1024 we'd allocate at most
>>> 2 pages of memory. This doesn't look too bad. 1024 is probably a
>>> high enough limit: modern hypervisors seem to support on the order
>>> of 100-200 CPUs so this leaves us some breathing space
>>> if we want to match a queue per guest CPU.
>>> Of course we need to limit the packets per queue
>>> in such a setup more aggressively. 1000 packets * 1000 queues
>>> * 64K per packet is too much.
>>>
>>>> - it may not be suitable to allow the number of virtqueues greater
>>>> than the number of physical queues in the card
>>> Maybe for macvtap, here we have no idea which card we
>>> are working with and how many queues it has.
>>>
>>>>> And by the way, for MQ applications maybe we can finally
>>>>> ignore tx queue altogether and limit the total number
>>>>> of bytes queued?
>>>>> To avoid regressions we can make it large like 64M/# queues.
>>>>> Could be a separate patch I think, and for a single queue
>>>>> might need a compatible mode though I am not sure.
>>>> Could you explain more about this?
>>>> Did you mean to have a total
>>>> sndbuf for all sockets that attached to tun/tap?
>>> Consider that we currently limit the # of
>>> packets queued at tun for xmit to userspace.
>>> Some limit is needed but # of packets sounds
>>> very silly - limiting the total memory
>>> might be more reasonable.
>>>
>>> In case of multiqueue, we really care about
>>> total # of packets or total memory, but a simple
>>> approximation could be to divide the allocation
>>> between active queues equally.
>> A possible method is to divce the TUN_READQ_SIZE by #queues, but
>> make it at least to be equal to the vring size (256).
> I would not enforce any limit actually.
> Simply divide by # of queues, and
> fail if userspace tries to attach>  queue size packets.
>
> With 1000 queues this is 64Mbyte worst case as is.
> If someone wants to allow userspace to drink
> 256 times as much that is 16Giga byte per
> single device, let the user tweak tx queue len.
>
>
>
>>> qdisc also queues some packets, that logic is
>>> using # of packets anyway. So either make that
>>> 1000/# queues, or even set to 0 as Eric once
>>> suggested.
>>>
>>>>>> +
>>>>>>   struct tun_file {
>>>>>>   	struct sock sk;
>>>>>>   	struct socket socket;
>>>>>> @@ -114,16 +116,18 @@ struct tun_file {
>>>>>>   	int vnet_hdr_sz;
>>>>>>   	struct tap_filter txflt;
>>>>>>   	atomic_t count;
>>>>>> -	struct tun_struct *tun;
>>>>>> +	struct tun_struct __rcu *tun;
>>>>>>   	struct net *net;
>>>>>>   	struct fasync_struct *fasync;
>>>>>>   	unsigned int flags;
>>>>>> +	u16 queue_index;
>>>>>>   };
>>>>>>
>>>>>>   struct tun_sock;
>>>>>>
>>>>>>   struct tun_struct {
>>>>>> -	struct tun_file		*tfile;
>>>>>> +	struct tun_file		*tfiles[MAX_TAP_QUEUES];
>>>>>> +	unsigned int            numqueues;
>>>>>>   	unsigned int 		flags;
>>>>>>   	uid_t			owner;
>>>>>>   	gid_t			group;
>>>>>> @@ -138,80 +142,159 @@ struct tun_struct {
>>>>>>   #endif
>>>>>>   };
>>>>>>
>>>>>> -static int tun_attach(struct tun_struct *tun, struct file *file)
>>>>>> +static DEFINE_SPINLOCK(tun_lock);
>>>>>> +
>>>>>> +/*
>>>>>> + * tun_get_queue(): calculate the queue index
>>>>>> + *     - if skbs comes from mq nics, we can just borrow
>>>>>> + *     - if not, calculate from the hash
>>>>>> + */
>>>>>> +static struct tun_file *tun_get_queue(struct net_device *dev,
>>>>>> +				      struct sk_buff *skb)
>>>>>>   {
>>>>>> -	struct tun_file *tfile = file->private_data;
>>>>>> -	int err;
>>>>>> +	struct tun_struct *tun = netdev_priv(dev);
>>>>>> +	struct tun_file *tfile = NULL;
>>>>>> +	int numqueues = tun->numqueues;
>>>>>> +	__u32 rxq;
>>>>>>
>>>>>> -	ASSERT_RTNL();
>>>>>> +	BUG_ON(!rcu_read_lock_held());
>>>>>>
>>>>>> -	netif_tx_lock_bh(tun->dev);
>>>>>> +	if (!numqueues)
>>>>>> +		goto out;
>>>>>>
>>>>>> -	err = -EINVAL;
>>>>>> -	if (tfile->tun)
>>>>>> +	if (numqueues == 1) {
>>>>>> +		tfile = rcu_dereference(tun->tfiles[0]);
>>>>> Instead of hacks like this, you can ask for an MQ
>>>>> flag to be set in SETIFF. Then you won't need to
>>>>> handle attach/detach at random times.
>>>> Consier user switch between a sq guest to mq guest, qemu would
>>>> attach or detach the fd which could not be expceted in kernel.
>>> Can't userspace keep it attached always, just deactivate MQ?
>>>
>>>>> And most of the scary num_queues checks can go away.
>>>> Even we has a MQ flag, userspace could still just attach one queue
>>>> to the device.
>>> I think we allow too much flexibility if we let
>>> userspace detach a random queue.
>> The point is to let tun/tap has the same flexibility as macvtap.
>> Macvtap allows add/delete queues at any time and it's very easy to
>> add detach/attach to macvtap. So we can easily use almost the same
>> ioctls to active/deactive a queue at any time for both tap and
>> macvtap.
> Yes but userspace does not do this in practice:
> it decides how many queues and just activates them all.

The points here, I think, are:

- We export file descriptors to userspace, so any of the files could be 
closed at any time, which cannot be anticipated.
- It is easy to let tap and macvtap share the same ioctls.
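
(To make the lifecycle concrete, here is a rough userspace sketch of the 
model being discussed, where every queue is just one more fd opened 
against the same device. It assumes the IFF_MULTI_QUEUE flag this series 
introduces and the TUNSETIFF re-attach behavior of the quoted 
tun_set_iff(); treat it as an illustration, not the series' final API. 
Closing any of these fds detaches that queue, which is exactly the first 
point above:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open one more queue of a multiqueue tap device; each call returns an
 * independent fd that userspace may read/write -- or close -- at will. */
static int tap_open_queue(const char *ifname)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;

	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
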
>
>
[...]


* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-28  3:02             ` Jason Wang
@ 2012-06-28  4:52               ` Sridhar Samudrala
  2012-06-28  5:31                 ` Jason Wang
  0 siblings, 1 reply; 28+ messages in thread
From: Sridhar Samudrala @ 2012-06-28  4:52 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, habanero, netdev, linux-kernel, krkumar2,
	tahm, akong, davem, shemminger, mashirle

On 6/27/2012 8:02 PM, Jason Wang wrote:
> On 06/27/2012 04:44 PM, Michael S. Tsirkin wrote:
>> On Wed, Jun 27, 2012 at 01:16:30PM +0800, Jason Wang wrote:
>>> On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
>>>> On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
>>>>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>>>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>>>>> This patch adds multiqueue support for tap device. This is done 
>>>>>>> by abstracting
>>>>>>> each queue as a file/socket and allowing multiple sockets to be 
>>>>>>> attached to the
>>>>>>> tuntap device (an array of tun_file were stored in the 
>>>>>>> tun_struct). Userspace
>>>>>>> could write and read from those files to do the parallel packet
>>>>>>> sending/receiving.
>>>>>>>
>>>>>>> Unlike the previous single queue implementation, the socket and 
>>>>>>> device were
>>>>>>> loosely coupled, each of them were allowed to go away first. In 
>>>>>>> order to let the
>>>>>>> tx path lockless, netif_tx_loch_bh() is replaced by 
>>>>>>> RCU/NETIF_F_LLTX to
>>>>>>> synchronize between data path and system call.
>>>>>> Don't use LLTX/RCU. It's not worth it.
>>>>>> Use something like netif_set_real_num_tx_queues.
>>>>>>
>>>>>>> The tx queue selecting is first based on the recorded rxq index 
>>>>>>> of an skb, it
>>>>>>> there's no such one, then choosing based on rx hashing 
>>>>>>> (skb_get_rxhash()).
>>>>>>>
>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>> Interestingly macvtap switched to hashing first:
>>>>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>>>>> (the commit log is corrupted but see what it
>>>>>> does in the patch).
>>>>>> Any idea why?
>>>>> Yes, so tap should be changed to behave same as macvtap. I remember
>>>>> the reason we do that is to make sure the packet of a single flow to
>>>>> be queued to a fixed socket/virtqueues. As 10g cards like ixgbe
>>>>> choose the rx queue for a flow based on the last tx queue where the
>>>>> packets of that flow comes. So if we are using recored rx queue in
>>>>> macvtap, the queue index of a flow would change as vhost thread
>>>>> moves amongs processors.
>>>> Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might 
>>>> land
>>>> on VCPU1 in the guest, which is not good, right?
>>> Yes, but better than making the rx moves between vcpus when we use
>>> recorded rx queue.
>> Why isn't this a problem with native TCP?
>> I think what happens is one of the following:
>> - moving between CPUs is more expensive with tun
>>    because it can queue so much data on xmit
>> - scheduler makes very bad decisions about VCPUs
>>    bouncing them around all the time
>
> For a usual native TCP/host process, since it reads and writes TCP 
> sockets, it makes sense to move rx to the processor where the process 
> moves. But vhost does not do TCP work, so ixgbe would still move rx 
> when the vhost process moves, and we can't even make sure that the 
> vhost process handling rx runs on the processor that handles the rx 
> interrupt.

We also saw this behavior with the default ixgbe configuration. If vhost 
is pinned to a CPU, all packets for that VM are received on a single RX 
queue. So even if the VM is doing multiple TCP_RR sessions, packets for 
all the flows are received on a single RX queue. Without pinning, vhost 
moves around and so do the packets across the RX queues.

I think
         ethtool -K ethX ntuple on
will disable this behavior, and it should be possible to program the 
flow director using ethtool -U. This way we can split the packets across 
the host NIC RX queues based on the flows, but it is not clear if this 
would help with the current model of a single vhost per device.
With per-cpu vhost, each RX queue can be handled by the matching vhost, 
but if we have only 1 queue in the VM's virtio-net device, that could 
become the bottleneck.
Multi-queue virtio-net should help here, but we need the same number of 
queues in the VM's virtio-net device as in the host's NIC so that each 
vhost can handle the corresponding virtio queue. But if the VM has only 
2 vcpus, I think it is not efficient to have 8 virtio-net queues (to 
match a host with 8 physical CPUs and 8 RX queues in the NIC).

Thanks
Sridhar

>
>> Could we isolate which it is? Does the problem
>> still happen if you pin VCPUs to host cpus?
>> If not it's the queue depth.
>
> It may not help, as tun does not record the vcpu/queue that sent the 
> stream, so it can't transmit the packets back to the same vcpu/queue.
>>> Flow steering is needed to make sure the tx and
>>> rx on the same vcpu.
>> That involves IPI between processes, so it might be
>> very expensive for kvm.
>>
>>>>> But during test tun/tap, one interesting thing I find is that even
>>>>> ixgbe has recorded the queue index during rx, it seems be lost when
>>>>> tap tries to transmit skbs to userspace.
>>>> dev_pick_tx does this I think but ndo_select_queue
>>>> should be able to get it without trouble.
>>>>
>>>>



* Re: [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support
  2012-06-28  4:52               ` Sridhar Samudrala
@ 2012-06-28  5:31                 ` Jason Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Wang @ 2012-06-28  5:31 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, habanero, netdev, linux-kernel, krkumar2,
	tahm, akong, davem, shemminger, mashirle

On 06/28/2012 12:52 PM, Sridhar Samudrala wrote:
> On 6/27/2012 8:02 PM, Jason Wang wrote:
>> On 06/27/2012 04:44 PM, Michael S. Tsirkin wrote:
>>> On Wed, Jun 27, 2012 at 01:16:30PM +0800, Jason Wang wrote:
>>>> On 06/26/2012 06:42 PM, Michael S. Tsirkin wrote:
>>>>> On Tue, Jun 26, 2012 at 11:42:17AM +0800, Jason Wang wrote:
>>>>>> On 06/25/2012 04:25 PM, Michael S. Tsirkin wrote:
>>>>>>> On Mon, Jun 25, 2012 at 02:10:18PM +0800, Jason Wang wrote:
>>>>>>>> This patch adds multiqueue support for tap device. This is done 
>>>>>>>> by abstracting
>>>>>>>> each queue as a file/socket and allowing multiple sockets to be 
>>>>>>>> attached to the
>>>>>>>> tuntap device (an array of tun_file were stored in the 
>>>>>>>> tun_struct). Userspace
>>>>>>>> could write and read from those files to do the parallel packet
>>>>>>>> sending/receiving.
>>>>>>>>
>>>>>>>> Unlike the previous single queue implementation, the socket and 
>>>>>>>> device were
>>>>>>>> loosely coupled, each of them were allowed to go away first. In 
>>>>>>>> order to let the
>>>>>>>> tx path lockless, netif_tx_loch_bh() is replaced by 
>>>>>>>> RCU/NETIF_F_LLTX to
>>>>>>>> synchronize between data path and system call.
>>>>>>> Don't use LLTX/RCU. It's not worth it.
>>>>>>> Use something like netif_set_real_num_tx_queues.
>>>>>>>
>>>>>>>> The tx queue selecting is first based on the recorded rxq index 
>>>>>>>> of an skb, it
>>>>>>>> there's no such one, then choosing based on rx hashing 
>>>>>>>> (skb_get_rxhash()).
>>>>>>>>
>>>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>>>> Interestingly macvtap switched to hashing first:
>>>>>>> ef0002b577b52941fb147128f30bd1ecfdd3ff6d
>>>>>>> (the commit log is corrupted but see what it
>>>>>>> does in the patch).
>>>>>>> Any idea why?
>>>>>> Yes, so tap should be changed to behave same as macvtap. I remember
>>>>>> the reason we do that is to make sure the packet of a single flow to
>>>>>> be queued to a fixed socket/virtqueues. As 10g cards like ixgbe
>>>>>> choose the rx queue for a flow based on the last tx queue where the
>>>>>> packets of that flow comes. So if we are using recored rx queue in
>>>>>> macvtap, the queue index of a flow would change as vhost thread
>>>>>> moves amongs processors.
>>>>> Hmm. OTOH if you override this, if TX is sent from VCPU0, RX might 
>>>>> land
>>>>> on VCPU1 in the guest, which is not good, right?
>>>> Yes, but better than making the rx moves between vcpus when we use
>>>> recorded rx queue.
>>> Why isn't this a problem with native TCP?
>>> I think what happens is one of the following:
>>> - moving between CPUs is more expensive with tun
>>>    because it can queue so much data on xmit
>>> - scheduler makes very bad decisions about VCPUs
>>>    bouncing them around all the time
>>
>> For a usual native TCP/host process, since it reads and writes TCP 
>> sockets, it makes sense to move rx to the processor where the process 
>> moves. But vhost does not do TCP work, so ixgbe would still move rx 
>> when the vhost process moves, and we can't even make sure that the 
>> vhost process handling rx runs on the processor that handles the rx 
>> interrupt.
>
> We also saw this behavior with the default ixgbe configuration. If vhost 
> is pinned to a CPU, all packets for that VM are received on a single RX 
> queue. So even if the VM is doing multiple TCP_RR sessions, packets for 
> all the flows are received on a single RX queue. Without pinning, vhost 
> moves around and so do the packets across the RX queues.
>
> I think
>         ethtool -K ethX ntuple on
> will disable this behavior, and it should be possible to program the 
> flow director using ethtool -U. This way we can split the packets across 
> the host NIC RX queues based on the flows, but it is not clear if this 
> would help with the current model of a single vhost per device.
> With per-cpu vhost, each RX queue can be handled by the matching vhost, 
> but if we have only 1 queue in the VM's virtio-net device, that could 
> become the bottleneck.

Yes, I've been thinking about this. Instead of using ethtool -U (maybe 
possible for macvtap but hard for tuntap), we can 'teach' ixgbe which 
rxq it should use for a flow, because ixgbe_select_queue() first selects 
the txq based on the recorded rxq. So if we want a flow to use a 
dedicated rxq, say N, we can record N as the rxq in tuntap before 
passing the skb to the bridge.
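
A sketch of that idea on tun's receive path -- skb_record_rx_queue() is 
the existing helper for stamping a queue mapping on an skb, though the 
placement and how N is chosen here are assumptions:

/* Before handing the skb to the stack/bridge, mark it as if it had been
 * received on queue N.  dev_pick_tx() prefers the recorded rx queue, so
 * the NIC transmits on txq N, and a card like ixgbe will then steer the
 * return traffic of this flow to rxq N. */
static void tun_steer_flow_to(struct sk_buff *skb, u16 n)
{
	skb_record_rx_queue(skb, n);
	netif_rx_ni(skb);
}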

> Multi-queue virtio-net should help here, but we need the same number of 
> queues in the VM's virtio-net device as in the host's NIC so that each 
> vhost can handle the corresponding virtio queue. But if the VM has only 
> 2 vcpus, I think it is not efficient to have 8 virtio-net queues (to 
> match a host with 8 physical CPUs and 8 RX queues in the NIC).

Ideally, if we have 2 queues in the guest, it's better to use only 2 
queues in the host to avoid extra contention.
>
> Thanks
> Sridhar
>
>>
>>> Could we isolate which it is? Does the problem
>>> still happen if you pin VCPUs to host cpus?
>>> If not it's the queue depth.
>>
>> It may not help, as tun does not record the vcpu/queue that sent the 
>> stream, so it can't transmit the packets back to the same vcpu/queue.
>>>> Flow steering is needed to make sure the tx and
>>>> rx on the same vcpu.
>>> That involves IPI between processes, so it might be
>>> very expensive for kvm.
>>>
>>>>>> But during test tun/tap, one interesting thing I find is that even
>>>>>> ixgbe has recorded the queue index during rx, it seems be lost when
>>>>>> tap tries to transmit skbs to userspace.
>>>>> dev_pick_tx does this I think but ndo_select_queue
>>>>> should be able to get it without trouble.
>>>>>
>>>>>
>


Thread overview: 28+ messages
     [not found] <20120625060830.6765.27584.stgit@amd-6168-8-1.englab.nay.redhat.com>
     [not found] ` <20120625061018.6765.76633.stgit@amd-6168-8-1.englab.nay.redhat.com>
2012-06-25  8:25   ` [net-next RFC V3 PATCH 4/6] tuntap: multiqueue support Michael S. Tsirkin
2012-06-25  8:41     ` Michael S. Tsirkin
2012-06-26  3:42     ` Jason Wang
2012-06-26 10:42       ` Michael S. Tsirkin
2012-06-27  5:16         ` Jason Wang
2012-06-27  8:44           ` Michael S. Tsirkin
2012-06-28  3:02             ` Jason Wang
2012-06-28  4:52               ` Sridhar Samudrala
2012-06-28  5:31                 ` Jason Wang
2012-06-26  5:52     ` Jason Wang
2012-06-26 11:54       ` Michael S. Tsirkin
2012-06-27  5:59         ` Jason Wang
2012-06-27  8:26           ` Michael S. Tsirkin
2012-06-28  3:15             ` Jason Wang
     [not found] ` <20120625060945.6765.98618.stgit@amd-6168-8-1.englab.nay.redhat.com>
2012-06-25  8:27   ` [net-next RFC V3 PATCH 1/6] tuntap: move socket to tun_file Michael S. Tsirkin
2012-06-26  5:55     ` Jason Wang
2012-06-25 11:59 ` [net-next RFC V3 0/6] Multiqueue support in tun/tap Jason Wang
2012-06-25 11:59 ` [PATCH 1/6] tuntap: move socket to tun_file Jason Wang
2012-06-25 11:59 ` [PATCH 2/6] tuntap: categorize ioctl Jason Wang
2012-06-25 11:59 ` [PATCH 3/6] tuntap: introduce multiqueue flags Jason Wang
2012-06-25 11:59 ` [PATCH 4/6] tuntap: multiqueue support Jason Wang
2012-06-25 11:59 ` [PATCH 5/6] tuntap: per queue 64 bit stats Jason Wang
2012-06-25 12:52   ` Eric Dumazet
2012-06-26  6:00     ` Jason Wang
2012-06-26  6:10       ` Eric Dumazet
2012-06-26  6:28         ` Jason Wang
2012-06-26 19:46       ` [PATCH 5/6] tuntap: per queue 64 bit stats\ Michael S. Tsirkin
2012-06-25 11:59 ` [PATCH 6/6] tuntap: add ioctls to attach or detach a file form tuntap device Jason Wang
