All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] tun: export underlying socket
@ 2009-09-10 12:59 Michael S. Tsirkin
  2009-09-10 13:19 ` Eric Dumazet
                   ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-10 12:59 UTC (permalink / raw)
  To: David Miller, m.s.tsirkin; +Cc: mst, netdev, herbert

Tun device looks similar to a packet socket
in that both pass complete frames from/to userspace.

This patch fills in enough fields in the socket underlying tun driver
to support sendmsg/recvmsg operations, and exports access to this socket
to modules.

This way, code using raw sockets to inject packets
into a physical device, can support injecting
packets into host network stack almost without modification.

First user of this interface will be vhost virtualization
accelerator.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

This patch is on top of net-next master.
An alternative approach would be to add an ioctl to tun, to export the
underlying socket to userspace: a uniform way to work with a network
device and the host stack might be useful there, as well.
Kernel users could then do sockfd_lookup to get the socket.
I decided against it for now as it requires more code.
Please comment.

 drivers/net/tun.c      |   78 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/if_tun.h |   14 ++++++++
 2 files changed, 85 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 589a44a..76f5faa 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -151,6 +151,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
 	err = 0;
 	tfile->tun = tun;
 	tun->tfile = tfile;
+	tun->socket.file = file;
 	dev_hold(tun->dev);
 	sock_hold(tun->socket.sk);
 	atomic_inc(&tfile->count);
@@ -165,6 +166,7 @@ static void __tun_detach(struct tun_struct *tun)
 	/* Detach from net device */
 	netif_tx_lock_bh(tun->dev);
 	tun->tfile = NULL;
+	tun->socket.file = NULL;
 	netif_tx_unlock_bh(tun->dev);
 
 	/* Drop read queue */
@@ -750,7 +752,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	len = min_t(int, skb->len, len);
 
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
-	total += len;
+	total += skb->len;
 
 	tun->dev->stats.tx_packets++;
 	tun->dev->stats.tx_bytes += len;
@@ -758,12 +760,10 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
-static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
-			    unsigned long count, loff_t pos)
+static ssize_t tun_do_read(struct tun_struct *tun,
+			   struct kiocb *iocb, const struct iovec *iv,
+			   unsigned long count, int noblock)
 {
-	struct file *file = iocb->ki_filp;
-	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
 	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
 	ssize_t len, ret = 0;
@@ -785,7 +785,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 
 		/* Read frames from the queue */
 		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
-			if (file->f_flags & O_NONBLOCK) {
+			if (noblock) {
 				ret = -EAGAIN;
 				break;
 			}
@@ -813,6 +813,21 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 	remove_wait_queue(&tun->socket.wait, &wait);
 
 out:
+	return ret;
+}
+
+static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
+			    unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct tun_file *tfile = file->private_data;
+	struct tun_struct *tun = __tun_get(tfile);
+	ssize_t ret;
+
+	if (!tun)
+		return -EBADFD;
+	ret = tun_do_read(tun, iocb, iv, count, file->f_flags & O_NONBLOCK);
+	ret = min_t(ssize_t, ret, count);
 	tun_put(tun);
 	return ret;
 }
@@ -865,6 +880,37 @@ static void tun_sock_destruct(struct sock *sk)
 	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
 }
 
+static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+	return tun_get_user(tun, m->msg_iov, total_len,
+			    m->msg_flags & MSG_DONTWAIT);
+}
+
+static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len,
+		       int flags)
+{
+	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+	int ret;
+	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+		return -EINVAL;
+	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
+			  flags & MSG_DONTWAIT);
+	if (ret > total_len) {
+		m->msg_flags |= MSG_TRUNC;
+		ret = flags & MSG_TRUNC ? ret : total_len;
+	}
+	return ret;
+}
+
+/* Ops structure to mimic raw sockets with tun */
+static const struct proto_ops tun_socket_ops = {
+	.sendmsg = tun_sendmsg,
+	.recvmsg = tun_recvmsg,
+};
+
 static struct proto tun_proto = {
 	.name		= "tun",
 	.owner		= THIS_MODULE,
@@ -982,6 +1028,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			goto err_free_dev;
 
 		init_waitqueue_head(&tun->socket.wait);
+		tun->socket.ops = &tun_socket_ops;
 		sock_init_data(&tun->socket, sk);
 		sk->sk_write_space = tun_sock_write_space;
 		sk->sk_sndbuf = INT_MAX;
@@ -1483,6 +1530,23 @@ static void tun_cleanup(void)
 	rtnl_link_unregister(&tun_link_ops);
 }
 
+/* Get an underlying socket object from tun file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *tun_get_socket(struct file *file)
+{
+	struct tun_struct *tun;
+	if (file->f_op != &tun_fops)
+		return ERR_PTR(-EINVAL);
+	tun = tun_get(file);
+	if (!tun)
+		return ERR_PTR(-EBADFD);
+	tun_put(tun);
+	return &tun->socket;
+}
+EXPORT_SYMBOL_GPL(tun_get_socket);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3f5fd52..404abe0 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -86,4 +86,18 @@ struct tun_filter {
 	__u8   addr[0][ETH_ALEN];
 };
 
+#ifdef __KERNEL__
+#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
+struct socket *tun_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *tun_get_socket(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_TUN */
+#endif /* __KERNEL__ */
 #endif /* __IF_TUN_H */
-- 
1.6.2.5

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-10 12:59 [PATCH RFC] tun: export underlying socket Michael S. Tsirkin
@ 2009-09-10 13:19 ` Eric Dumazet
  2009-09-10 13:27   ` Michael S. Tsirkin
  2009-09-11  4:17 ` Paul Moore
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2009-09-10 13:19 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, m.s.tsirkin, netdev, herbert

Michael S. Tsirkin a écrit :
> Tun device looks similar to a packet socket
> in that both pass complete frames from/to userspace.
> 
> This patch fills in enough fields in the socket underlying tun driver
> to support sendmsg/recvmsg operations, and exports access to this socket
> to modules.
> 
> This way, code using raw sockets to inject packets
> into a physical device, can support injecting
> packets into host network stack almost without modification.
> 
> First user of this interface will be vhost virtualization
> accelerator.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> 
> This patch is on top of net-next master.
> An alternative approach would be to add an ioctl to tun, to export the
> underlying socket to userspace: a uniform way to work with a network
> device and the host stack might be useful there, as well.
> Kernel users could then do sockfd_lookup to get the socket.
> I decided against it for now as it requires more code.
> Please comment.
> 
>  drivers/net/tun.c      |   78 +++++++++++++++++++++++++++++++++++++++++++----
>  include/linux/if_tun.h |   14 ++++++++
>  2 files changed, 85 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 589a44a..76f5faa 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -151,6 +151,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
>  	err = 0;
>  	tfile->tun = tun;
>  	tun->tfile = tfile;
> +	tun->socket.file = file;
>  	dev_hold(tun->dev);
>  	sock_hold(tun->socket.sk);
>  	atomic_inc(&tfile->count);
> @@ -165,6 +166,7 @@ static void __tun_detach(struct tun_struct *tun)
>  	/* Detach from net device */
>  	netif_tx_lock_bh(tun->dev);
>  	tun->tfile = NULL;
> +	tun->socket.file = NULL;
>  	netif_tx_unlock_bh(tun->dev);
>  
>  	/* Drop read queue */
> @@ -750,7 +752,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
>  	len = min_t(int, skb->len, len);
>  
>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> -	total += len;
> +	total += skb->len;

Why are you changing this? It is very strange that read() can return
a bigger length than what was asked for by the user...

>  
>  	tun->dev->stats.tx_packets++;
>  	tun->dev->stats.tx_bytes += len;
> @@ -758,12 +760,10 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
>  	return total;
>  }
>  
> -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> -			    unsigned long count, loff_t pos)
> +static ssize_t tun_do_read(struct tun_struct *tun,
> +			   struct kiocb *iocb, const struct iovec *iv,
> +			   unsigned long count, int noblock)
>  {
> -	struct file *file = iocb->ki_filp;
> -	struct tun_file *tfile = file->private_data;
> -	struct tun_struct *tun = __tun_get(tfile);
>  	DECLARE_WAITQUEUE(wait, current);
>  	struct sk_buff *skb;
>  	ssize_t len, ret = 0;
> @@ -785,7 +785,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
>  
>  		/* Read frames from the queue */
>  		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
> -			if (file->f_flags & O_NONBLOCK) {
> +			if (noblock) {
>  				ret = -EAGAIN;
>  				break;
>  			}
> @@ -813,6 +813,21 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
>  	remove_wait_queue(&tun->socket.wait, &wait);
>  
>  out:
> +	return ret;
> +}
> +
> +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> +			    unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct tun_file *tfile = file->private_data;
> +	struct tun_struct *tun = __tun_get(tfile);
> +	ssize_t ret;
> +
> +	if (!tun)
> +		return -EBADFD;
> +	ret = tun_do_read(tun, iocb, iv, count, file->f_flags & O_NONBLOCK);
> +	ret = min_t(ssize_t, ret, count);
>  	tun_put(tun);
>  	return ret;
>  }
> @@ -865,6 +880,37 @@ static void tun_sock_destruct(struct sock *sk)
>  	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
>  }
>  
> +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
> +		       struct msghdr *m, size_t total_len)
> +{
> +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> +	return tun_get_user(tun, m->msg_iov, total_len,
> +			    m->msg_flags & MSG_DONTWAIT);
> +}
> +
> +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
> +		       struct msghdr *m, size_t total_len,
> +		       int flags)
> +{
> +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> +	int ret;
> +	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
> +		return -EINVAL;
> +	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
> +			  flags & MSG_DONTWAIT);
> +	if (ret > total_len) {
> +		m->msg_flags |= MSG_TRUNC;
> +		ret = flags & MSG_TRUNC ? ret : total_len;
> +	}
> +	return ret;
> +}
> +
> +/* Ops structure to mimic raw sockets with tun */
> +static const struct proto_ops tun_socket_ops = {
> +	.sendmsg = tun_sendmsg,
> +	.recvmsg = tun_recvmsg,
> +};
> +
>  static struct proto tun_proto = {
>  	.name		= "tun",
>  	.owner		= THIS_MODULE,
> @@ -982,6 +1028,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>  			goto err_free_dev;
>  
>  		init_waitqueue_head(&tun->socket.wait);
> +		tun->socket.ops = &tun_socket_ops;
>  		sock_init_data(&tun->socket, sk);
>  		sk->sk_write_space = tun_sock_write_space;
>  		sk->sk_sndbuf = INT_MAX;
> @@ -1483,6 +1530,23 @@ static void tun_cleanup(void)
>  	rtnl_link_unregister(&tun_link_ops);
>  }
>  
> +/* Get an underlying socket object from tun file.  Returns error unless file is
> + * attached to a device.  The returned object works like a packet socket, it
> + * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
> + * holding a reference to the file for as long as the socket is in use. */
> +struct socket *tun_get_socket(struct file *file)
> +{
> +	struct tun_struct *tun;
> +	if (file->f_op != &tun_fops)
> +		return ERR_PTR(-EINVAL);
> +	tun = tun_get(file);
> +	if (!tun)
> +		return ERR_PTR(-EBADFD);
> +	tun_put(tun);
> +	return &tun->socket;
> +}
> +EXPORT_SYMBOL_GPL(tun_get_socket);
> +
>  module_init(tun_init);
>  module_exit(tun_cleanup);
>  MODULE_DESCRIPTION(DRV_DESCRIPTION);
> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> index 3f5fd52..404abe0 100644
> --- a/include/linux/if_tun.h
> +++ b/include/linux/if_tun.h
> @@ -86,4 +86,18 @@ struct tun_filter {
>  	__u8   addr[0][ETH_ALEN];
>  };
>  
> +#ifdef __KERNEL__
> +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
> +struct socket *tun_get_socket(struct file *);
> +#else
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +struct file;
> +struct socket;
> +static inline struct socket *tun_get_socket(struct file *f)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +#endif /* CONFIG_TUN */
> +#endif /* __KERNEL__ */
>  #endif /* __IF_TUN_H */


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-10 13:19 ` Eric Dumazet
@ 2009-09-10 13:27   ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-10 13:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, herbert

On Thu, Sep 10, 2009 at 03:19:21PM +0200, Eric Dumazet wrote:
> Michael S. Tsirkin a écrit :
> > Tun device looks similar to a packet socket
> > in that both pass complete frames from/to userspace.
> > 
> > This patch fills in enough fields in the socket underlying tun driver
> > to support sendmsg/recvmsg operations, and exports access to this socket
> > to modules.
> > 
> > This way, code using raw sockets to inject packets
> > into a physical device, can support injecting
> > packets into host network stack almost without modification.
> > 
> > First user of this interface will be vhost virtualization
> > accelerator.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > 
> > This patch is on top of net-next master.
> > An alternative approach would be to add an ioctl to tun, to export the
> > underlying socket to userspace: a uniform way to work with a network
> > device and the host stack might be useful there, as well.
> > Kernel users could then do sockfd_lookup to get the socket.
> > I decided against it for now as it requires more code.
> > Please comment.
> > 
> >  drivers/net/tun.c      |   78 +++++++++++++++++++++++++++++++++++++++++++----
> >  include/linux/if_tun.h |   14 ++++++++
> >  2 files changed, 85 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index 589a44a..76f5faa 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -151,6 +151,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
> >  	err = 0;
> >  	tfile->tun = tun;
> >  	tun->tfile = tfile;
> > +	tun->socket.file = file;
> >  	dev_hold(tun->dev);
> >  	sock_hold(tun->socket.sk);
> >  	atomic_inc(&tfile->count);
> > @@ -165,6 +166,7 @@ static void __tun_detach(struct tun_struct *tun)
> >  	/* Detach from net device */
> >  	netif_tx_lock_bh(tun->dev);
> >  	tun->tfile = NULL;
> > +	tun->socket.file = NULL;
> >  	netif_tx_unlock_bh(tun->dev);
> >  
> >  	/* Drop read queue */
> > @@ -750,7 +752,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
> >  	len = min_t(int, skb->len, len);
> >  
> >  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> > -	total += len;
> > +	total += skb->len;
> 
> Why are you changing this ?

Because this function is now used in both read() and recvmsg(), and
recvmsg with MSG_TRUNC reports the full packet length.

> This is very strange that read() can return
> a bigger length than what was asked by user...

Of course. Note how tun_chr_aio_read below does
	ret = min_t(ssize_t, ret, count);
so there's no change for read() at all. OK?

> >  
> >  	tun->dev->stats.tx_packets++;
> >  	tun->dev->stats.tx_bytes += len;
> > @@ -758,12 +760,10 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
> >  	return total;
> >  }
> >  
> > -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> > -			    unsigned long count, loff_t pos)
> > +static ssize_t tun_do_read(struct tun_struct *tun,
> > +			   struct kiocb *iocb, const struct iovec *iv,
> > +			   unsigned long count, int noblock)
> >  {
> > -	struct file *file = iocb->ki_filp;
> > -	struct tun_file *tfile = file->private_data;
> > -	struct tun_struct *tun = __tun_get(tfile);
> >  	DECLARE_WAITQUEUE(wait, current);
> >  	struct sk_buff *skb;
> >  	ssize_t len, ret = 0;
> > @@ -785,7 +785,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> >  
> >  		/* Read frames from the queue */
> >  		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
> > -			if (file->f_flags & O_NONBLOCK) {
> > +			if (noblock) {
> >  				ret = -EAGAIN;
> >  				break;
> >  			}
> > @@ -813,6 +813,21 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> >  	remove_wait_queue(&tun->socket.wait, &wait);
> >  
> >  out:
> > +	return ret;
> > +}
> > +
> > +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
> > +			    unsigned long count, loff_t pos)
> > +{
> > +	struct file *file = iocb->ki_filp;
> > +	struct tun_file *tfile = file->private_data;
> > +	struct tun_struct *tun = __tun_get(tfile);
> > +	ssize_t ret;
> > +
> > +	if (!tun)
> > +		return -EBADFD;
> > +	ret = tun_do_read(tun, iocb, iv, count, file->f_flags & O_NONBLOCK);
> > +	ret = min_t(ssize_t, ret, count);
> >  	tun_put(tun);
> >  	return ret;
> >  }
> > @@ -865,6 +880,37 @@ static void tun_sock_destruct(struct sock *sk)
> >  	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
> >  }
> >  
> > +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
> > +		       struct msghdr *m, size_t total_len)
> > +{
> > +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> > +	return tun_get_user(tun, m->msg_iov, total_len,
> > +			    m->msg_flags & MSG_DONTWAIT);
> > +}
> > +
> > +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
> > +		       struct msghdr *m, size_t total_len,
> > +		       int flags)
> > +{
> > +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> > +	int ret;
> > +	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
> > +		return -EINVAL;
> > +	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
> > +			  flags & MSG_DONTWAIT);
> > +	if (ret > total_len) {
> > +		m->msg_flags |= MSG_TRUNC;
> > +		ret = flags & MSG_TRUNC ? ret : total_len;
> > +	}
> > +	return ret;
> > +}
> > +
> > +/* Ops structure to mimic raw sockets with tun */
> > +static const struct proto_ops tun_socket_ops = {
> > +	.sendmsg = tun_sendmsg,
> > +	.recvmsg = tun_recvmsg,
> > +};
> > +
> >  static struct proto tun_proto = {
> >  	.name		= "tun",
> >  	.owner		= THIS_MODULE,
> > @@ -982,6 +1028,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> >  			goto err_free_dev;
> >  
> >  		init_waitqueue_head(&tun->socket.wait);
> > +		tun->socket.ops = &tun_socket_ops;
> >  		sock_init_data(&tun->socket, sk);
> >  		sk->sk_write_space = tun_sock_write_space;
> >  		sk->sk_sndbuf = INT_MAX;
> > @@ -1483,6 +1530,23 @@ static void tun_cleanup(void)
> >  	rtnl_link_unregister(&tun_link_ops);
> >  }
> >  
> > +/* Get an underlying socket object from tun file.  Returns error unless file is
> > + * attached to a device.  The returned object works like a packet socket, it
> > + * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
> > + * holding a reference to the file for as long as the socket is in use. */
> > +struct socket *tun_get_socket(struct file *file)
> > +{
> > +	struct tun_struct *tun;
> > +	if (file->f_op != &tun_fops)
> > +		return ERR_PTR(-EINVAL);
> > +	tun = tun_get(file);
> > +	if (!tun)
> > +		return ERR_PTR(-EBADFD);
> > +	tun_put(tun);
> > +	return &tun->socket;
> > +}
> > +EXPORT_SYMBOL_GPL(tun_get_socket);
> > +
> >  module_init(tun_init);
> >  module_exit(tun_cleanup);
> >  MODULE_DESCRIPTION(DRV_DESCRIPTION);
> > diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> > index 3f5fd52..404abe0 100644
> > --- a/include/linux/if_tun.h
> > +++ b/include/linux/if_tun.h
> > @@ -86,4 +86,18 @@ struct tun_filter {
> >  	__u8   addr[0][ETH_ALEN];
> >  };
> >  
> > +#ifdef __KERNEL__
> > +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
> > +struct socket *tun_get_socket(struct file *);
> > +#else
> > +#include <linux/err.h>
> > +#include <linux/errno.h>
> > +struct file;
> > +struct socket;
> > +static inline struct socket *tun_get_socket(struct file *f)
> > +{
> > +	return ERR_PTR(-EINVAL);
> > +}
> > +#endif /* CONFIG_TUN */
> > +#endif /* __KERNEL__ */
> >  #endif /* __IF_TUN_H */

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-10 12:59 [PATCH RFC] tun: export underlying socket Michael S. Tsirkin
  2009-09-10 13:19 ` Eric Dumazet
@ 2009-09-11  4:17 ` Paul Moore
  2009-09-11  4:59   ` Michael S. Tsirkin
  2009-09-14  8:07 ` Or Gerlitz
  2009-11-02 17:20 ` [PATCHv2] " Michael S. Tsirkin
  3 siblings, 1 reply; 24+ messages in thread
From: Paul Moore @ 2009-09-11  4:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, m.s.tsirkin, netdev, herbert

On Thursday 10 September 2009 08:59:29 am Michael S. Tsirkin wrote:
> Tun device looks similar to a packet socket
> in that both pass complete frames from/to userspace.
> 
> This patch fills in enough fields in the socket underlying tun driver
> to support sendmsg/recvmsg operations, and exports access to this socket
> to modules.
> 
> This way, code using raw sockets to inject packets
> into a physical device, can support injecting
> packets into host network stack almost without modification.
> 
> First user of this interface will be vhost virtualization
> accelerator.

No comments on the code at this point - I'm just trying to understand the 
intended user right now, which I'm assuming is the vhost-net bits you sent 
previously? 

> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> 
> This patch is on top of net-next master.
> An alternative approach would be to add an ioctl to tun, to export the
> underlying socket to userspace: a uniform way to work with a network
> device and the host stack might be useful there, as well.
> Kernel users could then do sockfd_lookup to get the socket.
> I decided against it for now as it requires more code.
> Please comment.
> 
>  drivers/net/tun.c      |   78
>  +++++++++++++++++++++++++++++++++++++++++++---- include/linux/if_tun.h |  
>  14 ++++++++
>  2 files changed, 85 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 589a44a..76f5faa 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -151,6 +151,7 @@ static int tun_attach(struct tun_struct *tun, struct
>  file *file) err = 0;
>  	tfile->tun = tun;
>  	tun->tfile = tfile;
> +	tun->socket.file = file;
>  	dev_hold(tun->dev);
>  	sock_hold(tun->socket.sk);
>  	atomic_inc(&tfile->count);
> @@ -165,6 +166,7 @@ static void __tun_detach(struct tun_struct *tun)
>  	/* Detach from net device */
>  	netif_tx_lock_bh(tun->dev);
>  	tun->tfile = NULL;
> +	tun->socket.file = NULL;
>  	netif_tx_unlock_bh(tun->dev);
> 
>  	/* Drop read queue */
> @@ -750,7 +752,7 @@ static __inline__ ssize_t tun_put_user(struct
>  tun_struct *tun, len = min_t(int, skb->len, len);
> 
>  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> -	total += len;
> +	total += skb->len;
> 
>  	tun->dev->stats.tx_packets++;
>  	tun->dev->stats.tx_bytes += len;
> @@ -758,12 +760,10 @@ static __inline__ ssize_t tun_put_user(struct
>  tun_struct *tun, return total;
>  }
> 
> -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec
>  *iv, -			    unsigned long count, loff_t pos)
> +static ssize_t tun_do_read(struct tun_struct *tun,
> +			   struct kiocb *iocb, const struct iovec *iv,
> +			   unsigned long count, int noblock)
>  {
> -	struct file *file = iocb->ki_filp;
> -	struct tun_file *tfile = file->private_data;
> -	struct tun_struct *tun = __tun_get(tfile);
>  	DECLARE_WAITQUEUE(wait, current);
>  	struct sk_buff *skb;
>  	ssize_t len, ret = 0;
> @@ -785,7 +785,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb,
>  const struct iovec *iv,
> 
>  		/* Read frames from the queue */
>  		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
> -			if (file->f_flags & O_NONBLOCK) {
> +			if (noblock) {
>  				ret = -EAGAIN;
>  				break;
>  			}
> @@ -813,6 +813,21 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb,
>  const struct iovec *iv, remove_wait_queue(&tun->socket.wait, &wait);
> 
>  out:
> +	return ret;
> +}
> +
> +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec
>  *iv, +			    unsigned long count, loff_t pos)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct tun_file *tfile = file->private_data;
> +	struct tun_struct *tun = __tun_get(tfile);
> +	ssize_t ret;
> +
> +	if (!tun)
> +		return -EBADFD;
> +	ret = tun_do_read(tun, iocb, iv, count, file->f_flags & O_NONBLOCK);
> +	ret = min_t(ssize_t, ret, count);
>  	tun_put(tun);
>  	return ret;
>  }
> @@ -865,6 +880,37 @@ static void tun_sock_destruct(struct sock *sk)
>  	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
>  }
> 
> +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
> +		       struct msghdr *m, size_t total_len)
> +{
> +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> +	return tun_get_user(tun, m->msg_iov, total_len,
> +			    m->msg_flags & MSG_DONTWAIT);
> +}
> +
> +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
> +		       struct msghdr *m, size_t total_len,
> +		       int flags)
> +{
> +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> +	int ret;
> +	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
> +		return -EINVAL;
> +	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
> +			  flags & MSG_DONTWAIT);
> +	if (ret > total_len) {
> +		m->msg_flags |= MSG_TRUNC;
> +		ret = flags & MSG_TRUNC ? ret : total_len;
> +	}
> +	return ret;
> +}
> +
> +/* Ops structure to mimic raw sockets with tun */
> +static const struct proto_ops tun_socket_ops = {
> +	.sendmsg = tun_sendmsg,
> +	.recvmsg = tun_recvmsg,
> +};
> +
>  static struct proto tun_proto = {
>  	.name		= "tun",
>  	.owner		= THIS_MODULE,
> @@ -982,6 +1028,7 @@ static int tun_set_iff(struct net *net, struct file
>  *file, struct ifreq *ifr) goto err_free_dev;
> 
>  		init_waitqueue_head(&tun->socket.wait);
> +		tun->socket.ops = &tun_socket_ops;
>  		sock_init_data(&tun->socket, sk);
>  		sk->sk_write_space = tun_sock_write_space;
>  		sk->sk_sndbuf = INT_MAX;
> @@ -1483,6 +1530,23 @@ static void tun_cleanup(void)
>  	rtnl_link_unregister(&tun_link_ops);
>  }
> 
> +/* Get an underlying socket object from tun file.  Returns error unless
>  file is + * attached to a device.  The returned object works like a packet
>  socket, it + * can be used for sock_sendmsg/sock_recvmsg.  The caller is
>  responsible for + * holding a reference to the file for as long as the
>  socket is in use. */ +struct socket *tun_get_socket(struct file *file)
> +{
> +	struct tun_struct *tun;
> +	if (file->f_op != &tun_fops)
> +		return ERR_PTR(-EINVAL);
> +	tun = tun_get(file);
> +	if (!tun)
> +		return ERR_PTR(-EBADFD);
> +	tun_put(tun);
> +	return &tun->socket;
> +}
> +EXPORT_SYMBOL_GPL(tun_get_socket);
> +
>  module_init(tun_init);
>  module_exit(tun_cleanup);
>  MODULE_DESCRIPTION(DRV_DESCRIPTION);
> diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> index 3f5fd52..404abe0 100644
> --- a/include/linux/if_tun.h
> +++ b/include/linux/if_tun.h
> @@ -86,4 +86,18 @@ struct tun_filter {
>  	__u8   addr[0][ETH_ALEN];
>  };
> 
> +#ifdef __KERNEL__
> +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
> +struct socket *tun_get_socket(struct file *);
> +#else
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +struct file;
> +struct socket;
> +static inline struct socket *tun_get_socket(struct file *f)
> +{
> +	return ERR_PTR(-EINVAL);
> +}
> +#endif /* CONFIG_TUN */
> +#endif /* __KERNEL__ */
>  #endif /* __IF_TUN_H */
> 

-- 
paul moore
linux @ hp

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-11  4:17 ` Paul Moore
@ 2009-09-11  4:59   ` Michael S. Tsirkin
  2009-09-11  5:36     ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-11  4:59 UTC (permalink / raw)
  To: Paul Moore; +Cc: David Miller, netdev, herbert

On Fri, Sep 11, 2009 at 12:17:27AM -0400, Paul Moore wrote:
> On Thursday 10 September 2009 08:59:29 am Michael S. Tsirkin wrote:
> > Tun device looks similar to a packet socket
> > in that both pass complete frames from/to userspace.
> > 
> > This patch fills in enough fields in the socket underlying tun driver
> > to support sendmsg/recvmsg operations, and exports access to this socket
> > to modules.
> > 
> > This way, code using raw sockets to inject packets
> > into a physical device, can support injecting
> > packets into host network stack almost without modification.
> > 
> > First user of this interface will be vhost virtualization
> > accelerator.
> 
> No comments on the code at this point - I'm just trying to understand the 
> intended user right now which I'm assuming is the vhost-net bits you sent 
> previously? 

Yes - these now use raw socket, I'll add
something like 

	sock = tun_get_socket(file)
	if (!IS_ERR(sock)) {
		return sock;
	}


> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > 
> > This patch is on top of net-next master.
> > An alternative approach would be to add an ioctl to tun, to export the
> > underlying socket to userspace: a uniform way to work with a network
> > device and the host stack might be useful there, as well.
> > Kernel users could then do sockfd_lookup to get the socket.
> > I decided against it for now as it requires more code.
> > Please comment.
> > 
> >  drivers/net/tun.c      |   78
> >  +++++++++++++++++++++++++++++++++++++++++++---- include/linux/if_tun.h |  
> >  14 ++++++++
> >  2 files changed, 85 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index 589a44a..76f5faa 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -151,6 +151,7 @@ static int tun_attach(struct tun_struct *tun, struct
> >  file *file) err = 0;
> >  	tfile->tun = tun;
> >  	tun->tfile = tfile;
> > +	tun->socket.file = file;
> >  	dev_hold(tun->dev);
> >  	sock_hold(tun->socket.sk);
> >  	atomic_inc(&tfile->count);
> > @@ -165,6 +166,7 @@ static void __tun_detach(struct tun_struct *tun)
> >  	/* Detach from net device */
> >  	netif_tx_lock_bh(tun->dev);
> >  	tun->tfile = NULL;
> > +	tun->socket.file = NULL;
> >  	netif_tx_unlock_bh(tun->dev);
> > 
> >  	/* Drop read queue */
> > @@ -750,7 +752,7 @@ static __inline__ ssize_t tun_put_user(struct
> >  tun_struct *tun, len = min_t(int, skb->len, len);
> > 
> >  	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
> > -	total += len;
> > +	total += skb->len;
> > 
> >  	tun->dev->stats.tx_packets++;
> >  	tun->dev->stats.tx_bytes += len;
> > @@ -758,12 +760,10 @@ static __inline__ ssize_t tun_put_user(struct
> >  tun_struct *tun, return total;
> >  }
> > 
> > -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec
> >  *iv, -			    unsigned long count, loff_t pos)
> > +static ssize_t tun_do_read(struct tun_struct *tun,
> > +			   struct kiocb *iocb, const struct iovec *iv,
> > +			   unsigned long count, int noblock)
> >  {
> > -	struct file *file = iocb->ki_filp;
> > -	struct tun_file *tfile = file->private_data;
> > -	struct tun_struct *tun = __tun_get(tfile);
> >  	DECLARE_WAITQUEUE(wait, current);
> >  	struct sk_buff *skb;
> >  	ssize_t len, ret = 0;
> > @@ -785,7 +785,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb,
> >  const struct iovec *iv,
> > 
> >  		/* Read frames from the queue */
> >  		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
> > -			if (file->f_flags & O_NONBLOCK) {
> > +			if (noblock) {
> >  				ret = -EAGAIN;
> >  				break;
> >  			}
> > @@ -813,6 +813,21 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb,
> >  const struct iovec *iv, remove_wait_queue(&tun->socket.wait, &wait);
> > 
> >  out:
> > +	return ret;
> > +}
> > +
> > +static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec
> >  *iv, +			    unsigned long count, loff_t pos)
> > +{
> > +	struct file *file = iocb->ki_filp;
> > +	struct tun_file *tfile = file->private_data;
> > +	struct tun_struct *tun = __tun_get(tfile);
> > +	ssize_t ret;
> > +
> > +	if (!tun)
> > +		return -EBADFD;
> > +	ret = tun_do_read(tun, iocb, iv, count, file->f_flags & O_NONBLOCK);
> > +	ret = min_t(ssize_t, ret, count);
> >  	tun_put(tun);
> >  	return ret;
> >  }
> > @@ -865,6 +880,37 @@ static void tun_sock_destruct(struct sock *sk)
> >  	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
> >  }
> > 
> > +static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
> > +		       struct msghdr *m, size_t total_len)
> > +{
> > +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> > +	return tun_get_user(tun, m->msg_iov, total_len,
> > +			    m->msg_flags & MSG_DONTWAIT);
> > +}
> > +
> > +static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
> > +		       struct msghdr *m, size_t total_len,
> > +		       int flags)
> > +{
> > +	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
> > +	int ret;
> > +	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
> > +		return -EINVAL;
> > +	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
> > +			  flags & MSG_DONTWAIT);
> > +	if (ret > total_len) {
> > +		m->msg_flags |= MSG_TRUNC;
> > +		ret = flags & MSG_TRUNC ? ret : total_len;
> > +	}
> > +	return ret;
> > +}
> > +
> > +/* Ops structure to mimic raw sockets with tun */
> > +static const struct proto_ops tun_socket_ops = {
> > +	.sendmsg = tun_sendmsg,
> > +	.recvmsg = tun_recvmsg,
> > +};
> > +
> >  static struct proto tun_proto = {
> >  	.name		= "tun",
> >  	.owner		= THIS_MODULE,
> > @@ -982,6 +1028,7 @@ static int tun_set_iff(struct net *net, struct file
> >  *file, struct ifreq *ifr) goto err_free_dev;
> > 
> >  		init_waitqueue_head(&tun->socket.wait);
> > +		tun->socket.ops = &tun_socket_ops;
> >  		sock_init_data(&tun->socket, sk);
> >  		sk->sk_write_space = tun_sock_write_space;
> >  		sk->sk_sndbuf = INT_MAX;
> > @@ -1483,6 +1530,23 @@ static void tun_cleanup(void)
> >  	rtnl_link_unregister(&tun_link_ops);
> >  }
> > 
> > +/* Get an underlying socket object from tun file.  Returns error unless
> >  file is + * attached to a device.  The returned object works like a packet
> >  socket, it + * can be used for sock_sendmsg/sock_recvmsg.  The caller is
> >  responsible for + * holding a reference to the file for as long as the
> >  socket is in use. */ +struct socket *tun_get_socket(struct file *file)
> > +{
> > +	struct tun_struct *tun;
> > +	if (file->f_op != &tun_fops)
> > +		return ERR_PTR(-EINVAL);
> > +	tun = tun_get(file);
> > +	if (!tun)
> > +		return ERR_PTR(-EBADFD);
> > +	tun_put(tun);
> > +	return &tun->socket;
> > +}
> > +EXPORT_SYMBOL_GPL(tun_get_socket);
> > +
> >  module_init(tun_init);
> >  module_exit(tun_cleanup);
> >  MODULE_DESCRIPTION(DRV_DESCRIPTION);
> > diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
> > index 3f5fd52..404abe0 100644
> > --- a/include/linux/if_tun.h
> > +++ b/include/linux/if_tun.h
> > @@ -86,4 +86,18 @@ struct tun_filter {
> >  	__u8   addr[0][ETH_ALEN];
> >  };
> > 
> > +#ifdef __KERNEL__
> > +#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
> > +struct socket *tun_get_socket(struct file *);
> > +#else
> > +#include <linux/err.h>
> > +#include <linux/errno.h>
> > +struct file;
> > +struct socket;
> > +static inline struct socket *tun_get_socket(struct file *f)
> > +{
> > +	return ERR_PTR(-EINVAL);
> > +}
> > +#endif /* CONFIG_TUN */
> > +#endif /* __KERNEL__ */
> >  #endif /* __IF_TUN_H */
> > 
> 
> -- 
> paul moore
> linux @ hp

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-11  4:59   ` Michael S. Tsirkin
@ 2009-09-11  5:36     ` Michael S. Tsirkin
  2009-09-11  6:10       ` Eric Dumazet
  2009-09-14  8:01       ` Or Gerlitz
  0 siblings, 2 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-11  5:36 UTC (permalink / raw)
  To: Paul Moore; +Cc: David Miller, netdev, herbert

On Fri, Sep 11, 2009 at 07:59:43AM +0300, Michael S. Tsirkin wrote:
> On Fri, Sep 11, 2009 at 12:17:27AM -0400, Paul Moore wrote:
> > On Thursday 10 September 2009 08:59:29 am Michael S. Tsirkin wrote:
> > > Tun device looks similar to a packet socket
> > > in that both pass complete frames from/to userspace.
> > > 
> > > This patch fills in enough fields in the socket underlying tun driver
> > > to support sendmsg/recvmsg operations, and exports access to this socket
> > > to modules.
> > > 
> > > This way, code using raw sockets to inject packets
> > > into a physical device, can support injecting
> > > packets into host network stack almost without modification.
> > > 
> > > First user of this interface will be vhost virtualization
> > > accelerator.
> > 
> > No comments on the code at this point - I'm just trying to understand the 
> > intended user right now which I'm assuming is the vhost-net bits you sent 
> > previously? 
> 
> Yes - these now use raw socket,

More specifically, vhost would then be patched with:

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index aeffb3a..b54f9d6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -331,15 +331,26 @@ err:
 	return ERR_PTR(r);
 }
 
+static struct socket *get_tun_socket(int fd)
+{
+	struct file *file = fget(fd);
+	if (!file)
+		return ERR_PTR(-EBADF);
+	return tun_get_socket(file);
+}
+
 static struct socket *get_socket(int fd)
 {
 	struct socket *sock;
 	sock = get_raw_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	sock = get_tun_socket(fd);
+	if (!IS_ERR(sock))
+		return sock;
 	return ERR_PTR(-ENOTSOCK);
 }
 
 static long vhost_net_set_socket(struct vhost_net *n, int fd)
 {
 	struct socket *sock, *oldsock = NULL;

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-11  5:36     ` Michael S. Tsirkin
@ 2009-09-11  6:10       ` Eric Dumazet
  2009-09-11  9:44         ` Michael S. Tsirkin
  2009-09-14  8:01       ` Or Gerlitz
  1 sibling, 1 reply; 24+ messages in thread
From: Eric Dumazet @ 2009-09-11  6:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Paul Moore, David Miller, netdev, herbert

Michael S. Tsirkin a écrit :
> On Fri, Sep 11, 2009 at 07:59:43AM +0300, Michael S. Tsirkin wrote:
>> On Fri, Sep 11, 2009 at 12:17:27AM -0400, Paul Moore wrote:
>>> On Thursday 10 September 2009 08:59:29 am Michael S. Tsirkin wrote:
>>>> Tun device looks similar to a packet socket
>>>> in that both pass complete frames from/to userspace.
>>>>
>>>> This patch fills in enough fields in the socket underlying tun driver
>>>> to support sendmsg/recvmsg operations, and exports access to this socket
>>>> to modules.
>>>>
>>>> This way, code using raw sockets to inject packets
>>>> into a physical device, can support injecting
>>>> packets into host network stack almost without modification.
>>>>
>>>> First user of this interface will be vhost virtualization
>>>> accelerator.
>>> No comments on the code at this point - I'm just trying to understand the 
>>> intended user right now which I'm assuming is the vhost-net bits you sent 
>>> previously? 
>> Yes - these now use raw socket,
> 
> More specifically, vhost would then be patched with:
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index aeffb3a..b54f9d6 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -331,15 +331,26 @@ err:
>  	return ERR_PTR(r);
>  }
>  
> +static struct socket *get_tun_socket(int fd)
> +{
> +	struct file *file = fget(fd);
> +	if (!file)
> +		return ERR_PTR(-EBADF);
> +	return tun_get_socket(file);

This would leak a reference on file, if it happens not being a tun file 

> +}
> +
>  static struct socket *get_socket(int fd)
>  {
>  	struct socket *sock;
>  	sock = get_raw_socket(fd);
>  	if (!IS_ERR(sock))
>  		return sock;
> +	sock = get_tun_socket(fd);
> +	if (!IS_ERR(sock))
> +		return sock;
>  	return ERR_PTR(-ENOTSOCK);
>  }
>  
>  static long vhost_net_set_socket(struct vhost_net *n, int fd)
>  {
>  	struct socket *sock, *oldsock = NULL;


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-11  6:10       ` Eric Dumazet
@ 2009-09-11  9:44         ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-11  9:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Paul Moore, David Miller, netdev, herbert

On Fri, Sep 11, 2009 at 08:10:27AM +0200, Eric Dumazet wrote:
> Michael S. Tsirkin a écrit :
> > On Fri, Sep 11, 2009 at 07:59:43AM +0300, Michael S. Tsirkin wrote:
> >> On Fri, Sep 11, 2009 at 12:17:27AM -0400, Paul Moore wrote:
> >>> On Thursday 10 September 2009 08:59:29 am Michael S. Tsirkin wrote:
> >>>> Tun device looks similar to a packet socket
> >>>> in that both pass complete frames from/to userspace.
> >>>>
> >>>> This patch fills in enough fields in the socket underlying tun driver
> >>>> to support sendmsg/recvmsg operations, and exports access to this socket
> >>>> to modules.
> >>>>
> >>>> This way, code using raw sockets to inject packets
> >>>> into a physical device, can support injecting
> >>>> packets into host network stack almost without modification.
> >>>>
> >>>> First user of this interface will be vhost virtualization
> >>>> accelerator.
> >>> No comments on the code at this point - I'm just trying to understand the 
> >>> intended user right now which I'm assuming is the vhost-net bits you sent 
> >>> previously? 
> >> Yes - these now use raw socket,
> > 
> > More specifically, vhost would then be patched with:
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index aeffb3a..b54f9d6 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -331,15 +331,26 @@ err:
> >  	return ERR_PTR(r);
> >  }
> >  
> > +static struct socket *get_tun_socket(int fd)
> > +{
> > +	struct file *file = fget(fd);
> > +	if (!file)
> > +		return ERR_PTR(-EBADF);
> > +	return tun_get_socket(file);
> 
> This would leak a reference on file, if it happens not being a tun file 

Good catch, thanks! So it should be:

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index aeffb3a..e70f954 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -331,16 +331,30 @@ err:
 	return ERR_PTR(r);
 }
 
+static struct socket *get_tun_socket(int fd)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+	if (!file)
+		return ERR_PTR(-EBADF);
+	sock = tun_get_socket(file);
+	if (IS_ERR(sock))
+		fput(file);
+	return sock;
+}
+
 static struct socket *get_socket(int fd)
 {
 	struct socket *sock;
 	sock = get_raw_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	sock = get_tun_socket(fd);
+	if (!IS_ERR(sock))
+		return sock;
 	return ERR_PTR(-ENOTSOCK);
 }
 
-
 static long vhost_net_set_socket(struct vhost_net *n, int fd)
 {
 	struct socket *sock, *oldsock = NULL;

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-11  5:36     ` Michael S. Tsirkin
  2009-09-11  6:10       ` Eric Dumazet
@ 2009-09-14  8:01       ` Or Gerlitz
  1 sibling, 0 replies; 24+ messages in thread
From: Or Gerlitz @ 2009-09-14  8:01 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Paul Moore, David Miller, netdev, herbert, Or Gerlitz

On 9/11/09, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Fri, Sep 11, 2009 at 12:17:27AM -0400, Paul Moore wrote:
>>> No comments on the code at this point - I'm just trying to understand the
>>> intended user right now which I'm assuming is the vhost-net bits you sent previously

> More specifically, vhost would then be patched with:
>
>  diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>  index aeffb3a..b54f9d6 100644
>  --- a/drivers/vhost/net.c
>  +++ b/drivers/vhost/net.c
>  @@ -331,15 +331,26 @@ err:
>         return ERR_PTR(r);
>   }
>
>  +static struct socket *get_tun_socket(int fd)
>  +{
>  +       struct file *file = fget(fd);
>  +       if (!file)
>  +               return ERR_PTR(-EBADF);
>  +       return tun_get_socket(file);
>  +}
>  +
>   static struct socket *get_socket(int fd)

Michael,

your latest posting "[PATCHv5 3/3] vhost_net: a kernel-level virtio
server" doesn't have a function named get_socket, so I don't see how
one can really get what you are up with this snip from vhost/net.c

Or.

>   {
>         struct socket *sock;
>         sock = get_raw_socket(fd);
>
>         if (!IS_ERR(sock))
>                 return sock;
>
> +       sock = get_tun_socket(fd);
>  +       if (!IS_ERR(sock))
>  +               return sock;
>         return ERR_PTR(-ENOTSOCK);
>   }
>
>   static long vhost_net_set_socket(struct vhost_net *n, int fd)
>   {
>         struct socket *sock, *oldsock = NULL;
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-10 12:59 [PATCH RFC] tun: export underlying socket Michael S. Tsirkin
  2009-09-10 13:19 ` Eric Dumazet
  2009-09-11  4:17 ` Paul Moore
@ 2009-09-14  8:07 ` Or Gerlitz
  2009-09-14  8:09   ` Michael S. Tsirkin
  2009-11-02 17:20 ` [PATCHv2] " Michael S. Tsirkin
  3 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2009-09-14  8:07 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, m.s.tsirkin, netdev, herbert, Or Gerlitz

On 9/10/09, Michael S. Tsirkin <mst@redhat.com> wrote:
>  This way, code using raw sockets to inject packets into a physical device,
>  can support injecting packets into host network stack almost without modification.
>  First user of this interface will be vhost virtualization accelerator.

vhost injects packets into a physical device; what is the use case of
vhost injecting packets into the host network stack?

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14  8:07 ` Or Gerlitz
@ 2009-09-14  8:09   ` Michael S. Tsirkin
  2009-09-14  8:17     ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-14  8:09 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: David Miller, netdev, herbert, Or Gerlitz

On Mon, Sep 14, 2009 at 11:07:25AM +0300, Or Gerlitz wrote:
> On 9/10/09, Michael S. Tsirkin <mst@redhat.com> wrote:
> >  This way, code using raw sockets to inject packets into a physical device,
> >  can support injecting packets into host network stack almost without modification.
> >  First user of this interface will be vhost virtualization accelerator.
> 
> vhost injects packet into physical device, what is the use case of
> vhost injecting packet into the host network stack?

The case where the user requires bridging, typically.

> Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14  8:09   ` Michael S. Tsirkin
@ 2009-09-14  8:17     ` Or Gerlitz
  2009-09-14  9:11       ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2009-09-14  8:17 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, herbert

Michael S. Tsirkin wrote:
> On Mon, Sep 14, 2009 at 11:07:25AM +0300, Or Gerlitz wrote:
>   
>> vhost injects packets into physical device, what is the use case of vhost injecting packets into the host network stack?
>>     
> The case where the user requires bridging, typically.
>   
So you want to support a scheme where someone wants to attach vhost to a 
bridge? why not just telling these users to just set their vhost on top 
of veth couple, such that one veth device is added to a bridge as 
interface and a vhost instance is tied to the other veth device?


Or.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14  8:17     ` Or Gerlitz
@ 2009-09-14  9:11       ` Michael S. Tsirkin
  2009-09-14  9:43         ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-14  9:11 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: David Miller, netdev, herbert

On Mon, Sep 14, 2009 at 11:17:14AM +0300, Or Gerlitz wrote:
> Michael S. Tsirkin wrote:
>> On Mon, Sep 14, 2009 at 11:07:25AM +0300, Or Gerlitz wrote:
>>   
>>> vhost injects packets into physical device, what is the use case of vhost injecting packets into the host network stack?
>>>     
>> The case where the user requires bridging, typically.
>>   
> So you want to support a scheme where someone wants to attach vhost to a  
> bridge? why not just telling these users to just set their vhost on top  
> of veth couple, such that one veth device is added to a bridge as  
> interface and a vhost instance is tied to the other veth device?
>
>
> Or.

That's already possible. However virtualization users are familiar with
configuring the tun device, and tun has grown virtualization-specific
extensions, so I don't see a reason not to accommodate these uses.

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14  9:11       ` Michael S. Tsirkin
@ 2009-09-14  9:43         ` Or Gerlitz
  2009-09-14 10:10           ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2009-09-14  9:43 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, herbert

Michael S. Tsirkin wrote:
> That's already possible. However virtualization users are familiar with configuring the tun device, and tun has grown virtualization-specific extensions, so I don't see a reason not to accomodate these uses
Today packets are written/read from/to Qemu to/from tun device, how 
would the use case with vhost will look like?

Is this the user setting an uplink NIC + bridge + per VM tun device but 
the packets will go from/to virtio-net in the guest kernel to/from vhost 
in the host kernel and then from/to vhost to/from tun? so eventually no 
packets will be seen by the qemu process? I don't see what this scheme 
buys people; I am very much confused.

Or.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14  9:43         ` Or Gerlitz
@ 2009-09-14 10:10           ` Michael S. Tsirkin
  2009-09-14 14:06             ` Or Gerlitz
  0 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-14 10:10 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: David Miller, netdev, herbert

On Mon, Sep 14, 2009 at 12:43:02PM +0300, Or Gerlitz wrote:
> Michael S. Tsirkin wrote:
>> That's already possible. However virtualization users are familiar
>> with configuring the tun device, and tun has grown
>> virtualization-specific extensions, so I don't see a reason not to
>> accomodate these uses
> Today packets are written/read from/to Qemu to/from tun device, how  
> would the use case with vhost will look like?

- Configure bridge and tun using existing scripts
- pass tun fd to vhost via an ioctl
- vhost calls tun_get_socket
- from this point, guest networking just goes faster

> Is this the user setting an uplink NIC + bridge + per VM tun device but  
> the packets will go from/to virtio-net in the guest kernel to/from vhost  
> in the host kernel and then from/to vhost to/from tun? so eventually no  
> packets will be seen by the qemu process? I don't see what these scheme  
> buys people, I got very much confused.
>
> Or.

A lot of people have asked for tun support in vhost, because qemu
currently uses tun.  With this scheme existing code and scripts can be
used to configure both tun and bridge.  You also can utilize
virtualization-specific features in tun.

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14 10:10           ` Michael S. Tsirkin
@ 2009-09-14 14:06             ` Or Gerlitz
  2009-09-14 15:03               ` Herbert Xu
  2009-09-14 15:40               ` Michael S. Tsirkin
  0 siblings, 2 replies; 24+ messages in thread
From: Or Gerlitz @ 2009-09-14 14:06 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, netdev, herbert

Michael S. Tsirkin wrote:
>> how  would the use case with vhost will look like?
> - Configure bridge and tun using existing scripts
> - pass tun fd to vhost via an ioctl
> - vhost calls tun_get_socket
> - from this point, guest networking just goes faster

let me see I am with you:

1. vhost gets from user space through ioctl packet socket fd OR tun fd - 
but never both

2. for packet socket fd
VM.TX is translated by vhost to sendmsg which goes through the NIC
NIC RX  makes the fd poll to signal and then recvmsg is called on the 
fd, then vhost places the packet in a virtq

3. for tun fd
VM.TX is translated by vhost to sendmsg which is translated by tun to 
netif_rx which is then handled by the bridge
NIC RX  goes to the bridge which xmits the packet to a tun interface, now 
what makes tun provide this packet to vhost and how is it done?


> A lot of people have asked for tun support in vhost, because qemu currently uses tun.  With this scheme existing code and scripts can be used to configure both tun and bridge.  You also can utilize virtualization-specific features in tun.
Tun has code to support some virtualization-specific features, however, 
it has also some inherent problems, I think, for example, you don't know 
over which NIC eventually a packet will be sent and as such, the feature 
advertising to the guest (virtio-net) NIC is problematic, for example, 
TSO. With vhost, since you are directly attached to a NIC and assuming 
it's a PF or VF NIC and not something like macvlan/veth, you can actually 
know what features are supported by this NIC.

Or.




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14 14:06             ` Or Gerlitz
@ 2009-09-14 15:03               ` Herbert Xu
  2009-09-15 13:02                 ` Or Gerlitz
  2009-09-14 15:40               ` Michael S. Tsirkin
  1 sibling, 1 reply; 24+ messages in thread
From: Herbert Xu @ 2009-09-14 15:03 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Michael S. Tsirkin, David Miller, netdev

On Mon, Sep 14, 2009 at 05:06:52PM +0300, Or Gerlitz wrote:
>
>> A lot of people have asked for tun support in vhost, because qemu currently uses tun.  With this scheme existing code and scripts can be used to configure both tun and bridge.  You also can utilize virtualization-specific features in tun.
> Tun has code to support some virtualization-specific features, however,  
> it has also some inherent problems, I think, for example, you don't know  
> over which NIC eventually a packet will be sent and as such, the feature  
> advertising to the guest (virtio-net) NIC is problematic, for example,  
> TSO. With vhost, since you are directly attached to a NIC and assuming  
> its a PF or VF NIC and not something like macvlan/veth you can actually  
> know what features are supported by this NIC.

TSO is not a problem because we provide a software fallback when
the hardware does not support it.  So guests should always enable
TSO if they support it and not worry about the physical NIC.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14 14:06             ` Or Gerlitz
  2009-09-14 15:03               ` Herbert Xu
@ 2009-09-14 15:40               ` Michael S. Tsirkin
  2009-09-15 13:11                 ` Or Gerlitz
  1 sibling, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-14 15:40 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: David Miller, netdev, herbert

On Mon, Sep 14, 2009 at 05:06:52PM +0300, Or Gerlitz wrote:
> Michael S. Tsirkin wrote:
>>> how  would the use case with vhost will look like?
>> - Configure bridge and tun using existing scripts
>> - pass tun fd to vhost via an ioctl
>> - vhost calls tun_get_socket
>> - from this point, guest networking just goes faster
>
> let me see I am with you:
>
> 1. vhost gets from user space through ioctl packet socket fd OR tun fd -  
> but never both

Right

> 2. for packet socket fd
> VM.TX is translated by vhost to sendmsg which goes through the NIC
> NIC RX  makes the fd poll to signal and then recvmsg is called on the  
> fd, then vhost places the packet in a virtq
>
> 3. for tun fd
> VM.TX is translated by vhost to sendmsg which is translated by tun to  
> netif_rx which is then handled by the bridge
> NIC RX  goes to the bridge which xmits the packet a tun interface, now  
> what makes tun provide this packet to vhost and how it is done?

Same as above. vhost polls tun and calls recvmsg on the socket.

>
>> A lot of people have asked for tun support in vhost, because qemu
>> currently uses tun.  With this scheme existing code and scripts can
>> be used to configure both tun and bridge.  You also can utilize
>> virtualization-specific features in tun.

( broken too-long lines up. please do not merge them. )

> Tun has code to support some virtualization-specific features, however,  
> it has also some inherent problems, I think, for example, you don't know  
> over which NIC eventually a packet will be sent and as such, the feature  
> advertising to the guest (virtio-net) NIC is problematic,
> for example,  
> TSO. With vhost, since you are directly attached to a NIC and assuming  
> its a PF or VF NIC and not something like macvlan/veth you can actually  
> know what features are supported by this NIC.
>
> Or.

Herbert addressed the TSO example.

Generally, feature negotiation does become more complicated in bridged
configurations, but some users require bridging. So with vhost, feature
negotiation is mostly done in userspace (e.g. vhost does not expose a
TSO capability, devices do this already); vhost itself only cares about
virtio features such as mergeable buffers.
Policy decisions, including whether to use packet socket or
tun+bridge, are up to the user.

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14 15:03               ` Herbert Xu
@ 2009-09-15 13:02                 ` Or Gerlitz
  2009-09-15 13:31                   ` Herbert Xu
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2009-09-15 13:02 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Michael S. Tsirkin, David Miller, netdev

Herbert Xu wrote:
> TSO is not a problem because we provide a software fallback when
> the hardware does not support it.  So guests should always enable
> TSO if they support it and not worry about the physical NIC.

Yes, I am aware that GSO can take action and so software segmentation is / can be done when the HW isn't capable of doing so (and I assume the same can go for checksum). I assume that by "we provide" you mean the core networking code, correct? in what level? I assume it's not the TCP one, since this is not applicable to a tun/bridge scheme. A simpler example would be the vm nic MTU... with vhost attached directly to a nic with packet socket you can learn/control it all, whereas when tun/bridge is used, how can you tell?

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-14 15:40               ` Michael S. Tsirkin
@ 2009-09-15 13:11                 ` Or Gerlitz
  2009-09-15 13:18                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 24+ messages in thread
From: Or Gerlitz @ 2009-09-15 13:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Or Gerlitz, David Miller, netdev, herbert

On 9/14/09, Michael S. Tsirkin <mst@redhat.com> wrote:
>> 3. for tun fd
>> VM.TX is translated by vhost to sendmsg which is translated by tun to
>> netif_rx which is then handled by the bridge
>> NIC RX  goes to the bridge which xmits the packet to a tun interface, now
>> what makes tun provide this packet to vhost and how it is done?

> Same as above. vhost polls tun and calls recvmsg on the socket

no, correct me if I'm wrong, but recvmsg calls tun_do_read and the latter
calls tun_put_user so the packet goes to user space and not into the virtq

Or.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-15 13:11                 ` Or Gerlitz
@ 2009-09-15 13:18                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-09-15 13:18 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Or Gerlitz, David Miller, netdev, herbert

On Tue, Sep 15, 2009 at 04:11:22PM +0300, Or Gerlitz wrote:
> On 9/14/09, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> 3. for tun fd
> >> VM.TX is translated by vhost to sendmsg which is translated by tun to
> >> netif_rx which is then handled by the bridge
> >> NIC RX  goes to the bridge which xmits the packet to a tun interface, now
> >> what makes tun provide this packet to vhost and how it is done?
> 
> > Same as above. vhost polls tun and calls recvmsg on the socket
> 
> no, correct me if I'm wrong, but recvmsg calls tun_do_read and the latter
> calls tun_put_user so the packet goes to user space and not into the virtq
> 
> Or.

It ends up in the virtq because that maps userspace addresses.
This is the same as with packet sockets, really.

-- 
MST

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH RFC] tun: export underlying socket
  2009-09-15 13:02                 ` Or Gerlitz
@ 2009-09-15 13:31                   ` Herbert Xu
  0 siblings, 0 replies; 24+ messages in thread
From: Herbert Xu @ 2009-09-15 13:31 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Michael S. Tsirkin, David Miller, netdev

On Tue, Sep 15, 2009 at 04:02:35PM +0300, Or Gerlitz wrote:
>
> Yes, I am aware that GSO can take action and so software segmentation is / can be done when the HW isn't capable of doing so (and I assume the same can go for checksum). I assume that by "we provide" you mean the core networking code, correct? in what level? I assume its not the TCP one, since this is not applicable to a tun/bridge scheme. Simpler example would be the vm nic MTU... with vhost attached directly to a nic with packet socket you can learn/control it all, where when tun/bridge is used, how can you tell?

You handle it the same way as how you've always done with MTUs,
ensure that the MTU is set to the maximum required by all the
guests.  If that means the guests cannot use jumboframes then
that's what you're stuck with.

This is the whole reason why TSO exists in the first place.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCHv2] tun: export underlying socket
  2009-09-10 12:59 [PATCH RFC] tun: export underlying socket Michael S. Tsirkin
                   ` (2 preceding siblings ...)
  2009-09-14  8:07 ` Or Gerlitz
@ 2009-11-02 17:20 ` Michael S. Tsirkin
  2009-11-03 15:10   ` Herbert Xu
  3 siblings, 1 reply; 24+ messages in thread
From: Michael S. Tsirkin @ 2009-11-02 17:20 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: Paul Moore, netdev, herbert

Tun device looks similar to a packet socket
in that both pass complete frames from/to userspace.

This patch fills in enough fields in the socket underlying tun driver
to support sendmsg/recvmsg operations, and message flags
MSG_TRUNC and MSG_DONTWAIT, and exports access to this socket
to modules.  Regular read/write behaviour is unchanged.

This way, code using raw sockets to inject packets
into a physical device, can support injecting
packets into host network stack almost without modification.

First user of this interface will be vhost virtualization
accelerator.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Changes from v1:
addressed comments from review

This patch is on top of net-next master.

vhost which is the first user of this interface is intended to be merged
through Rusty's virtio tree, so I think it's easiest to merge this patch
through that tree, as well, as part of the vhost patchset.
Makes sense? Please comment.

Usage example in vhost is here:
http://git.kernel.org/?p=linux/kernel/git/mst/vhost.git;a=commitdiff;h=030c5e25539e8fb14ca9130621710b7a006f3c1c

An alternative approach would be to make sendmsg work on tun fd from
userspace as well as from kernel: a uniform way to work with a network
device and the host stack might be useful there, as well.  Kernel users
could then do sockfd_lookup to get the socket.  I decided against it for
now as it requires more code and touches net core.
Please comment.

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4fdfa2a..18f8876 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -144,6 +144,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
 	err = 0;
 	tfile->tun = tun;
 	tun->tfile = tfile;
+	tun->socket.file = file;
 	dev_hold(tun->dev);
 	sock_hold(tun->socket.sk);
 	atomic_inc(&tfile->count);
@@ -158,6 +159,7 @@ static void __tun_detach(struct tun_struct *tun)
 	/* Detach from net device */
 	netif_tx_lock_bh(tun->dev);
 	tun->tfile = NULL;
+	tun->socket.file = NULL;
 	netif_tx_unlock_bh(tun->dev);
 
 	/* Drop read queue */
@@ -387,7 +389,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
-	wake_up_interruptible(&tun->socket.wait);
+	wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
+				   POLLRDNORM | POLLRDBAND);
 	return NETDEV_TX_OK;
 
 drop:
@@ -743,7 +746,7 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	len = min_t(int, skb->len, len);
 
 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
-	total += len;
+	total += skb->len;
 
 	tun->dev->stats.tx_packets++;
 	tun->dev->stats.tx_bytes += len;
@@ -751,34 +754,23 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
-static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
-			    unsigned long count, loff_t pos)
+static ssize_t tun_do_read(struct tun_struct *tun,
+			   struct kiocb *iocb, const struct iovec *iv,
+			   ssize_t len, int noblock)
 {
-	struct file *file = iocb->ki_filp;
-	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun = __tun_get(tfile);
 	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
-	ssize_t len, ret = 0;
-
-	if (!tun)
-		return -EBADFD;
+	ssize_t ret = 0;
 
 	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);
 
-	len = iov_length(iv, count);
-	if (len < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	add_wait_queue(&tun->socket.wait, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
 		/* Read frames from the queue */
 		if (!(skb=skb_dequeue(&tun->socket.sk->sk_receive_queue))) {
-			if (file->f_flags & O_NONBLOCK) {
+			if (noblock) {
 				ret = -EAGAIN;
 				break;
 			}
@@ -805,6 +797,27 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&tun->socket.wait, &wait);
 
+	return ret;
+}
+
+static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
+			    unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct tun_file *tfile = file->private_data;
+	struct tun_struct *tun = __tun_get(tfile);
+	ssize_t len, ret;
+
+	if (!tun)
+		return -EBADFD;
+	len = iov_length(iv, count);
+	if (len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = tun_do_read(tun, iocb, iv, len, file->f_flags & O_NONBLOCK);
+	ret = min_t(ssize_t, ret, len);
 out:
 	tun_put(tun);
 	return ret;
@@ -847,7 +860,8 @@ static void tun_sock_write_space(struct sock *sk)
 		return;
 
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible_sync(sk->sk_sleep);
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+						POLLWRNORM | POLLWRBAND);
 
 	tun = container_of(sk, struct tun_sock, sk)->tun;
 	kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
@@ -858,6 +872,37 @@ static void tun_sock_destruct(struct sock *sk)
 	free_netdev(container_of(sk, struct tun_sock, sk)->tun->dev);
 }
 
+static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len)
+{
+	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+	return tun_get_user(tun, m->msg_iov, total_len,
+			    m->msg_flags & MSG_DONTWAIT);
+}
+
+static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
+		       struct msghdr *m, size_t total_len,
+		       int flags)
+{
+	struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
+	int ret;
+	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
+		return -EINVAL;
+	ret = tun_do_read(tun, iocb, m->msg_iov, total_len,
+			  flags & MSG_DONTWAIT);
+	if (ret > total_len) {
+		m->msg_flags |= MSG_TRUNC;
+		ret = flags & MSG_TRUNC ? ret : total_len;
+	}
+	return ret;
+}
+
+/* Ops structure to mimic raw sockets with tun */
+static const struct proto_ops tun_socket_ops = {
+	.sendmsg = tun_sendmsg,
+	.recvmsg = tun_recvmsg,
+};
+
 static struct proto tun_proto = {
 	.name		= "tun",
 	.owner		= THIS_MODULE,
@@ -986,6 +1031,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			goto err_free_dev;
 
 		init_waitqueue_head(&tun->socket.wait);
+		tun->socket.ops = &tun_socket_ops;
 		sock_init_data(&tun->socket, sk);
 		sk->sk_write_space = tun_sock_write_space;
 		sk->sk_sndbuf = INT_MAX;
@@ -1489,6 +1535,23 @@ static void tun_cleanup(void)
 	rtnl_link_unregister(&tun_link_ops);
 }
 
+/* Get an underlying socket object from tun file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *tun_get_socket(struct file *file)
+{
+	struct tun_struct *tun;
+	if (file->f_op != &tun_fops)
+		return ERR_PTR(-EINVAL);
+	tun = tun_get(file);
+	if (!tun)
+		return ERR_PTR(-EBADFD);
+	tun_put(tun);
+	return &tun->socket;
+}
+EXPORT_SYMBOL_GPL(tun_get_socket);
+
 module_init(tun_init);
 module_exit(tun_cleanup);
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3f5fd52..404abe0 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -86,4 +86,18 @@ struct tun_filter {
 	__u8   addr[0][ETH_ALEN];
 };
 
+#ifdef __KERNEL__
+#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
+struct socket *tun_get_socket(struct file *);
+#else
+#include <linux/err.h>
+#include <linux/errno.h>
+struct file;
+struct socket;
+static inline struct socket *tun_get_socket(struct file *f)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_TUN */
+#endif /* __KERNEL__ */
 #endif /* __IF_TUN_H */

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCHv2] tun: export underlying socket
  2009-11-02 17:20 ` [PATCHv2] " Michael S. Tsirkin
@ 2009-11-03 15:10   ` Herbert Xu
  0 siblings, 0 replies; 24+ messages in thread
From: Herbert Xu @ 2009-11-03 15:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, Eric Dumazet, Paul Moore, netdev

On Mon, Nov 02, 2009 at 07:20:22PM +0200, Michael S. Tsirkin wrote:
> Tun device looks similar to a packet socket
> in that both pass complete frames from/to userspace.
> 
> This patch fills in enough fields in the socket underlying tun driver
> to support sendmsg/recvmsg operations, and message flags
> MSG_TRUNC and MSG_DONTWAIT, and exports access to this socket
> to modules.  Regular read/write behaviour is unchanged.
> 
> This way, code using raw sockets to inject packets
> into a physical device, can support injecting
> packets into host network stack almost without modification.
> 
> First user of this interface will be vhost virtualization
> accelerator.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2009-11-03 15:10 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-10 12:59 [PATCH RFC] tun: export underlying socket Michael S. Tsirkin
2009-09-10 13:19 ` Eric Dumazet
2009-09-10 13:27   ` Michael S. Tsirkin
2009-09-11  4:17 ` Paul Moore
2009-09-11  4:59   ` Michael S. Tsirkin
2009-09-11  5:36     ` Michael S. Tsirkin
2009-09-11  6:10       ` Eric Dumazet
2009-09-11  9:44         ` Michael S. Tsirkin
2009-09-14  8:01       ` Or Gerlitz
2009-09-14  8:07 ` Or Gerlitz
2009-09-14  8:09   ` Michael S. Tsirkin
2009-09-14  8:17     ` Or Gerlitz
2009-09-14  9:11       ` Michael S. Tsirkin
2009-09-14  9:43         ` Or Gerlitz
2009-09-14 10:10           ` Michael S. Tsirkin
2009-09-14 14:06             ` Or Gerlitz
2009-09-14 15:03               ` Herbert Xu
2009-09-15 13:02                 ` Or Gerlitz
2009-09-15 13:31                   ` Herbert Xu
2009-09-14 15:40               ` Michael S. Tsirkin
2009-09-15 13:11                 ` Or Gerlitz
2009-09-15 13:18                   ` Michael S. Tsirkin
2009-11-02 17:20 ` [PATCHv2] " Michael S. Tsirkin
2009-11-03 15:10   ` Herbert Xu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.