linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] High-speed tun receive and xmit
@ 2008-04-18  4:33 Rusty Russell
  2008-04-18  4:35 ` [PATCH 1/5] virtio: put last_used and last_avail index into ring itself Rusty Russell
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:33 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

kvm (and lguest!) want to get more speed out of the tun device.  We already 
have an ABI for guest<->host comms, called virtio_ring; extending tun to 
understand this (with its async nature and batching) make for an efficient 
network.

But moreover: the same things that make virtio a good guest<->host transport 
make it appealing as a userspace<->kernel transport.  So rather than do 
something just for tun, we create a /dev/vring.  We could do a system call, 
but /dev/vring is sufficient.  It is a nice, simple format for normal tun 
usage too.

We do need a small (backwards-compatible) extension to the ring format if 
we're going to do this however, since we didn't previously expose the 
consumer's view of the producer (and vice verse) in the ring because the 
other end doesn't need to see it.  However, it means that a middleman (as the 
kvm or lguest launcher process now becomes) cannot save and restore all the 
ring state.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/5] virtio: put last_used and last_avail index into ring itself.
  2008-04-18  4:33 [PATCH 0/5] High-speed tun receive and xmit Rusty Russell
@ 2008-04-18  4:35 ` Rusty Russell
  2008-04-18  4:39   ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Rusty Russell
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:35 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

Generally, the other end of the virtio ring doesn't need to see where
you're up to in consuming the ring.  However, in order for an external
entity to understand it, it must be exposed.  For example, if you want
to save and restore a virtio_ring, but you're not the consumer because
the kernel is using it directly.

Fortunately, we have room to expand: the ring is always a whole number
of pages and there's hundreds of bytes of padding after the avail ring
and the used ring, whatever the number of descriptors (which must be a
power of 2).

We add a feature bit so the guest can tell the host that it's writing
out the current value there, if it wants to use that.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c  |   20 ++++++++++++--------
 include/linux/virtio_config.h |    3 +++
 include/linux/virtio_ring.h   |   12 +++++++++++-
 3 files changed, 26 insertions(+), 9 deletions(-)

diff -r 1349b6a821c7 drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c	Thu Apr 17 05:58:09 2008 +1000
+++ b/drivers/virtio/virtio_ring.c	Thu Apr 17 06:15:26 2008 +1000
@@ -18,6 +18,7 @@
  */
 #include <linux/virtio.h>
 #include <linux/virtio_ring.h>
+#include <linux/virtio_config.h>
 #include <linux/device.h>
 
 #ifdef DEBUG
@@ -51,9 +52,6 @@ struct vring_virtqueue
 	unsigned int free_head;
 	/* Number we've added since last sync. */
 	unsigned int num_added;
-
-	/* Last used index we've seen. */
-	u16 last_used_idx;
 
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	void (*notify)(struct virtqueue *vq);
@@ -173,12 +171,13 @@ static void detach_buf(struct vring_virt
 
 static inline bool more_used(const struct vring_virtqueue *vq)
 {
-	return vq->last_used_idx != vq->vring.used->idx;
+	return vring_last_used(&vq->vring) != vq->vring.used->idx;
 }
 
 static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
+	struct vring_used_elem *u;
 	void *ret;
 	unsigned int i;
 
@@ -195,8 +194,11 @@ static void *vring_get_buf(struct virtqu
 		return NULL;
 	}
 
-	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
-	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
+	u = &vq->vring.used->ring[vring_last_used(&vq->vring) % vq->vring.num];
+	i = u->id;
+	*len = u->len;
+	/* Make sure we don't reload i after doing checks. */
+	rmb();
 
 	if (unlikely(i >= vq->vring.num)) {
 		BAD_RING(vq, "id %u out of range\n", i);
@@ -210,7 +212,7 @@ static void *vring_get_buf(struct virtqu
 	/* detach_buf clears data, so grab it now. */
 	ret = vq->data[i];
 	detach_buf(vq, i);
-	vq->last_used_idx++;
+	vring_last_used(&vq->vring)++;
 	END_USE(vq);
 	return ret;
 }
@@ -302,7 +304,6 @@ struct virtqueue *vring_new_virtqueue(un
 	vq->vq.vq_ops = &vring_vq_ops;
 	vq->notify = notify;
 	vq->broken = false;
-	vq->last_used_idx = 0;
 	vq->num_added = 0;
 #ifdef DEBUG
 	vq->in_use = false;
@@ -318,6 +319,9 @@ struct virtqueue *vring_new_virtqueue(un
 	for (i = 0; i < num-1; i++)
 		vq->vring.desc[i].next = i+1;
 
+	/* We now publish our last_used index in the rings. */
+	vdev->config->feature(vdev, VIRTIO_RING_F_PUBLISH_INDICES);
+
 	return &vq->vq;
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
diff -r 1349b6a821c7 include/linux/virtio_config.h
--- a/include/linux/virtio_config.h	Thu Apr 17 05:58:09 2008 +1000
+++ b/include/linux/virtio_config.h	Thu Apr 17 06:15:26 2008 +1000
@@ -14,6 +14,9 @@
 #define VIRTIO_CONFIG_S_DRIVER_OK	4
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED		0x80
+
+/* Transport virtio feature bits (currently bits 24 to 32 are reserved
+ * for this), the rest are per-device feature bits. */
 
 #ifdef __KERNEL__
 struct virtio_device;
diff -r 1349b6a821c7 include/linux/virtio_ring.h
--- a/include/linux/virtio_ring.h	Thu Apr 17 05:58:09 2008 +1000
+++ b/include/linux/virtio_ring.h	Thu Apr 17 06:15:26 2008 +1000
@@ -23,6 +23,9 @@
  * when you consume a buffer.  It's unreliable, so it's simply an
  * optimization.  */
 #define VRING_AVAIL_F_NO_INTERRUPT	1
+
+/* We publish our last-seen used index at the end of the avail ring. */
+#define VIRTIO_RING_F_PUBLISH_INDICES	24
 
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc
@@ -82,6 +85,7 @@ struct vring {
  *	__u16 avail_flags;
  *	__u16 avail_idx;
  *	__u16 available[num];
+ *	__u16 last_used_idx;
  *
  *	// Padding to the next page boundary.
  *	char pad[];
@@ -90,6 +94,7 @@ struct vring {
  *	__u16 used_flags;
  *	__u16 used_idx;
  *	struct vring_used_elem used[num];
+ *	__u16 last_avail_idx;
  * };
  */
 static inline void vring_init(struct vring *vr, unsigned int num, void *p,
@@ -106,8 +111,13 @@ static inline unsigned vring_size(unsign
 {
 	return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num)
 		 + pagesize - 1) & ~(pagesize - 1))
-		+ sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num;
+		+ sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num + 2;
 }
+
+/* We publish the last-seen used index at the end of the available ring, and
+ * vice-versa.  These are at the end for backwards compatibility. */
+#define vring_last_used(vr) ((vr)->avail->ring[(vr)->num])
+#define vring_last_avail(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num])
 
 #ifdef __KERNEL__
 #include <linux/irqreturn.h>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18  4:35 ` [PATCH 1/5] virtio: put last_used and last_avail index into ring itself Rusty Russell
@ 2008-04-18  4:39   ` Rusty Russell
  2008-04-18  4:41     ` [PATCH 3/5] /dev/vring limit and base ioctls Rusty Russell
                       ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:39 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

virtio introduced a ring structure ABI for guest-host communications
(currently used by lguest and kvm).  Using this same ABI, we can
create a nice fd version.

This is useful for efficiently passing packets to and from the tun,
for example.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/char/Kconfig  |    9 +
 drivers/char/Makefile |    2 
 drivers/char/vring.c  |  400 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/vring.h |   58 +++++++
 4 files changed, 469 insertions(+)

diff -r b2d9869d338f drivers/char/Kconfig
--- a/drivers/char/Kconfig	Fri Apr 18 10:33:58 2008 +1000
+++ b/drivers/char/Kconfig	Fri Apr 18 13:35:16 2008 +1000
@@ -1049,5 +1049,14 @@ config DEVPORT
 
 source "drivers/s390/char/Kconfig"
 
+config VRING
+       tristate "/dev/vring support (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         vring is a ringbuffer implementation for efficient I/O.  It is
+	 currently used by virtualization hosts (lguest, kvm) for efficient
+	 networking using the tun driver.
+
+	 If unsure, say N, but there's a part of you that wants to say M.
 endmenu
 
diff -r b2d9869d338f drivers/char/Makefile
--- a/drivers/char/Makefile	Fri Apr 18 10:33:58 2008 +1000
+++ b/drivers/char/Makefile	Fri Apr 18 13:35:16 2008 +1000
@@ -112,6 +112,8 @@ obj-$(CONFIG_JS_RTC)		+= js-rtc.o
 obj-$(CONFIG_JS_RTC)		+= js-rtc.o
 js-rtc-y = rtc.o
 
+obj-$(CONFIG_VRING)		+= vring.o
+
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
 
diff -r b2d9869d338f drivers/char/vring.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drivers/char/vring.c	Fri Apr 18 13:35:16 2008 +1000
@@ -0,0 +1,400 @@
+/* Ring-buffer device implementation.
+ *
+ *  Copyright 2008 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/virtio_ring.h>
+#include <linux/vring.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+struct vring_info {
+	struct mutex lock;
+
+	struct vring ring;
+	u16 mask;
+	u16 last_used;
+
+	const struct vring_ops *ops;
+	void *ops_data;
+
+	/* Waitqueue for poll() */
+	wait_queue_head_t poll_wait;
+};
+
+static unsigned int vring_poll(struct file *filp,
+			       struct poll_table_struct *poll)
+{
+	struct vring_info *vr = filp->private_data;
+	unsigned int mask;
+	u16 used = 0;
+
+	/* Poll can't error, so let's not go silly here. */
+	get_user(used, &vr->ring.used->idx);
+
+	/* More buffers have been used?  It's 'readable'. */
+	if (used != vr->last_used)
+		mask = POLLIN | POLLRDNORM;
+	else {
+		mask = 0;
+		/* If we need to pull, it's also readable. */
+		mutex_lock(&vr->lock);
+		if (vr->ops && vr->ops->needs_pull) {
+			if (vr->ops->needs_pull(vr->ops_data))
+				mask = POLLIN | POLLRDNORM;
+		}
+		mutex_unlock(&vr->lock);
+	}
+
+	poll_wait(filp, &vr->poll_wait, poll);
+
+	return mask;
+}
+
+/* Read may not be necessary for all use cases, in fact. */
+static ssize_t vring_read(struct file *filp, char __user *buf,
+			  size_t size, loff_t *off)
+{
+	struct vring_info *vr = filp->private_data;
+	int err;
+
+	/* Some uses of vrings require updating in user context.  This
+	 * is best done close to the caller, ie. here. */
+	mutex_lock(&vr->lock);
+	if (vr->ops && vr->ops->pull)
+		err = vr->ops->pull(vr->ops_data);
+	else
+		err = 0;
+	mutex_unlock(&vr->lock);
+
+	/* Update our last_used value to clear the poll. */
+	if (!err)
+		err = get_user(vr->last_used, &vr->ring.used->idx);
+
+	return err;
+}
+
+/* Write kicks the other end to say we have buffers. */
+static ssize_t vring_write(struct file *filp, const char __user *buf,
+			   size_t size, loff_t *off)
+{
+	struct vring_info *vr = filp->private_data;
+	int err;
+
+	mutex_lock(&vr->lock);
+	if (vr->ops && vr->ops->push)
+		err = vr->ops->push(vr->ops_data);
+	else
+		err = 0;
+	mutex_unlock(&vr->lock);
+
+	return err;
+}
+
+/* We assume anyone attached holds a reference, so this won't mess them up */
+static int vring_release(struct inode *inode, struct file *filp)
+{
+	struct vring_info *vr = filp->private_data;
+
+	kfree(vr);
+	return 0;
+}
+
+static int vring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long size, num_descs;
+	struct vring_info *vr = filp->private_data;
+	int err;
+
+	/* We overload mmap's offset to hold the ring number. */
+	num_descs = vma->vm_pgoff;
+
+	/* Must be a power of two, and limit indices to a u16. */
+	if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)
+		return -EINVAL;
+
+	/* mmap size must be what we expect for such a ring. */
+	size = vma->vm_end - vma->vm_start;
+	if (size != ALIGN(vring_size(num_descs, PAGE_SIZE), PAGE_SIZE))
+		return -EINVAL;
+
+	/* We only let them map this in one place. */
+	mutex_lock(&vr->lock);
+	if (vr->ring.num != 0) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	vring_init(&vr->ring, num_descs, (void *)vma->vm_start, PAGE_SIZE);
+
+	vr->mask = num_descs - 1;
+	err = 0;
+
+unlock:
+	mutex_unlock(&vr->lock);
+	return err;
+}
+
+static int vring_open(struct inode *in, struct file *filp)
+{
+	struct vring_info *vr;
+
+	filp->private_data = vr = kzalloc(sizeof(*vr), GFP_KERNEL);
+	if (!vr)
+		return -ENOMEM;
+
+	init_waitqueue_head(&vr->poll_wait);
+	mutex_init(&vr->lock);
+	return 0;
+}
+
+static const struct file_operations vring_fops = {
+	.open		= vring_open,
+	.release	= vring_release,
+	.mmap		= vring_mmap,
+	.read		= vring_read,
+	.write		= vring_write,
+	.poll		= vring_poll,
+};
+
+/**
+ * vring_get_buffer - get a buffer from the vring
+ * @vr: the vring
+ * @in_iov: the iovec array for input buffers
+ * @num_in: the size of the in_iov array, updated by this function.
+ * @in_len: the total length of in_iov after this function.
+ * @out_iov: the iovec array for output buffers
+ * @num_out: the size of the ut_iov array, updated by this function.
+ * @out_len: the total length of out_iov after this function.
+ *
+ * A vring buffer is an array of input and output parts.  This gets the next
+ * available buffer, and returns a non-zero id which is handed back to
+ * vring_used_buffer() once you're finished with the buffer.  A zero return
+ * means no available buffers, negative for error.
+ */
+int vring_get_buffer(struct vring_info *vr,
+		     struct iovec *in_iov,
+		     unsigned int *num_in, unsigned long *in_len,
+		     struct iovec *out_iov,
+		     unsigned int *num_out, unsigned long *out_len)
+{
+	unsigned int i, in = 0, out = 0;
+	unsigned long dummy;
+	u16 avail, last_avail, head;
+	struct vring_desc d;
+
+	if (unlikely(get_user(avail, &vr->ring.avail->idx)))
+		return -EFAULT;
+	if (unlikely(get_user(last_avail, &vring_last_avail(&vr->ring))))
+		return -EFAULT;
+
+	if (last_avail == avail)
+		return 0;
+
+	if (!in_len)
+		in_len = &dummy;
+	if (!out_len)
+		out_len = &dummy;
+
+	*in_len = *out_len = 0;
+
+	if (unlikely(get_user(head, &vr->ring.avail->ring[last_avail
+							  & vr->mask])))
+		return -EFAULT;
+
+	i = head;
+	do {
+		if (unlikely(i >= vr->ring.num)) {
+			pr_debug("vring: bad index: %u\n", i);
+			return -EINVAL;
+		}
+
+		if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
+			return -EFAULT;
+
+		if (d.flags & VRING_DESC_F_WRITE) {
+			/* Check for length and iovec overflows */
+			if (!num_in) {
+				pr_debug("vring: writable desc %u in ring %p\n",
+					 i, vr->ring.desc);
+				return -EINVAL;
+			}
+			if (in == *num_in || *in_len + d.len < *in_len)
+				return -E2BIG;
+			in_iov[in].iov_len = d.len;
+			*in_len += d.len;
+			in_iov[in].iov_base = (void __user *)(long)d.addr;
+			in++;
+		} else {
+			if (!num_out) {
+				pr_debug("vring: readable desc %u in ring %p\n",
+					 i, vr->ring.desc);
+				return -EINVAL;
+			}
+			if (out == *num_out || *out_len + d.len < *out_len)
+				return -E2BIG;
+			out_iov[out].iov_len = d.len;
+			*out_len += d.len;
+			out_iov[out].iov_base = (void __user *)(long)d.addr;
+			out++;
+		}
+
+		i = d.next;
+	} while (d.flags & VRING_DESC_F_NEXT);
+
+	if (num_in)
+		*num_in = in;
+	if (num_out)
+		*num_out = out;
+
+	last_avail++;
+	put_user(last_avail, &vring_last_avail(&vr->ring));
+
+	/* 0 is a valid head, so add one. */
+	return head + 1;
+}
+EXPORT_SYMBOL_GPL(vring_get_buffer);
+
+/**
+ * vring_used_buffer - return a used buffer to the vring
+ * @vr: the vring
+ * @id: the id returned from vring_get_buffer
+ * @len: the total bytes *written* to the buffer
+ */
+void vring_used_buffer(struct vring_info *vr, int id, u32 len)
+{
+	struct vring_used_elem used;
+	u16 used_idx;
+
+	BUG_ON(id <= 0 || id > vr->ring.num);
+
+	used.id = id - 1;
+	used.len = len;
+	if (get_user(used_idx, &vr->ring.used->idx) != 0)
+		return;
+
+	if (copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
+			 sizeof(used)))
+		return;
+
+	wmb();
+	used_idx++;
+	put_user(used_idx, &vr->ring.used->idx);
+}
+EXPORT_SYMBOL_GPL(vring_used_buffer);
+
+void vring_wake(struct vring_info *vr)
+{
+	wake_up(&vr->poll_wait);
+}
+EXPORT_SYMBOL_GPL(vring_wake);
+
+/**
+ * vring_get - check out a vring file descriptor
+ * @filp: the file structure to attach to (eg. from fget()).
+ *
+ * Userspace opens /dev/vring and mmaps it, then hands that fd to the
+ * kernel subsystem it wants to communicate with.  That subsystem uses
+ * this routine and vring_set_ops() to attach to it.
+ *
+ * This simply checks that it really is a vring fd (otherwise it
+ * returns NULL), the other routine checks that it's not already
+ * attached.
+ */
+struct vring_info *vring_get(struct file *filp)
+{
+	/* Must be one of ours. */
+	if (filp->f_op != &vring_fops)
+		return NULL;
+
+	return filp->private_data;
+}
+EXPORT_SYMBOL_GPL(vring_get);
+
+/**
+ * vring_set_ops - attach operations to a vring file descriptor.
+ * @vr: the vring_info returned from vring_get.
+ * @ops: the operations to attach.
+ * @ops_data: the argument to the ops callbacks.
+ *
+ * This is called after vring_get(): the reason for the two-part
+ * process is that the ops can be called before vring_set_ops returns
+ * (we don't do locking), so you really need to set things up before
+ * this call.
+ *
+ * This simply checks that the ring is not already attached to something,
+ * then sets the ops.
+ */
+int vring_set_ops(struct vring_info *vr,
+		  const struct vring_ops *ops, void *ops_data)
+{
+	int err;
+
+	mutex_lock(&vr->lock);
+	if (vr->ops) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	/* We don't lock, so make sure we get this in the right order. */
+	vr->ops_data = ops_data;
+	wmb();
+	vr->ops = ops;
+
+	err = 0;
+unlock:
+	mutex_unlock(&vr->lock);
+	local_irq_enable();
+	return err;
+}
+EXPORT_SYMBOL_GPL(vring_set_ops);
+
+/**
+ * vring_unset_ops - remove operations to a vring file descriptor.
+ * @vr: the vring_info previously successfully vring_set_ops'd
+ */
+void vring_unset_ops(struct vring_info *vr)
+{
+	BUG_ON(!vr->ops);
+	mutex_lock(&vr->lock);
+	vr->ops = NULL;
+	mutex_unlock(&vr->lock);
+}
+EXPORT_SYMBOL_GPL(vring_unset_ops);
+
+static struct miscdevice vring_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = KBUILD_MODNAME,
+	.fops = &vring_fops,
+};
+
+static int __init init(void)
+{
+	return misc_register(&vring_dev);
+}
+
+static void __exit fini(void)
+{
+	misc_deregister(&vring_dev);
+}
+
+module_init(init);
+module_exit(fini);
diff -r b2d9869d338f include/linux/vring.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/vring.h	Fri Apr 18 13:35:16 2008 +1000
@@ -0,0 +1,58 @@
+/* Ring-buffer file descriptor implementation.
+ *
+ *  Copyright 2008 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef _LINUX_VRING_H
+#define _LINUX_VRING_H
+
+/**
+ * vring_ops - operations for a vring fd.
+ * @needs_pull: more data is pending, need to call pull.
+ * @pull: callback when read() is called to report used buffers.
+ * @push: callback when write() is called to notify of added buffers.
+ *
+ * Any of these callbacks can be NULL, if you don't need them.
+ */
+struct vring_ops {
+	bool (*needs_pull)(void *ops_data);
+
+	/* Returns 0 or negative errno. */
+	int (*pull)(void *ops_data);
+
+	/* Returns 0 or negative errno. */
+	int (*push)(void *ops_data);
+};
+
+struct file;
+
+struct vring_info *vring_get(struct file *filp);
+int vring_set_ops(struct vring_info *,
+		  const struct vring_ops *ops, void *ops_data);
+void vring_unset_ops(struct vring_info *vr);
+struct iovec;
+
+/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
+int vring_get_buffer(struct vring_info *vr,
+		     struct iovec *in_iov,
+		     unsigned int *num_in, unsigned long *in_len,
+		     struct iovec *out_iov,
+		     unsigned int *num_out, unsigned long *out_len);
+
+void vring_used_buffer(struct vring_info *vr, int id, u32 len);
+
+void vring_wake(struct vring_info *vr);
+#endif /* _LINUX_VRING_H */

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 3/5] /dev/vring limit and base ioctls
  2008-04-18  4:39   ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Rusty Russell
@ 2008-04-18  4:41     ` Rusty Russell
  2008-04-18  4:42       ` [PATCH 4/5] tun: vringfd receive support Rusty Russell
  2008-04-18 11:18     ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Andrew Morton
  2008-04-19 10:22     ` Evgeniy Polyakov
  2 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:41 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

It turns out the lguest (and possibly kvm) want the addresses in the
ring buffer to only cover a certain part of memory, and be offset.

It makes sense that this be an ioctl.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/ioctl-number.txt |    3 +--
 drivers/char/vring.c           |   37 +++++++++++++++++++++++++++++++++++--
 include/linux/vring.h          |    9 +++++++++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff -r caeeb48d478a Documentation/ioctl-number.txt
--- a/Documentation/ioctl-number.txt	Fri Apr 18 13:35:17 2008 +1000
+++ b/Documentation/ioctl-number.txt	Fri Apr 18 13:36:21 2008 +1000
@@ -181,8 +181,7 @@ 0xA3	90-9F	linux/dtlk.h
 0xA3	90-9F	linux/dtlk.h
 0xAB	00-1F	linux/nbd.h
 0xAC	00-1F	linux/raw.h
-0xAD	00	Netfilter device	in development:
-					<mailto:rusty@rustcorp.com.au>	
+0xAD	00-01	linux/vring.h
 0xB0	all	RATIO devices		in development:
 					<mailto:vgo@ratio.de>
 0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
diff -r caeeb48d478a drivers/char/vring.c
--- a/drivers/char/vring.c	Fri Apr 18 13:35:17 2008 +1000
+++ b/drivers/char/vring.c	Fri Apr 18 13:36:21 2008 +1000
@@ -32,6 +32,8 @@ struct vring_info {
 	struct vring ring;
 	u16 mask;
 	u16 last_used;
+
+	unsigned long base, limit;
 
 	const struct vring_ops *ops;
 	void *ops_data;
@@ -163,6 +165,26 @@ static int vring_open(struct inode *in, 
 
 	init_waitqueue_head(&vr->poll_wait);
 	mutex_init(&vr->lock);
+	vr->limit = -1UL;
+	vr->base = 0;
+	return 0;
+}
+
+static int vring_ioctl(struct inode *in, struct file *filp,
+		       unsigned int cmd, unsigned long arg)
+{
+	struct vring_info *vr = filp->private_data;
+
+	switch (cmd) {
+	case VRINGSETBASE:
+		vr->base = arg;
+		break;
+	case VRINGSETLIMIT:
+		vr->limit = arg;
+		break;
+	default:
+		return -ENOTTY;
+	}
 	return 0;
 }
 
@@ -173,6 +195,7 @@ static const struct file_operations vrin
 	.read		= vring_read,
 	.write		= vring_write,
 	.poll		= vring_poll,
+	.ioctl		= vring_ioctl,
 };
 
 /**
@@ -220,6 +243,8 @@ int vring_get_buffer(struct vring_info *
 
 	i = head;
 	do {
+		void __user *base;
+
 		if (unlikely(i >= vr->ring.num)) {
 			pr_debug("vring: bad index: %u\n", i);
 			return -EINVAL;
@@ -227,6 +252,14 @@ int vring_get_buffer(struct vring_info *
 
 		if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
 			return -EFAULT;
+
+		if (d.addr + d.len > vr->limit || (d.addr + d.len < d.addr)) {
+			pr_debug("vring: bad addr/len: %u@%p\n",
+				 d.len, (void *)(unsigned long)d.addr);
+			return -EINVAL;
+		}
+
+		base = (void __user *)(unsigned long)d.addr + vr->base;
 
 		if (d.flags & VRING_DESC_F_WRITE) {
 			/* Check for length and iovec overflows */
@@ -239,7 +272,7 @@ int vring_get_buffer(struct vring_info *
 				return -E2BIG;
 			in_iov[in].iov_len = d.len;
 			*in_len += d.len;
-			in_iov[in].iov_base = (void __user *)(long)d.addr;
+			in_iov[in].iov_base = base;
 			in++;
 		} else {
 			if (!num_out) {
@@ -251,7 +284,7 @@ int vring_get_buffer(struct vring_info *
 				return -E2BIG;
 			out_iov[out].iov_len = d.len;
 			*out_len += d.len;
-			out_iov[out].iov_base = (void __user *)(long)d.addr;
+			out_iov[out].iov_base = base;
 			out++;
 		}
 
diff -r caeeb48d478a include/linux/vring.h
--- a/include/linux/vring.h	Fri Apr 18 13:35:17 2008 +1000
+++ b/include/linux/vring.h	Fri Apr 18 13:36:21 2008 +1000
@@ -18,6 +18,13 @@
  */
 #ifndef _LINUX_VRING_H
 #define _LINUX_VRING_H
+#include <linux/types.h>
+
+/* Ioctl defines. */
+#define VRINGSETBASE	_IO(0xAD, 0)
+#define VRINGSETLIMIT	_IO(0xAD, 1)
+
+#ifdef __KERNEL__
 
 /**
  * vring_ops - operations for a vring fd.
@@ -55,4 +62,6 @@ void vring_used_buffer(struct vring_info
 void vring_used_buffer(struct vring_info *vr, int id, u32 len);
 
 void vring_wake(struct vring_info *vr);
+#endif /* __KERNEL__ */
+
 #endif /* _LINUX_VRING_H */

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 4/5] tun: vringfd receive support.
  2008-04-18  4:41     ` [PATCH 3/5] /dev/vring limit and base ioctls Rusty Russell
@ 2008-04-18  4:42       ` Rusty Russell
  2008-04-18  4:43         ` [PATCH 5/5] tun: vringfd xmit support Rusty Russell
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:42 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

This patch modifies tun to allow a vringfd to specify the receive
buffer.  Because we can't copy to userspace in bh context, we queue
like normal then use the "pull" hook to actually do the copy.

We use struct virtio_net_hdr prepended to packets in the ring to allow
userspace to receive GSO packets in future (at the moment, the tun
driver doesn't tell the stack it can handle them, so these cases are
never taken).  This will need to be something that userspace tells us
it can handle.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/Kconfig    |    2 
 drivers/net/tun.c      |  159 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_tun.h |    1 
 3 files changed, 162 insertions(+)

diff -r 9bafcef88e1b drivers/net/Kconfig
--- a/drivers/net/Kconfig	Fri Apr 18 05:54:45 2008 +1000
+++ b/drivers/net/Kconfig	Fri Apr 18 05:58:40 2008 +1000
@@ -120,6 +120,8 @@ config TUN
 config TUN
 	tristate "Universal TUN/TAP device driver support"
 	select CRC32
+# If no VRING at all, that's fine, but if it's a module, we must be, too.
+	depends on !VRING || VRING
 	---help---
 	  TUN/TAP provides packet reception and transmission for user space
 	  programs.  It can be viewed as a simple Point-to-Point or Ethernet
diff -r 9bafcef88e1b drivers/net/tun.c
--- a/drivers/net/tun.c	Fri Apr 18 05:54:45 2008 +1000
+++ b/drivers/net/tun.c	Fri Apr 18 05:58:40 2008 +1000
@@ -62,6 +62,9 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
+#include <linux/vring.h>
+#include <linux/virtio_net.h>
+#include <linux/file.h>
 #include <net/net_namespace.h>
 
 #include <asm/system.h>
@@ -98,6 +101,9 @@ struct tun_struct {
 	u8 dev_addr[ETH_ALEN];
 	u32 chr_filter[2];
 	u32 net_filter[2];
+
+	struct vring_info	*inring;
+	struct file		*infile;
 
 #ifdef TUN_DEBUG
 	int debug;
@@ -158,6 +164,10 @@ static int tun_net_xmit(struct sk_buff *
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
+
+	if (tun->inring)
+		vring_wake(tun->inring);
+
 	wake_up_interruptible(&tun->read_wait);
 	return 0;
 
@@ -249,6 +259,149 @@ static void tun_net_init(struct net_devi
 		break;
 	}
 }
+
+#if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
+/* Returns whether there are queued buffers */
+static bool pending_recv_skbs(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	return !skb_queue_empty(&tun->readq);
+}
+
+/* Returns 0, or negative errno. */
+static int pull_recv_skbs(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	int err = 0, num_copied = 0;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&tun->readq)) != NULL) {
+		struct iovec iov[1+MAX_SKB_FRAGS];
+		struct virtio_net_hdr gso = { 0 }; /* no info leak */
+		unsigned int iovnum = ARRAY_SIZE(iov);
+		unsigned long len;
+		int id;
+
+		id = vring_get_buffer(tun->inring, iov, &iovnum, &len,
+				      NULL, NULL, NULL);
+		if (id <= 0) {
+			err = id;
+			break;
+		}
+
+		/* FIXME: we could stash this descriptor and go looking for a
+		 * better-sized one.  That would allow them to mix different
+		 * buffer sizes for efficiency. */
+		if (unlikely(len < sizeof(gso) + skb->len)) {
+			tun->dev->stats.tx_aborted_errors++;
+			err = -ENOBUFS; /* PS. You suck! */
+			break;
+		}
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			gso.hdr_len = skb_headlen(skb);
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		err = memcpy_toiovec(iov, (void *)&gso, sizeof(gso));
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		err = skb_copy_datagram_iovec(skb, 0, iov, skb->len);
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		vring_used_buffer(tun->inring, id, sizeof(gso) + skb->len);
+		num_copied++;
+	}
+
+	/* We took an skb, but ring isn't ready for it.  Put it back */
+	if (skb)
+		skb_queue_head(&tun->readq, skb);
+
+	if (num_copied)
+		netif_wake_queue(tun->dev);
+
+	return err;
+}
+
+static struct vring_ops recvops = {
+	.needs_pull = pending_recv_skbs,
+	.pull = pull_recv_skbs,
+};
+
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	int err;
+
+	if (tun->inring)
+		return -EBUSY;
+
+	tun->infile = fget(fd);
+	if (!tun->infile)
+		return -EBADF;
+
+	tun->inring = vring_get(tun->infile);
+	if (!tun->inring) {
+		err = -EBADF;
+		goto put;
+	}
+
+	err = vring_set_ops(tun->inring, &recvops, tun);
+	if (err) {
+		tun->inring = NULL;
+		goto put;
+	}
+	return 0;
+
+put:
+	fput(tun->infile);
+	tun->infile = NULL;
+	return err;
+}
+
+static void unset_vrings(struct tun_struct *tun)
+{
+	if (tun->inring) {
+		vring_unset_ops(tun->inring);
+		fput(tun->infile);
+	}
+}
+#else /* ... !CONFIG_VRING */
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+
+static void unset_vrings(struct tun_struct *tun)
+{
+}
+#endif
 
 /* Character device part */
 
@@ -465,6 +618,7 @@ static void tun_setup(struct net_device 
 
 	tun->owner = -1;
 	tun->group = -1;
+	tun->inring = NULL;
 
 	dev->open = tun_net_open;
 	dev->hard_start_xmit = tun_net_xmit;
@@ -674,6 +828,9 @@ static int tun_chr_ioctl(struct inode *i
 		break;
 #endif
 
+	case TUNSETRECVVRING:
+		return set_recv_vring(tun, arg);
+
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
 		if (copy_to_user( argp, &ifr, sizeof ifr))
@@ -784,6 +941,8 @@ static int tun_chr_close(struct inode *i
 	DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
 
 	tun_chr_fasync(-1, file, 0);
+
+	unset_vrings(tun);
 
 	rtnl_lock();
 
diff -r 9bafcef88e1b include/linux/if_tun.h
--- a/include/linux/if_tun.h	Fri Apr 18 05:54:45 2008 +1000
+++ b/include/linux/if_tun.h	Fri Apr 18 05:58:40 2008 +1000
@@ -42,6 +42,7 @@
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
+#define TUNSETRECVVRING _IOW('T', 207, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18  4:42       ` [PATCH 4/5] tun: vringfd receive support Rusty Russell
@ 2008-04-18  4:43         ` Rusty Russell
  2008-04-18 11:31           ` Andrew Morton
  2008-04-18 11:46           ` pradeep singh rautela
  0 siblings, 2 replies; 27+ messages in thread
From: Rusty Russell @ 2008-04-18  4:43 UTC (permalink / raw)
  To: netdev; +Cc: Max Krasnyansky, virtualization, linux-kernel

This patch modifies tun to allow a vringfd to specify the send
buffer.  The user does a write to push out packets from the buffer.

Again we use the 'struct virtio_net_hdr' to allow userspace to send
GSO packets.  In this case, it can hint how much to copy, and the
other pages will be made into skb fragments.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/net/tun.c      |  410 +++++++++++++++++++++++++++++++++++++++++--------
 include/linux/if_tun.h |    1 
 2 files changed, 351 insertions(+), 60 deletions(-)

diff -r f797ec115d1b drivers/net/tun.c
--- a/drivers/net/tun.c	Fri Apr 18 05:58:40 2008 +1000
+++ b/drivers/net/tun.c	Fri Apr 18 06:07:21 2008 +1000
@@ -65,6 +65,8 @@
 #include <linux/vring.h>
 #include <linux/virtio_net.h>
 #include <linux/file.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <net/net_namespace.h>
 
 #include <asm/system.h>
@@ -102,8 +104,8 @@ struct tun_struct {
 	u32 chr_filter[2];
 	u32 net_filter[2];
 
-	struct vring_info	*inring;
-	struct file		*infile;
+	struct vring_info	*inring, *outring;
+	struct file		*infile, *outfile;
 
 #ifdef TUN_DEBUG
 	int debug;
@@ -258,6 +261,169 @@ static void tun_net_init(struct net_devi
 		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
 		break;
 	}
+}
+
+/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
+ * Users will learn not to do that. */
+static int get_user_skb_frags(const struct iovec *iv, size_t len,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	while (len) {
+		int n, npages;
+		unsigned long base, len;
+		base = (unsigned long)iv->iov_base;
+		len = (unsigned long)iv->iov_len;
+
+		if (len == 0) {
+			iv++;
+			continue;
+		}
+
+		/* How many pages will this take? */
+		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min(len, PAGE_SIZE -
+						     f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min(len, PAGE_SIZE);
+			}
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/* We actually store this at the head of the skb. */
+struct skb_tun_hdr {
+	struct list_head list;
+	struct tun_struct *tun;
+	unsigned int id;
+	unsigned int len;
+};
+
+/* Get packet from user space buffer.  copylen is a hint as to how
+ * much to copy (rest is pinned).  */
+static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
+				    size_t copylen, size_t len)
+{
+	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct sk_buff *skb;
+	size_t align = 0, extra = 0;
+	int err;
+
+	if (!(tun->flags & TUN_NO_PI)) {
+		if (len < sizeof(pi)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		len -= sizeof(pi);
+
+		if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
+			err = -EFAULT;
+			goto fail;
+		}
+		if (copylen > len)
+			copylen = len;
+	}
+
+	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
+		align = NET_IP_ALIGN;
+		if (unlikely(copylen < ETH_HLEN)) {
+			if (len < ETH_HLEN) {
+				err = -EINVAL;
+				goto fail;
+			}
+			copylen = ETH_HLEN;
+		}
+	}
+
+	/* Allocate extra header if we need  */
+	if (copylen != len)
+		extra = sizeof(struct skb_tun_hdr);
+
+	skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
+	if (!skb) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (extra + align)
+		skb_reserve(skb, extra + align);
+
+	if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
+		err = -EFAULT;
+		goto free_skb;
+	}
+
+	switch (tun->flags & TUN_TYPE_MASK) {
+	case TUN_TUN_DEV:
+		skb_reset_mac_header(skb);
+		skb->protocol = pi.proto;
+		skb->dev = tun->dev;
+		break;
+	case TUN_TAP_DEV:
+		skb->protocol = eth_type_trans(skb, tun->dev);
+		break;
+	};
+
+	if (tun->flags & TUN_NOCHECKSUM)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* Anything left gets put into frags. */
+	if (extra) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+		int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
+		if (err < 0)
+			goto free_skb;
+		sinfo->nr_frags = err;
+	}
+	tun->dev->last_rx = jiffies;
+
+	tun->dev->stats.rx_packets++;
+	tun->dev->stats.rx_bytes += len;
+
+	return skb;
+
+free_skb:
+	kfree_skb(skb);
+fail:
+	tun->dev->stats.rx_dropped++;
+	return ERR_PTR(err);
 }
 
 #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
@@ -355,6 +521,132 @@ static struct vring_ops recvops = {
 	.pull = pull_recv_skbs,
 };
 
+static DEFINE_SPINLOCK(finished_lock);
+static LIST_HEAD(shinfo_finished_list);
+static struct task_struct *shinfo_finisher;
+
+static void used_buffer(struct skb_tun_hdr *tunh)
+{
+	/* Woot, something happened. */
+	vring_wake(tunh->tun->outring);
+
+	/* Release device.  Keeping this reference blocks file close. */
+	dev_put(tunh->tun->dev);
+
+	/* tunh == skb->head. */
+	kfree(tunh);
+}
+
+static int do_shinfo_finisher(void *unused)
+{
+	LIST_HEAD(list);
+	struct skb_tun_hdr *i;
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_lock_irq(&finished_lock);
+		list_splice_init(&list, &shinfo_finished_list);
+		spin_unlock_irq(&finished_lock);
+
+		if (list_empty(&list)) {
+			schedule();
+			continue;
+		}
+
+		list_for_each_entry(i, &list, list) {
+			vring_used_buffer(i->tun->outring, i->id, i->len);
+			used_buffer(i);
+		}
+	}
+	return 0;
+}
+
+/* We are done with this skb data: put it in the used pile. */
+static void shinfo_finished(struct skb_shared_info *sinfo)
+{
+	struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
+	unsigned long flags;
+
+	spin_lock_irqsave(&finished_lock, flags);
+	list_add(&tunh->list, &shinfo_finished_list);
+	spin_unlock_irqrestore(&finished_lock, flags);
+
+	wake_up_process(shinfo_finisher);
+}
+
+static int xmit_packets(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	struct iovec iov[1+MAX_SKB_FRAGS];
+	unsigned int iovnum = ARRAY_SIZE(iov);
+	int id, err, wake = 0;
+	unsigned long len;
+
+	while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
+				      iov, &iovnum, &len)) > 0) {
+		struct virtio_net_hdr h;
+		struct sk_buff *skb;
+		struct skb_shared_info *shinfo;
+
+		if (unlikely(len < sizeof(h)))
+			return -EINVAL;
+
+		err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
+		if (unlikely(err))
+			return -EFAULT;
+
+		len -= sizeof(h);
+		if (h.hdr_len > len)
+			return -EINVAL;
+
+		/* Without GSO, we copy entire packet. */
+		if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
+			h.hdr_len = len;
+
+		skb = get_user_skb(tun, iov, h.hdr_len, len);
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		/* If it has fragments, set up destructor for later. */
+		shinfo = skb_shinfo(skb);
+		if (skb_shinfo(skb)->nr_frags) {
+			struct skb_tun_hdr *tunh = (void *)skb->head;
+			shinfo->destructor = shinfo_finished;
+			tunh->id = id;
+			tunh->len = sizeof(h) + skb->len;
+		} else {
+			vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
+			wake = 1;
+		}
+		netif_rx_ni(skb);
+	}
+
+	if (wake)
+		vring_wake(tun->outring);
+
+	/* 0 or error. */
+	return id;
+}
+
+static struct vring_ops xmitops = {
+	.push = xmit_packets,
+};
+
+static int init_vring(void)
+{
+	shinfo_finisher = kthread_run(do_shinfo_finisher, NULL, "tun");
+	if (IS_ERR(shinfo_finisher))
+		return PTR_ERR(shinfo_finisher);
+	return 0;
+}
+
 static int set_recv_vring(struct tun_struct *tun, int fd)
 {
 	int err;
@@ -391,9 +685,47 @@ static void unset_vrings(struct tun_stru
 		vring_unset_ops(tun->inring);
 		fput(tun->infile);
 	}
+	if (tun->outring) {
+		vring_unset_ops(tun->outring);
+		fput(tun->outfile);
+	}
+}
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
+{
+	int err;
+
+	if (tun->outring)
+		return -EBUSY;
+
+	tun->outfile = fget(fd);
+	if (!tun->outfile)
+		return -EBADF;
+
+	tun->outring = vring_get(tun->outfile);
+	if (!tun->outring) {
+		err = -EBADF;
+		goto put;
+	}
+
+	err = vring_set_ops(tun->outring, &xmitops, tun);
+	if (err) {
+		tun->outring = NULL;
+		goto put;
+	}
+	return 0;
+
+put:
+	fput(tun->outfile);
+	tun->outfile = NULL;
+	return err;
 }
 #else /* ... !CONFIG_VRING */
 static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+static int set_xmit_vring(struct tun_struct *tun, int fd)
 {
 	return -ENOTTY;
 }
@@ -424,74 +756,26 @@ static unsigned int tun_chr_poll(struct 
 	return mask;
 }
 
-/* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
-{
-	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
-	struct sk_buff *skb;
-	size_t len = count, align = 0;
-
-	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
-			return -EINVAL;
-
-		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
-			return -EFAULT;
-	}
-
-	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
-		align = NET_IP_ALIGN;
-		if (unlikely(len < ETH_HLEN))
-			return -EINVAL;
-	}
-
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
-		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
-	}
-
-	switch (tun->flags & TUN_TYPE_MASK) {
-	case TUN_TUN_DEV:
-		skb_reset_mac_header(skb);
-		skb->protocol = pi.proto;
-		skb->dev = tun->dev;
-		break;
-	case TUN_TAP_DEV:
-		skb->protocol = eth_type_trans(skb, tun->dev);
-		break;
-	};
-
-	if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	netif_rx_ni(skb);
-	tun->dev->last_rx = jiffies;
-
-	tun->dev->stats.rx_packets++;
-	tun->dev->stats.rx_bytes += len;
-
-	return count;
-}
-
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 			      unsigned long count, loff_t pos)
 {
 	struct tun_struct *tun = iocb->ki_filp->private_data;
+	size_t len;
+	struct sk_buff *skb;
 
 	if (!tun)
 		return -EBADFD;
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
+	len = iov_length(iv, count);
+
+	skb = get_user_skb(tun, (struct iovec *)iv, len, len);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	netif_rx_ni(skb);
+	return len;
 }
 
 /* Put packet to the user space buffer */
@@ -831,6 +1115,9 @@ static int tun_chr_ioctl(struct inode *i
 	case TUNSETRECVVRING:
 		return set_recv_vring(tun, arg);
 
+	case TUNSETXMITVRING:
+		return set_xmit_vring(tun, arg);
+
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
 		if (copy_to_user( argp, &ifr, sizeof ifr))
@@ -1078,6 +1365,12 @@ static int __init tun_init(void)
 	ret = misc_register(&tun_miscdev);
 	if (ret)
 		printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
+	else {
+		ret = init_vring();
+		if (ret)
+			misc_deregister(&tun_miscdev);
+	}
+
 	return ret;
 }
 
diff -r f797ec115d1b include/linux/if_tun.h
--- a/include/linux/if_tun.h	Fri Apr 18 05:58:40 2008 +1000
+++ b/include/linux/if_tun.h	Fri Apr 18 06:07:21 2008 +1000
@@ -43,6 +43,7 @@
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
 #define TUNSETRECVVRING _IOW('T', 207, int)
+#define TUNSETXMITVRING _IOW('T', 208, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18  4:39   ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Rusty Russell
  2008-04-18  4:41     ` [PATCH 3/5] /dev/vring limit and base ioctls Rusty Russell
@ 2008-04-18 11:18     ` Andrew Morton
  2008-04-18 14:32       ` Rusty Russell
  2008-04-19 10:22     ` Evgeniy Polyakov
  2 siblings, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2008-04-18 11:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Fri, 18 Apr 2008 14:39:48 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> virtio introduced a ring structure ABI for guest-host communications
> (currently used by lguest and kvm).  Using this same ABI, we can
> create a nice fd version.
> 
> This is useful for efficiently passing packets to and from the tun,
> for example.
> 
> ...
>
> +static int vring_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	unsigned long size, num_descs;
> +	struct vring_info *vr = filp->private_data;
> +	int err;
> +
> +	/* We overload mmap's offset to hold the ring number. */
> +	num_descs = vma->vm_pgoff;
> +
> +	/* Must be a power of two, and limit indices to a u16. */
> +	if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)

We have an is_power_of_2().

> +		return -EINVAL;
> +
> +	/* mmap size must be what we expect for such a ring. */
> +	size = vma->vm_end - vma->vm_start;
> +	if (size != ALIGN(vring_size(num_descs, PAGE_SIZE), PAGE_SIZE))
> +		return -EINVAL;
> +
> +	/* We only let them map this in one place. */
> +	mutex_lock(&vr->lock);
> +	if (vr->ring.num != 0) {
> +		err = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	vring_init(&vr->ring, num_descs, (void *)vma->vm_start, PAGE_SIZE);
> +
> +	vr->mask = num_descs - 1;
> +	err = 0;
> +
> +unlock:
> +	mutex_unlock(&vr->lock);
> +	return err;
> +}
>
> ...
>
> +/**
> + * vring_get - check out a vring file descriptor
> + * @filp: the file structure to attach to (eg. from fget()).
> + *
> + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
> + * kernel subsystem it wants to communicate with.  That subsystem uses
> + * this routine and vring_set_ops() to attach to it.
> + *
> + * This simply checks that it really is a vring fd (otherwise it
> + * returns NULL), the other routine checks that it's not already
> + * attached.
> + */

hm, I don't understand the big picture here yet.

Isn't this kinda-sorta like what a relayfs file does?  The oprofile
buffers?  etc?  Nothing in common at all, no hope?

> +struct vring_info *vring_get(struct file *filp)
> +{
> +	/* Must be one of ours. */
> +	if (filp->f_op != &vring_fops)
> +		return NULL;
> +
> +	return filp->private_data;
> +}
> +EXPORT_SYMBOL_GPL(vring_get);
> +
> +/**
> + * vring_set_ops - attach operations to a vring file descriptor.
> + * @vr: the vring_info returned from vring_get.
> + * @ops: the operations to attach.
> + * @ops_data: the argument to the ops callbacks.
> + *
> + * This is called after vring_get(): the reason for the two-part
> + * process is that the ops can be called before vring_set_ops returns
> + * (we don't do locking), so you really need to set things up before
> + * this call.
> + *
> + * This simply checks that the ring is not already attached to something,
> + * then sets the ops.
> + */
> +int vring_set_ops(struct vring_info *vr,
> +		  const struct vring_ops *ops, void *ops_data)
> +{
> +	int err;
> +
> +	mutex_lock(&vr->lock);
> +	if (vr->ops) {
> +		err = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	/* We don't lock, so make sure we get this in the right order. */
> +	vr->ops_data = ops_data;
> +	wmb();
> +	vr->ops = ops;
> +
> +	err = 0;
> +unlock:
> +	mutex_unlock(&vr->lock);
> +	local_irq_enable();

what's this doing here?

> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(vring_set_ops);
> +
> +/**
> + * vring_unset_ops - remove operations to a vring file descriptor.
> + * @vr: the vring_info previously successfully vring_set_ops'd
> + */
> +void vring_unset_ops(struct vring_info *vr)
> +{
> +	BUG_ON(!vr->ops);
> +	mutex_lock(&vr->lock);
> +	vr->ops = NULL;
> +	mutex_unlock(&vr->lock);
> +}
> +EXPORT_SYMBOL_GPL(vring_unset_ops);

Isn't this just vring_set_ops(vr, NULL, NULL)?

> +static struct miscdevice vring_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = KBUILD_MODNAME,
> +	.fops = &vring_fops,
> +};
> +
> +static int __init init(void)
> +{
> +	return misc_register(&vring_dev);
> +}
> +
> +static void __exit fini(void)
> +{
> +	misc_deregister(&vring_dev);
> +}
> +
> +module_init(init);
> +module_exit(fini);
> diff -r b2d9869d338f include/linux/vring.h
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/include/linux/vring.h	Fri Apr 18 13:35:16 2008 +1000
> @@ -0,0 +1,58 @@
> +/* Ring-buffer file descriptor implementation.
> + *
> + *  Copyright 2008 Rusty Russell IBM Corporation
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, write to the Free Software
> + *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> + */

ponders #include <copyright.h>

> +#ifndef _LINUX_VRING_H
> +#define _LINUX_VRING_H
> +
> +/**
> + * vring_ops - operations for a vring fd.
> + * @needs_pull: more data is pending, need to call pull.
> + * @pull: callback when read() is called to report used buffers.
> + * @push: callback when write() is called to notify of added buffers.
> + *
> + * Any of these callbacks can be NULL, if you don't need them.
> + */
> +struct vring_ops {
> +	bool (*needs_pull)(void *ops_data);
> +
> +	/* Returns 0 or negative errno. */
> +	int (*pull)(void *ops_data);
> +
> +	/* Returns 0 or negative errno. */
> +	int (*push)(void *ops_data);
> +};
> +
> +struct file;
> +
> +struct vring_info *vring_get(struct file *filp);
> +int vring_set_ops(struct vring_info *,
> +		  const struct vring_ops *ops, void *ops_data);

the first arg to vring_set_ops() lost its name.

> +void vring_unset_ops(struct vring_info *vr);
> +struct iovec;
> +
> +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
> +int vring_get_buffer(struct vring_info *vr,
> +		     struct iovec *in_iov,
> +		     unsigned int *num_in, unsigned long *in_len,
> +		     struct iovec *out_iov,
> +		     unsigned int *num_out, unsigned long *out_len);
> +
> +void vring_used_buffer(struct vring_info *vr, int id, u32 len);
> +
> +void vring_wake(struct vring_info *vr);
> +#endif /* _LINUX_VRING_H */



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18  4:43         ` [PATCH 5/5] tun: vringfd xmit support Rusty Russell
@ 2008-04-18 11:31           ` Andrew Morton
  2008-04-18 15:15             ` Rusty Russell
  2008-04-18 11:46           ` pradeep singh rautela
  1 sibling, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2008-04-18 11:31 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> This patch modifies tun to allow a vringfd to specify the send
> buffer.  The user does a write to push out packets from the buffer.
> 
> Again we use the 'struct virtio_net_hdr' to allow userspace to send
> GSO packets.  In this case, it can hint how much to copy, and the
> other pages will be made into skb fragments.
> 
> 
> ...
>
> +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
> + * Users will learn not to do that. */
> +static int get_user_skb_frags(const struct iovec *iv, size_t len,
> +			      struct skb_frag_struct *f)
> +{
> +	unsigned int i, j, num_pg = 0;
> +	int err;
> +	struct page *pages[MAX_SKB_FRAGS];
> +
> +	down_read(&current->mm->mmap_sem);
> +	while (len) {
> +		int n, npages;
> +		unsigned long base, len;
> +		base = (unsigned long)iv->iov_base;
> +		len = (unsigned long)iv->iov_len;
> +
> +		if (len == 0) {
> +			iv++;
> +			continue;
> +		}
> +
> +		/* How many pages will this take? */
> +		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

Brain hurts.  I hope you got that right.

> +		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
> +			err = -ENOSPC;
> +			goto fail;
> +		}
> +		n = get_user_pages(current, current->mm, base, npages,
> +				   0, 0, pages, NULL);

What is the maximum numbet of pages which an unpriviliged user can
concurrently pin with this code?

> +		if (unlikely(n < 0)) {
> +			err = n;
> +			goto fail;
> +		}
> +
> +		/* Transfer pages to the frag array */
> +		for (j = 0; j < n; j++) {
> +			f[num_pg].page = pages[j];
> +			if (j == 0) {
> +				f[num_pg].page_offset = offset_in_page(base);
> +				f[num_pg].size = min(len, PAGE_SIZE -
> +						     f[num_pg].page_offset);
> +			} else {
> +				f[num_pg].page_offset = 0;
> +				f[num_pg].size = min(len, PAGE_SIZE);
> +			}
> +			len -= f[num_pg].size;
> +			base += f[num_pg].size;
> +			num_pg++;
> +		}

This loop is a fancy way of doing

		num_pg = n;

> +		if (unlikely(n != npages)) {
> +			err = -EFAULT;
> +			goto fail;
> +		}

why not do this immediately after running get_user_pages()?

> +	}
> +	up_read(&current->mm->mmap_sem);
> +	return num_pg;
> +
> +fail:
> +	for (i = 0; i < num_pg; i++)
> +		put_page(f[i].page);

release_pages() could be a tad more efficient, but it's only error-path.

> +	up_read(&current->mm->mmap_sem);
> +	return err;
> +}
> +
>  


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18  4:43         ` [PATCH 5/5] tun: vringfd xmit support Rusty Russell
  2008-04-18 11:31           ` Andrew Morton
@ 2008-04-18 11:46           ` pradeep singh rautela
  2008-04-18 14:25             ` Ray Lee
  1 sibling, 1 reply; 27+ messages in thread
From: pradeep singh rautela @ 2008-04-18 11:46 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, linux-kernel, Max Krasnyansky, virtualization

On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> This patch modifies tun to allow a vringfd to specify the send
>  buffer.  The user does a write to push out packets from the buffer.
>
>  Again we use the 'struct virtio_net_hdr' to allow userspace to send
>  GSO packets.  In this case, it can hint how much to copy, and the
>  other pages will be made into skb fragments.
>
>  Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>  ---
>   drivers/net/tun.c      |  410 +++++++++++++++++++++++++++++++++++++++++--------
>   include/linux/if_tun.h |    1
>   2 files changed, 351 insertions(+), 60 deletions(-)
>
>  diff -r f797ec115d1b drivers/net/tun.c
>  --- a/drivers/net/tun.c Fri Apr 18 05:58:40 2008 +1000
>  +++ b/drivers/net/tun.c Fri Apr 18 06:07:21 2008 +1000
>  @@ -65,6 +65,8 @@
>   #include <linux/vring.h>
>   #include <linux/virtio_net.h>
>   #include <linux/file.h>
>  +#include <linux/spinlock.h>
>  +#include <linux/kthread.h>
>   #include <net/net_namespace.h>
>
>   #include <asm/system.h>
>  @@ -102,8 +104,8 @@ struct tun_struct {
>         u32 chr_filter[2];
>         u32 net_filter[2];
>
>  -       struct vring_info       *inring;
>  -       struct file             *infile;
>  +       struct vring_info       *inring, *outring;
>  +       struct file             *infile, *outfile;
>
>   #ifdef TUN_DEBUG
>         int debug;
>  @@ -258,6 +261,169 @@ static void tun_net_init(struct net_devi
>                 dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
>                 break;
>         }
>  +}
>  +
>  +/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
>  + * Users will learn not to do that. */
>  +static int get_user_skb_frags(const struct iovec *iv, size_t len,
>  +                             struct skb_frag_struct *f)
>  +{
>  +       unsigned int i, j, num_pg = 0;
>  +       int err;
>  +       struct page *pages[MAX_SKB_FRAGS];
>  +
>  +       down_read(&current->mm->mmap_sem);
>  +       while (len) {
>  +               int n, npages;
>  +               unsigned long base, len;
>  +               base = (unsigned long)iv->iov_base;
>  +               len = (unsigned long)iv->iov_len;
>  +
>  +               if (len == 0) {
>  +                       iv++;
>  +                       continue;
>  +               }
>  +
>  +               /* How many pages will this take? */
>  +               npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;

Hi Rusty,
A trivial suggestion, how about
          npages = 1+(len -1)/PAGE_SIZE ?

Thanks,
         --Pradeep
>  +               if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
>  +                       err = -ENOSPC;
>  +                       goto fail;
>  +               }
>  +               n = get_user_pages(current, current->mm, base, npages,
>  +                                  0, 0, pages, NULL);
>  +               if (unlikely(n < 0)) {
>  +                       err = n;
>  +                       goto fail;
>  +               }
>  +
>  +               /* Transfer pages to the frag array */
>  +               for (j = 0; j < n; j++) {
>  +                       f[num_pg].page = pages[j];
>  +                       if (j == 0) {
>  +                               f[num_pg].page_offset = offset_in_page(base);
>  +                               f[num_pg].size = min(len, PAGE_SIZE -
>  +                                                    f[num_pg].page_offset);
>  +                       } else {
>  +                               f[num_pg].page_offset = 0;
>  +                               f[num_pg].size = min(len, PAGE_SIZE);
>  +                       }
>  +                       len -= f[num_pg].size;
>  +                       base += f[num_pg].size;
>  +                       num_pg++;
>  +               }
>  +
>  +               if (unlikely(n != npages)) {
>  +                       err = -EFAULT;
>  +                       goto fail;
>  +               }
>  +       }
>  +       up_read(&current->mm->mmap_sem);
>  +       return num_pg;
>  +
>  +fail:
>  +       for (i = 0; i < num_pg; i++)
>  +               put_page(f[i].page);
>  +       up_read(&current->mm->mmap_sem);
>  +       return err;
>  +}
>  +
>  +/* We actually store this at the head of the skb. */
>  +struct skb_tun_hdr {
>  +       struct list_head list;
>  +       struct tun_struct *tun;
>  +       unsigned int id;
>  +       unsigned int len;
>  +};
>  +
>  +/* Get packet from user space buffer.  copylen is a hint as to how
>  + * much to copy (rest is pinned).  */
>  +static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
>  +                                   size_t copylen, size_t len)
>  +{
>  +       struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
>  +       struct sk_buff *skb;
>  +       size_t align = 0, extra = 0;
>  +       int err;
>  +
>  +       if (!(tun->flags & TUN_NO_PI)) {
>  +               if (len < sizeof(pi)) {
>  +                       err = -EINVAL;
>  +                       goto fail;
>  +               }
>  +               len -= sizeof(pi);
>  +
>  +               if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
>  +                       err = -EFAULT;
>  +                       goto fail;
>  +               }
>  +               if (copylen > len)
>  +                       copylen = len;
>  +       }
>  +
>  +       if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
>  +               align = NET_IP_ALIGN;
>  +               if (unlikely(copylen < ETH_HLEN)) {
>  +                       if (len < ETH_HLEN) {
>  +                               err = -EINVAL;
>  +                               goto fail;
>  +                       }
>  +                       copylen = ETH_HLEN;
>  +               }
>  +       }
>  +
>  +       /* Allocate extra header if we need  */
>  +       if (copylen != len)
>  +               extra = sizeof(struct skb_tun_hdr);
>  +
>  +       skb = alloc_skb(extra + copylen + align, GFP_KERNEL);
>  +       if (!skb) {
>  +               err = -ENOMEM;
>  +               goto fail;
>  +       }
>  +
>  +       if (extra + align)
>  +               skb_reserve(skb, extra + align);
>  +
>  +       if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
>  +               err = -EFAULT;
>  +               goto free_skb;
>  +       }
>  +
>  +       switch (tun->flags & TUN_TYPE_MASK) {
>  +       case TUN_TUN_DEV:
>  +               skb_reset_mac_header(skb);
>  +               skb->protocol = pi.proto;
>  +               skb->dev = tun->dev;
>  +               break;
>  +       case TUN_TAP_DEV:
>  +               skb->protocol = eth_type_trans(skb, tun->dev);
>  +               break;
>  +       };
>  +
>  +       if (tun->flags & TUN_NOCHECKSUM)
>  +               skb->ip_summed = CHECKSUM_UNNECESSARY;
>  +
>  +       /* Anything left gets put into frags. */
>  +       if (extra) {
>  +               struct skb_shared_info *sinfo = skb_shinfo(skb);
>  +               int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
>  +               if (err < 0)
>  +                       goto free_skb;
>  +               sinfo->nr_frags = err;
>  +       }
>  +       tun->dev->last_rx = jiffies;
>  +
>  +       tun->dev->stats.rx_packets++;
>  +       tun->dev->stats.rx_bytes += len;
>  +
>  +       return skb;
>  +
>  +free_skb:
>  +       kfree_skb(skb);
>  +fail:
>  +       tun->dev->stats.rx_dropped++;
>  +       return ERR_PTR(err);
>   }
>
>   #if defined(CONFIG_VRING) || defined(CONFIG_VRING_MODULE)
>  @@ -355,6 +521,132 @@ static struct vring_ops recvops = {
>         .pull = pull_recv_skbs,
>   };
>
>  +static DEFINE_SPINLOCK(finished_lock);
>  +static LIST_HEAD(shinfo_finished_list);
>  +static struct task_struct *shinfo_finisher;
>  +
>  +static void used_buffer(struct skb_tun_hdr *tunh)
>  +{
>  +       /* Woot, something happened. */
>  +       vring_wake(tunh->tun->outring);
>  +
>  +       /* Release device.  Keeping this reference blocks file close. */
>  +       dev_put(tunh->tun->dev);
>  +
>  +       /* tunh == skb->head. */
>  +       kfree(tunh);
>  +}
>  +
>  +static int do_shinfo_finisher(void *unused)
>  +{
>  +       LIST_HEAD(list);
>  +       struct skb_tun_hdr *i;
>  +
>  +       while (!kthread_should_stop()) {
>  +               set_current_state(TASK_INTERRUPTIBLE);
>  +
>  +               spin_lock_irq(&finished_lock);
>  +               list_splice_init(&list, &shinfo_finished_list);
>  +               spin_unlock_irq(&finished_lock);
>  +
>  +               if (list_empty(&list)) {
>  +                       schedule();
>  +                       continue;
>  +               }
>  +
>  +               list_for_each_entry(i, &list, list) {
>  +                       vring_used_buffer(i->tun->outring, i->id, i->len);
>  +                       used_buffer(i);
>  +               }
>  +       }
>  +       return 0;
>  +}
>  +
>  +/* We are done with this skb data: put it in the used pile. */
>  +static void shinfo_finished(struct skb_shared_info *sinfo)
>  +{
>  +       struct skb_tun_hdr *tunh = (void *)skb_shinfo_to_head(sinfo);
>  +       unsigned long flags;
>  +
>  +       spin_lock_irqsave(&finished_lock, flags);
>  +       list_add(&tunh->list, &shinfo_finished_list);
>  +       spin_unlock_irqrestore(&finished_lock, flags);
>  +
>  +       wake_up_process(shinfo_finisher);
>  +}
>  +
>  +static int xmit_packets(void *_tun)
>  +{
>  +       struct tun_struct *tun = _tun;
>  +       struct iovec iov[1+MAX_SKB_FRAGS];
>  +       unsigned int iovnum = ARRAY_SIZE(iov);
>  +       int id, err, wake = 0;
>  +       unsigned long len;
>  +
>  +       while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
>  +                                     iov, &iovnum, &len)) > 0) {
>  +               struct virtio_net_hdr h;
>  +               struct sk_buff *skb;
>  +               struct skb_shared_info *shinfo;
>  +
>  +               if (unlikely(len < sizeof(h)))
>  +                       return -EINVAL;
>  +
>  +               err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
>  +               if (unlikely(err))
>  +                       return -EFAULT;
>  +
>  +               len -= sizeof(h);
>  +               if (h.hdr_len > len)
>  +                       return -EINVAL;
>  +
>  +               /* Without GSO, we copy entire packet. */
>  +               if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
>  +                       h.hdr_len = len;
>  +
>  +               skb = get_user_skb(tun, iov, h.hdr_len, len);
>  +               if (IS_ERR(skb))
>  +                       return PTR_ERR(skb);
>  +
>  +               if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
>  +                   !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
>  +                       kfree_skb(skb);
>  +                       return -EINVAL;
>  +               }
>  +
>  +               /* If it has fragments, set up destructor for later. */
>  +               shinfo = skb_shinfo(skb);
>  +               if (skb_shinfo(skb)->nr_frags) {
>  +                       struct skb_tun_hdr *tunh = (void *)skb->head;
>  +                       shinfo->destructor = shinfo_finished;
>  +                       tunh->id = id;
>  +                       tunh->len = sizeof(h) + skb->len;
>  +               } else {
>  +                       vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
>  +                       wake = 1;
>  +               }
>  +               netif_rx_ni(skb);
>  +       }
>  +
>  +       if (wake)
>  +               vring_wake(tun->outring);
>  +
>  +       /* 0 or error. */
>  +       return id;
>  +}
>  +
>  +static struct vring_ops xmitops = {
>  +       .push = xmit_packets,
>  +};
>  +
>  +static int init_vring(void)
>  +{
>  +       shinfo_finisher = kthread_run(do_shinfo_finisher, NULL, "tun");
>  +       if (IS_ERR(shinfo_finisher))
>  +               return PTR_ERR(shinfo_finisher);
>  +       return 0;
>  +}
>  +
>   static int set_recv_vring(struct tun_struct *tun, int fd)
>   {
>         int err;
>  @@ -391,9 +685,47 @@ static void unset_vrings(struct tun_stru
>                 vring_unset_ops(tun->inring);
>                 fput(tun->infile);
>         }
>  +       if (tun->outring) {
>  +               vring_unset_ops(tun->outring);
>  +               fput(tun->outfile);
>  +       }
>  +}
>  +
>  +static int set_xmit_vring(struct tun_struct *tun, int fd)
>  +{
>  +       int err;
>  +
>  +       if (tun->outring)
>  +               return -EBUSY;
>  +
>  +       tun->outfile = fget(fd);
>  +       if (!tun->outfile)
>  +               return -EBADF;
>  +
>  +       tun->outring = vring_get(tun->outfile);
>  +       if (!tun->outring) {
>  +               err = -EBADF;
>  +               goto put;
>  +       }
>  +
>  +       err = vring_set_ops(tun->outring, &xmitops, tun);
>  +       if (err) {
>  +               tun->outring = NULL;
>  +               goto put;
>  +       }
>  +       return 0;
>  +
>  +put:
>  +       fput(tun->outfile);
>  +       tun->outfile = NULL;
>  +       return err;
>   }
>   #else /* ... !CONFIG_VRING */
>   static int set_recv_vring(struct tun_struct *tun, int fd)
>  +{
>  +       return -ENOTTY;
>  +}
>  +static int set_xmit_vring(struct tun_struct *tun, int fd)
>   {
>         return -ENOTTY;
>   }
>  @@ -424,74 +756,26 @@ static unsigned int tun_chr_poll(struct
>         return mask;
>   }
>
>  -/* Get packet from user space buffer */
>  -static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
>  -{
>  -       struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
>  -       struct sk_buff *skb;
>  -       size_t len = count, align = 0;
>  -
>  -       if (!(tun->flags & TUN_NO_PI)) {
>  -               if ((len -= sizeof(pi)) > count)
>  -                       return -EINVAL;
>  -
>  -               if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
>  -                       return -EFAULT;
>  -       }
>  -
>  -       if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
>  -               align = NET_IP_ALIGN;
>  -               if (unlikely(len < ETH_HLEN))
>  -                       return -EINVAL;
>  -       }
>  -
>  -       if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
>  -               tun->dev->stats.rx_dropped++;
>  -               return -ENOMEM;
>  -       }
>  -
>  -       if (align)
>  -               skb_reserve(skb, align);
>  -       if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
>  -               tun->dev->stats.rx_dropped++;
>  -               kfree_skb(skb);
>  -               return -EFAULT;
>  -       }
>  -
>  -       switch (tun->flags & TUN_TYPE_MASK) {
>  -       case TUN_TUN_DEV:
>  -               skb_reset_mac_header(skb);
>  -               skb->protocol = pi.proto;
>  -               skb->dev = tun->dev;
>  -               break;
>  -       case TUN_TAP_DEV:
>  -               skb->protocol = eth_type_trans(skb, tun->dev);
>  -               break;
>  -       };
>  -
>  -       if (tun->flags & TUN_NOCHECKSUM)
>  -               skb->ip_summed = CHECKSUM_UNNECESSARY;
>  -
>  -       netif_rx_ni(skb);
>  -       tun->dev->last_rx = jiffies;
>  -
>  -       tun->dev->stats.rx_packets++;
>  -       tun->dev->stats.rx_bytes += len;
>  -
>  -       return count;
>  -}
>  -
>   static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
>                               unsigned long count, loff_t pos)
>   {
>         struct tun_struct *tun = iocb->ki_filp->private_data;
>  +       size_t len;
>  +       struct sk_buff *skb;
>
>         if (!tun)
>                 return -EBADFD;
>
>         DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
>
>  -       return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
>  +       len = iov_length(iv, count);
>  +
>  +       skb = get_user_skb(tun, (struct iovec *)iv, len, len);
>  +       if (IS_ERR(skb))
>  +               return PTR_ERR(skb);
>  +
>  +       netif_rx_ni(skb);
>  +       return len;
>   }
>
>   /* Put packet to the user space buffer */
>  @@ -831,6 +1115,9 @@ static int tun_chr_ioctl(struct inode *i
>         case TUNSETRECVVRING:
>                 return set_recv_vring(tun, arg);
>
>  +       case TUNSETXMITVRING:
>  +               return set_xmit_vring(tun, arg);
>  +
>         case SIOCGIFFLAGS:
>                 ifr.ifr_flags = tun->if_flags;
>                 if (copy_to_user( argp, &ifr, sizeof ifr))
>  @@ -1078,6 +1365,12 @@ static int __init tun_init(void)
>         ret = misc_register(&tun_miscdev);
>         if (ret)
>                 printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
>  +       else {
>  +               ret = init_vring();
>  +               if (ret)
>  +                       misc_deregister(&tun_miscdev);
>  +       }
>  +
>         return ret;
>   }
>
>  diff -r f797ec115d1b include/linux/if_tun.h
>  --- a/include/linux/if_tun.h    Fri Apr 18 05:58:40 2008 +1000
>  +++ b/include/linux/if_tun.h    Fri Apr 18 06:07:21 2008 +1000
>  @@ -43,6 +43,7 @@
>   #define TUNSETLINK    _IOW('T', 205, int)
>   #define TUNSETGROUP   _IOW('T', 206, int)
>   #define TUNSETRECVVRING _IOW('T', 207, int)
>  +#define TUNSETXMITVRING _IOW('T', 208, int)
>
>   /* TUNSETIFF ifr flags */
>   #define IFF_TUN                0x0001
>  _______________________________________________
>  Virtualization mailing list
>  Virtualization@lists.linux-foundation.org
>  https://lists.linux-foundation.org/mailman/listinfo/virtualization
>



-- 
Pradeep Singh Rautela
http://eagain.wordpress.com
http://emptydomain.googlepages.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 11:46           ` pradeep singh rautela
@ 2008-04-18 14:25             ` Ray Lee
  2008-04-18 18:01               ` pradeep singh rautela
  0 siblings, 1 reply; 27+ messages in thread
From: Ray Lee @ 2008-04-18 14:25 UTC (permalink / raw)
  To: pradeep singh rautela
  Cc: Rusty Russell, netdev, linux-kernel, Max Krasnyansky, virtualization

On Fri, Apr 18, 2008 at 4:46 AM, pradeep singh rautela
<rautelap@gmail.com> wrote:
>
> On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
>  >  +               /* How many pages will this take? */
>  >  +               npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
>
>  Hi Rusty,
>  A trivial suggestion, how about
>           npages = 1+(len -1)/PAGE_SIZE ?

That's not the same. In particular, his version accounts for the
fractional page at the beginning, while yours doesn't.

While it's tempting to use algebra to simplify things, it's not safe
to do so when the expression involves division over the integers. The
only care-free integer math in a computer is subtraction and ++.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18 11:18     ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Andrew Morton
@ 2008-04-18 14:32       ` Rusty Russell
  2008-04-18 18:59         ` Andrew Morton
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-18 14:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Friday 18 April 2008 21:18:46 Andrew Morton wrote:
> > +	/* Must be a power of two, and limit indices to a u16. */
> > +	if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)
>
> We have an is_power_of_2().

Thanks, fixed.

> > + * vring_get - check out a vring file descriptor
> > + * @filp: the file structure to attach to (eg. from fget()).
> > + *
> > + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
> > + * kernel subsystem it wants to communicate with.  That subsystem uses
> > + * this routine and vring_set_ops() to attach to it.
> > + *
> > + * This simply checks that it really is a vring fd (otherwise it
> > + * returns NULL), the other routine checks that it's not already
> > + * attached.
> > + */
>
> hm, I don't understand the big picture here yet.
>
> Isn't this kinda-sorta like what a relayfs file does?  The oprofile
> buffers?  etc?  Nothing in common at all, no hope?

An excellent question, but I thought the modern kernel etiquette was to only 
comment on whitespace and formatting, and call it "review"? :)

Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and 
consumption can be out-of-order (kind of important for I/O buffers).

But the reason I'm not proposing it as a syscall is that I'm not convinced 
it's the One True Solution which everyone should be using.  Time will tell: 
it's clearly not tied to tun and it's been generically useful for virtual 
I/O, but history has not been kind to new userspace interfaces.

> > +	mutex_unlock(&vr->lock);
> > +	local_irq_enable();
>
> what's this doing here?

Snot from previous version.  Removed.

> > +void vring_unset_ops(struct vring_info *vr)
> > +{
> > +	BUG_ON(!vr->ops);
> > +	mutex_lock(&vr->lock);
> > +	vr->ops = NULL;
> > +	mutex_unlock(&vr->lock);
> > +}
> > +EXPORT_SYMBOL_GPL(vring_unset_ops);
>
> Isn't this just vring_set_ops(vr, NULL, NULL)?

Yes, except I like the clarity and the BUG_ON.

> ponders #include <copyright.h>

"#include <copyright-gpl2-plus>" for me, just to add more inclement weather to 
that teacup...

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 11:31           ` Andrew Morton
@ 2008-04-18 15:15             ` Rusty Russell
  2008-04-18 16:24               ` Ray Lee
                                 ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Rusty Russell @ 2008-04-18 15:15 UTC (permalink / raw)
  To: Andrew Morton; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Friday 18 April 2008 21:31:20 Andrew Morton wrote:
> On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:
> > +		/* How many pages will this take? */
> > +		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
>
> Brain hurts.  I hope you got that right.

I tested it when I wrote it, but just wrote a tester again:

base		len	npages
0               1       1
0xfff           1       1
0x1000          1       1
0               4096    1
0x1             4096    2
0xfff           4096    2
0x1000          4096    1
0xfffff000      4096    1
0xfffff000      4097    4293918722

> > +		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
> > +			err = -ENOSPC;
> > +			goto fail;
> > +		}
> > +		n = get_user_pages(current, current->mm, base, npages,
> > +				   0, 0, pages, NULL);
>
> What is the maximum numbet of pages which an unpriviliged user can
> concurrently pin with this code?

Since only root can open the tun device, it's currently OK.  The old code
kmalloced and copied: is there some mm-fu reason why pinning userspace memory
is worse?

But I actually think it's OK even for non-root, since these become skbs, which
means they either go into an outgoing device queue or a socket queue which is
accounted for exactly for this reason. 

> > +		if (unlikely(n < 0)) {
> > +			err = n;
> > +			goto fail;
> > +		}
> > +
> > +		/* Transfer pages to the frag array */
> > +		for (j = 0; j < n; j++) {
> > +			f[num_pg].page = pages[j];
> > +			if (j == 0) {
> > +				f[num_pg].page_offset = offset_in_page(base);
> > +				f[num_pg].size = min(len, PAGE_SIZE -
> > +						     f[num_pg].page_offset);
> > +			} else {
> > +				f[num_pg].page_offset = 0;
> > +				f[num_pg].size = min(len, PAGE_SIZE);
> > +			}
> > +			len -= f[num_pg].size;
> > +			base += f[num_pg].size;
> > +			num_pg++;
> > +		}
>
> This loop is a fancy way of doing
>
> 		num_pg = n;

Damn, you had me reworking this until I realized why.  It's not: we're
inside a loop, doing one iovec array element at a time.

> > +		if (unlikely(n != npages)) {
> > +			err = -EFAULT;
> > +			goto fail;
> > +		}
>
> why not do this immediately after running get_user_pages()?

To simplify the failure path.  Hmm, I would use release_pages here...

> > +fail:
> > +	for (i = 0; i < num_pg; i++)
> > +		put_page(f[i].page);
>
> release_pages() could be a tad more efficient, but it's only error-path.

... but I didn't know that existed.  Had to include pagemap.h, and it's not
exported.  It seems to be a useful interface; see patch.

Cheers,
Rusty.

Subject: Export release_pages; nice undo for get_user_pages.

Andrew Morton suggests tun/tap use release_pages, but it's not
exported.  It's not clear to me why this is in swap.c, but it exists
even without CONFIG_SWAP, so that's OK.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r abd2ad431e5c mm/swap.c
--- a/mm/swap.c	Sat Apr 19 00:34:54 2008 +1000
+++ b/mm/swap.c	Sat Apr 19 01:11:40 2008 +1000
@@ -346,6 +346,7 @@ void release_pages(struct page **pages, 
 
 	pagevec_free(&pages_to_free);
 }
+EXPORT_SYMBOL(release_pages);
 
 /*
  * The pages which we're about to release may be in the deferred lru-addition

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 15:15             ` Rusty Russell
@ 2008-04-18 16:24               ` Ray Lee
  2008-04-18 19:06               ` Andrew Morton
  2008-04-19  1:54               ` Andrew Morton
  2 siblings, 0 replies; 27+ messages in thread
From: Ray Lee @ 2008-04-18 16:24 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, netdev, Max Krasnyansky, virtualization, linux-kernel

On Fri, Apr 18, 2008 at 8:15 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Friday 18 April 2008 21:31:20 Andrew Morton wrote:
>  > On Fri, 18 Apr 2008 14:43:24 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> > > +           /* How many pages will this take? */
>  > > +           npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
>  >
>  > Brain hurts.  I hope you got that right.
>
>  I tested it when I wrote it, but just wrote a tester again:
>
>  base            len     npages
>  0               1       1
>  0xfff           1       1
>  0x1000          1       1
>  0               4096    1
>  0x1             4096    2
>  0xfff           4096    2
>  0x1000          4096    1
>  0xfffff000      4096    1
>  0xfffff000      4097    4293918722

Picky, picky :-).

If base were always page aligned, we'd want

    npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;

which is simply rounding len up to the next largest page size. So when
base is not page aligned, increase len by the left-over space at the
beginning, and then do the same calculation as above. (ie, pretend
base is page aligned, and instead count the excess at the beginning as
part of len.)

    npages = ( (base & PAGE_MASK) + len + PAGE_SIZE - 1) / PAGE_SIZE;

As long as len < PAGE_MASK - PAGE_SIZE, we're safe from overflows.

(The above gives a different answer when len=0, but you guard for that.)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 14:25             ` Ray Lee
@ 2008-04-18 18:01               ` pradeep singh rautela
  0 siblings, 0 replies; 27+ messages in thread
From: pradeep singh rautela @ 2008-04-18 18:01 UTC (permalink / raw)
  To: Ray Lee
  Cc: Rusty Russell, netdev, linux-kernel, Max Krasnyansky, virtualization

On Fri, Apr 18, 2008 at 7:55 PM, Ray Lee <ray-lk@madrabbit.org> wrote:
> On Fri, Apr 18, 2008 at 4:46 AM, pradeep singh rautela
>  <rautelap@gmail.com> wrote:
>  >
>  > On Fri, Apr 18, 2008 at 10:13 AM, Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> >  >  +               /* How many pages will this take? */
>  >  >  +               npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
>  >
>  >  Hi Rusty,
>  >  A trivial suggestion, how about
>  >           npages = 1+(len -1)/PAGE_SIZE ?
>
>  That's not the same. In particular, his version accounts for the
>  fractional page at the beginning, while yours doesn't.

Oh thanks for correcting me Ray. :)
>
>  While it's tempting to use algebra to simplify things, it's not safe
>  to do so when the expression involves division over the integers. The
>  only care-free integer math in a computer is subtraction and ++.

I stand corrected.

Sorry for noise.

Thanks,
-- 
Pradeep Singh Rautela
http://eagain.wordpress.com
http://emptydomain.googlepages.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18 14:32       ` Rusty Russell
@ 2008-04-18 18:59         ` Andrew Morton
  2008-04-18 19:38           ` Michael Kerrisk
  2008-04-19 15:02           ` Jonathan Corbet
  0 siblings, 2 replies; 27+ messages in thread
From: Andrew Morton @ 2008-04-18 18:59 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> > Isn't this kinda-sorta like what a relayfs file does?  The oprofile
> > buffers?  etc?  Nothing in common at all, no hope?
> 
> An excellent question, but I thought the modern kernel etiquette was to only 
> comment on whitespace and formatting, and call it "review"? :)
> 
> Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and 
> consumption can be out-of-order (kind of important for I/O buffers).
> 
> But the reason I'm not proposing it as a syscall is that I'm not convinced 
> it's the One True Solution which everyone should be using.  Time will tell: 
> it's clearly not tied to tun and it's been generically useful for virtual 
> I/O, but history has not been kind to new userspace interfaces.

This is may be our third high-bandwidth user/kernel interface to transport
bulk data ("hbukittbd") which was implemented because its predecessors
weren't quite right.  In a year or two's time someone else will need a
hbukittbd and will find that the existing three aren't quite right and will
give us another one.  One day we need to stop doing this ;)

It could be that this person will look at Rusty's hbukittbd and find that
it _could_ be tweaked to do what he wants, but it's already shipping and
it's part of the kernel API and hence can't be made to do what he wants.

So I think it would be good to plonk the proposed interface on the table
and have a poke at it.  Is it compat-safe?  Is it extensible in a
backward-compatible fashion?  Are there future-safe changes we should make
to it?  Can Michael Kerrisk understand, review and document it?  etc.

You know what I'm saying ;)  What is the proposed interface?

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 15:15             ` Rusty Russell
  2008-04-18 16:24               ` Ray Lee
@ 2008-04-18 19:06               ` Andrew Morton
  2008-04-19 14:41                 ` Rusty Russell
  2008-04-19  1:54               ` Andrew Morton
  2 siblings, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2008-04-18 19:06 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> >
> > What is the maximum numbet of pages which an unpriviliged user can
> > concurrently pin with this code?
> 
> Since only root can open the tun device, it's currently OK.  The old code
> kmalloced and copied: is there some mm-fu reason why pinning userspace memory
> is worse?

We generally try to avoid it - it allows users to dos the box.  Although I
suspect that direct-io presently permits users to transiently pin an amount
of memory which is proportional to the number of disks upon which they can
open files.

> Subject: Export release_pages; nice undo for get_user_pages.
> 
> Andrew Morton suggests tun/tap use release_pages, but it's not
> exported.  It's not clear to me why this is in swap.c, but it exists
> even without CONFIG_SWAP, so that's OK.
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> 
> diff -r abd2ad431e5c mm/swap.c
> --- a/mm/swap.c	Sat Apr 19 00:34:54 2008 +1000
> +++ b/mm/swap.c	Sat Apr 19 01:11:40 2008 +1000
> @@ -346,6 +346,7 @@ void release_pages(struct page **pages, 
>  
>  	pagevec_free(&pages_to_free);
>  }
> +EXPORT_SYMBOL(release_pages);
>  
>  /*
>   * The pages which we're about to release may be in the deferred lru-addition

acked-by: me.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18 18:59         ` Andrew Morton
@ 2008-04-18 19:38           ` Michael Kerrisk
  2008-04-19 16:41             ` Rusty Russell
  2008-04-19 15:02           ` Jonathan Corbet
  1 sibling, 1 reply; 27+ messages in thread
From: Michael Kerrisk @ 2008-04-18 19:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rusty Russell, netdev, Max Krasnyansky, virtualization,
	linux-kernel, Michael Kerrisk

On 4/18/08, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:
>
>  > > Isn't this kinda-sorta like what a relayfs file does?  The oprofile
>  > > buffers?  etc?  Nothing in common at all, no hope?
>  >
>  > An excellent question, but I thought the modern kernel etiquette was to only
>  > comment on whitespace and formatting, and call it "review"? :)
>  >
>  > Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and
>  > consumption can be out-of-order (kind of important for I/O buffers).
>  >
>  > But the reason I'm not proposing it as a syscall is that I'm not convinced
>  > it's the One True Solution which everyone should be using.  Time will tell:
>  > it's clearly not tied to tun and it's been generically useful for virtual
>  > I/O, but history has not been kind to new userspace interfaces.
>
>
> This is may be our third high-bandwidth user/kernel interface to transport
>  bulk data ("hbukittbd") which was implemented because its predecessors
>  weren't quite right.  In a year or two's time someone else will need a
>  hbukittbd and will find that the existing three aren't quite right and will
>  give us another one.  One day we need to stop doing this ;)
>
>  It could be that this person will look at Rusty's hbukittbd and find that
>  it _could_ be tweaked to do what he wants, but it's already shipping and
>  it's part of the kernel API and hence can't be made to do what he wants.
>
>  So I think it would be good to plonk the proposed interface on the table
>  and have a poke at it.  Is it compat-safe?  Is it extensible in a
>  backward-compatible fashion?  Are there future-safe changes we should make
>  to it?  Can Michael Kerrisk understand, review and document it?  etc.

Well, it helps if he's CCed....

I'm happy to work *with someone* on the documentation (pointless to do
it on my own -- how do I know what Rusty's *intended* behavior for the
interface is), and review, and testing.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 15:15             ` Rusty Russell
  2008-04-18 16:24               ` Ray Lee
  2008-04-18 19:06               ` Andrew Morton
@ 2008-04-19  1:54               ` Andrew Morton
  2 siblings, 0 replies; 27+ messages in thread
From: Andrew Morton @ 2008-04-19  1:54 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:

> It's not clear to me why this is in swap.c, but it exists
> even without CONFIG_SWAP, so that's OK.

We should have done mv mm/swap.c mm/mmlib.c years ago.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18  4:39   ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Rusty Russell
  2008-04-18  4:41     ` [PATCH 3/5] /dev/vring limit and base ioctls Rusty Russell
  2008-04-18 11:18     ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Andrew Morton
@ 2008-04-19 10:22     ` Evgeniy Polyakov
  2008-04-19 16:05       ` Rusty Russell
  2 siblings, 1 reply; 27+ messages in thread
From: Evgeniy Polyakov @ 2008-04-19 10:22 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

Hi.

On Fri, Apr 18, 2008 at 02:39:48PM +1000, Rusty Russell (rusty@rustcorp.com.au) wrote:

> +int vring_get_buffer(struct vring_info *vr,
> +		     struct iovec *in_iov,
> +		     unsigned int *num_in, unsigned long *in_len,
> +		     struct iovec *out_iov,
> +		     unsigned int *num_out, unsigned long *out_len)
> +{
> +	unsigned int i, in = 0, out = 0;
> +	unsigned long dummy;
> +	u16 avail, last_avail, head;
> +	struct vring_desc d;

Should this whole function and vring_used_buffer() be protected with
vr->lock mutex?

> +	if (unlikely(get_user(avail, &vr->ring.avail->idx)))
> +		return -EFAULT;
> +	if (unlikely(get_user(last_avail, &vring_last_avail(&vr->ring))))
> +		return -EFAULT;
> +
> +	if (last_avail == avail)
> +		return 0;
> +
> +	if (!in_len)
> +		in_len = &dummy;
> +	if (!out_len)
> +		out_len = &dummy;
> +
> +	*in_len = *out_len = 0;
> +
> +	if (unlikely(get_user(head, &vr->ring.avail->ring[last_avail
> +							  & vr->mask])))
> +		return -EFAULT;
> +
> +	i = head;
> +	do {
> +		if (unlikely(i >= vr->ring.num)) {
> +			pr_debug("vring: bad index: %u\n", i);
> +			return -EINVAL;
> +		}
> +
> +		if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
> +			return -EFAULT;
> +
> +		if (d.flags & VRING_DESC_F_WRITE) {
> +			/* Check for length and iovec overflows */
> +			if (!num_in) {
> +				pr_debug("vring: writable desc %u in ring %p\n",
> +					 i, vr->ring.desc);
> +				return -EINVAL;
> +			}
> +			if (in == *num_in || *in_len + d.len < *in_len)
> +				return -E2BIG;
> +			in_iov[in].iov_len = d.len;
> +			*in_len += d.len;
> +			in_iov[in].iov_base = (void __user *)(long)d.addr;
> +			in++;
> +		} else {
> +			if (!num_out) {
> +				pr_debug("vring: readable desc %u in ring %p\n",
> +					 i, vr->ring.desc);
> +				return -EINVAL;
> +			}
> +			if (out == *num_out || *out_len + d.len < *out_len)
> +				return -E2BIG;
> +			out_iov[out].iov_len = d.len;
> +			*out_len += d.len;
> +			out_iov[out].iov_base = (void __user *)(long)d.addr;
> +			out++;
> +		}
> +
> +		i = d.next;
> +	} while (d.flags & VRING_DESC_F_NEXT);
> +
> +	if (num_in)
> +		*num_in = in;
> +	if (num_out)
> +		*num_out = out;
> +
> +	last_avail++;
> +	put_user(last_avail, &vring_last_avail(&vr->ring));
> +
> +	/* 0 is a valid head, so add one. */
> +	return head + 1;
> +}
> +EXPORT_SYMBOL_GPL(vring_get_buffer);
> +
> +/**
> + * vring_used_buffer - return a used buffer to the vring
> + * @vr: the vring
> + * @id: the id returned from vring_get_buffer
> + * @len: the total bytes *written* to the buffer
> + */
> +void vring_used_buffer(struct vring_info *vr, int id, u32 len)
> +{
> +	struct vring_used_elem used;
> +	u16 used_idx;
> +
> +	BUG_ON(id <= 0 || id > vr->ring.num);
> +
> +	used.id = id - 1;
> +	used.len = len;
> +	if (get_user(used_idx, &vr->ring.used->idx) != 0)
> +		return;
> +
> +	if (copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
> +			 sizeof(used)))
> +		return;
> +
> +	wmb();
> +	used_idx++;
> +	put_user(used_idx, &vr->ring.used->idx);
> +}
> +EXPORT_SYMBOL_GPL(vring_used_buffer);

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-18 19:06               ` Andrew Morton
@ 2008-04-19 14:41                 ` Rusty Russell
  2008-04-19 17:51                   ` Andrew Morton
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-19 14:41 UTC (permalink / raw)
  To: Andrew Morton; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Saturday 19 April 2008 05:06:34 Andrew Morton wrote:
> On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell <rusty@rustcorp.com.au> 
wrote:
> > > What is the maximum numbet of pages which an unpriviliged user can
> > > concurrently pin with this code?
> >
> > Since only root can open the tun device, it's currently OK.  The old code
> > kmalloced and copied: is there some mm-fu reason why pinning userspace
> > memory is worse?
>
> We generally try to avoid it - it allows users to dos the box.

My question is: is pinning a page worse than allocating a (kernel) page in 
some way?

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18 18:59         ` Andrew Morton
  2008-04-18 19:38           ` Michael Kerrisk
@ 2008-04-19 15:02           ` Jonathan Corbet
  1 sibling, 0 replies; 27+ messages in thread
From: Jonathan Corbet @ 2008-04-19 15:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: netdev, Max Krasnyansky, virtualization, Rusty Russell, linux-kernel

> So I think it would be good to plonk the proposed interface on the table
> and have a poke at it.  Is it compat-safe?  Is it extensible in a
> backward-compatible fashion?  Are there future-safe changes we should make
> to it?  Can Michael Kerrisk understand, review and document it?  etc.
> 
> You know what I'm saying ;)  What is the proposed interface?

So, I'm not Michael, but I *did* make an attempt to document this
interface - user and kernel sides - so that it could be more easily
understood:

	http://lwn.net/Articles/276856/

That was the previous posting, but a quick look suggests it hasn't
changed *that* much in this round.

jon

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-19 10:22     ` Evgeniy Polyakov
@ 2008-04-19 16:05       ` Rusty Russell
  2008-04-19 16:33         ` Evgeniy Polyakov
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-19 16:05 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Saturday 19 April 2008 20:22:15 Evgeniy Polyakov wrote:
> Hi.
>
> On Fri, Apr 18, 2008 at 02:39:48PM +1000, Rusty Russell 
(rusty@rustcorp.com.au) wrote:
> > +int vring_get_buffer(struct vring_info *vr,
> > +		     struct iovec *in_iov,
> > +		     unsigned int *num_in, unsigned long *in_len,
> > +		     struct iovec *out_iov,
> > +		     unsigned int *num_out, unsigned long *out_len)
> > +{
> > +	unsigned int i, in = 0, out = 0;
> > +	unsigned long dummy;
> > +	u16 avail, last_avail, head;
> > +	struct vring_desc d;
>
> Should this whole function and vring_used_buffer() be protected with
> vr->lock mutex?

No; it's up to the caller to make sure that they are serialized.  In the case 
of tun that happens naturally.

There are two reasons not to grab the lock.  It turns out that if we tried to 
lock here, we'd deadlock, since the callbacks are called under the lock.  
Secondly, it's possible to implement an atomic vring_used_buffer variant, 
which could fail: this would avoid using the thread most of the time.

Hope that helps,
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-19 16:05       ` Rusty Russell
@ 2008-04-19 16:33         ` Evgeniy Polyakov
  2008-04-19 16:45           ` Rusty Russell
  0 siblings, 1 reply; 27+ messages in thread
From: Evgeniy Polyakov @ 2008-04-19 16:33 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Sun, Apr 20, 2008 at 02:05:31AM +1000, Rusty Russell (rusty@rustcorp.com.au) wrote:
> > Should this whole function and vring_used_buffer() be protected with
> > vr->lock mutex?
> 
> No; it's up to the caller to make sure that they are serialized.  In the case 
> of tun that happens naturally.
> 
> There are two reasons not to grab the lock.  It turns out that if we tried to 
> lock here, we'd deadlock, since the callbacks are called under the lock.  
> Secondly, it's possible to implement an atomic vring_used_buffer variant, 
> which could fail: this would avoid using the thread most of the time.

Yep, I decided that too. But it limits its usage to tun only or any
other system where only single thread picks up results, so no generic
userspace ring buffers?

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-18 19:38           ` Michael Kerrisk
@ 2008-04-19 16:41             ` Rusty Russell
  2008-04-20  0:16               ` David Miller
  0 siblings, 1 reply; 27+ messages in thread
From: Rusty Russell @ 2008-04-19 16:41 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Andrew Morton, netdev, Max Krasnyansky, virtualization, linux-kernel

On Saturday 19 April 2008 05:38:50 Michael Kerrisk wrote:
> On 4/18/08, Andrew Morton <akpm@linux-foundation.org> wrote:
> > This is may be our third high-bandwidth user/kernel interface to
> > transport bulk data ("hbukittbd") which was implemented because its
> > predecessors weren't quite right.  In a year or two's time someone else
> > will need a hbukittbd and will find that the existing three aren't quite
> > right and will give us another one.  One day we need to stop doing this
> > ;)

If only there were some kind of, I don't know... summit... for kernel 
people... 

> >  It could be that this person will look at Rusty's hbukittbd and find
> > that it _could_ be tweaked to do what he wants, but it's already shipping
> > and it's part of the kernel API and hence can't be made to do what he
> > wants.

Indeed.  I marked it experimental because of these questions (ie. it's not yet 
kernel ABI).  Getting everyone's attention is hard tho, so I figured we put 
it in as a device and moving to a syscall if and when we feel it's ready.

> >  So I think it would be good to plonk the proposed interface on the table
> >  and have a poke at it.  Is it compat-safe?  Is it extensible in a
> >  backward-compatible fashion?  Are there future-safe changes we should
> > make to it?  Can Michael Kerrisk understand, review and document it? 
> > etc.
>
> Well, it helps if he's CCed....

It is compat safe, and we've already extended it once, so I'm reasonably happy 
so far.  If it were a syscall I'd add a flags arg, for the device it'd be an 
ioctl.  Starting with the virtio ABI seemed a reasonable first step, because 
*we* can use this today even if noone else does.

> I'm happy to work *with someone* on the documentation (pointless to do
> it on my own -- how do I know what Rusty's *intended* behavior for the
> interface is), and review, and testing.

Document coming up...
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-19 16:33         ` Evgeniy Polyakov
@ 2008-04-19 16:45           ` Rusty Russell
  0 siblings, 0 replies; 27+ messages in thread
From: Rusty Russell @ 2008-04-19 16:45 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, Max Krasnyansky, virtualization, linux-kernel

On Sunday 20 April 2008 02:33:22 Evgeniy Polyakov wrote:
> On Sun, Apr 20, 2008 at 02:05:31AM +1000, Rusty Russell 
(rusty@rustcorp.com.au) wrote:
> > There are two reasons not to grab the lock.  It turns out that if we
> > tried to lock here, we'd deadlock, since the callbacks are called under
> > the lock. Secondly, it's possible to implement an atomic
> > vring_used_buffer variant, which could fail: this would avoid using the
> > thread most of the time.
>
> Yep, I decided that too. But it limits its usage to tun only or any
> other system where only single thread picks up results, so no generic
> userspace ring buffers?

I don't think so, it just externalizes the locking.  The mutex protects the 
attaching and detaching of the ops structure, some other lock or code 
protects simultenous kernel ring accesses.

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/5] tun: vringfd xmit support.
  2008-04-19 14:41                 ` Rusty Russell
@ 2008-04-19 17:51                   ` Andrew Morton
  0 siblings, 0 replies; 27+ messages in thread
From: Andrew Morton @ 2008-04-19 17:51 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, maxk, virtualization, linux-kernel

> On Sun, 20 Apr 2008 00:41:43 +1000 Rusty Russell <rusty@rustcorp.com.au> wrote:
> On Saturday 19 April 2008 05:06:34 Andrew Morton wrote:
> > On Sat, 19 Apr 2008 01:15:15 +1000 Rusty Russell <rusty@rustcorp.com.au> 
> wrote:
> > > > What is the maximum numbet of pages which an unpriviliged user can
> > > > concurrently pin with this code?
> > >
> > > Since only root can open the tun device, it's currently OK.  The old code
> > > kmalloced and copied: is there some mm-fu reason why pinning userspace
> > > memory is worse?
> >
> > We generally try to avoid it - it allows users to dos the box.
> 
> My question is: is pinning a page worse than allocating a (kernel) page in 
> some way?
> 

I guess pinning is not as bad as straight-out allocating.

Pinning is limited to the size of the program's VM.  Pinning
it at least pining something which is accounted and is exposed
to admin tools.

But they're both pretty similar in effect and risk.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.
  2008-04-19 16:41             ` Rusty Russell
@ 2008-04-20  0:16               ` David Miller
  0 siblings, 0 replies; 27+ messages in thread
From: David Miller @ 2008-04-20  0:16 UTC (permalink / raw)
  To: rusty; +Cc: mtk.manpages, akpm, netdev, maxk, virtualization, linux-kernel

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sun, 20 Apr 2008 02:41:14 +1000

> If only there were some kind of, I don't know... summit... for kernel 
> people... 

I'm starting to disbelieve the myth that because we can discuss
technical issues on mailing lists, we should talk primarily about
process issues during the kernel summit.

There is a distinct advantage to discussing and hashing things out in
person.  You can't say "screw you, your idea sucks" when you're face
to face with the other person, whereas online it's way too easy.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2008-04-20  0:16 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-18  4:33 [PATCH 0/5] High-speed tun receive and xmit Rusty Russell
2008-04-18  4:35 ` [PATCH 1/5] virtio: put last_used and last_avail index into ring itself Rusty Russell
2008-04-18  4:39   ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Rusty Russell
2008-04-18  4:41     ` [PATCH 3/5] /dev/vring limit and base ioctls Rusty Russell
2008-04-18  4:42       ` [PATCH 4/5] tun: vringfd receive support Rusty Russell
2008-04-18  4:43         ` [PATCH 5/5] tun: vringfd xmit support Rusty Russell
2008-04-18 11:31           ` Andrew Morton
2008-04-18 15:15             ` Rusty Russell
2008-04-18 16:24               ` Ray Lee
2008-04-18 19:06               ` Andrew Morton
2008-04-19 14:41                 ` Rusty Russell
2008-04-19 17:51                   ` Andrew Morton
2008-04-19  1:54               ` Andrew Morton
2008-04-18 11:46           ` pradeep singh rautela
2008-04-18 14:25             ` Ray Lee
2008-04-18 18:01               ` pradeep singh rautela
2008-04-18 11:18     ` [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface Andrew Morton
2008-04-18 14:32       ` Rusty Russell
2008-04-18 18:59         ` Andrew Morton
2008-04-18 19:38           ` Michael Kerrisk
2008-04-19 16:41             ` Rusty Russell
2008-04-20  0:16               ` David Miller
2008-04-19 15:02           ` Jonathan Corbet
2008-04-19 10:22     ` Evgeniy Polyakov
2008-04-19 16:05       ` Rusty Russell
2008-04-19 16:33         ` Evgeniy Polyakov
2008-04-19 16:45           ` Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).