* [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
@ 2010-02-10 11:48 Xin Xiaohui
  2010-02-10 11:48 ` [PATCH 1/3] A device for zero-copy based " Xin Xiaohui
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Xin Xiaohui @ 2010-02-10 11:48 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mingo, mst, jdike

The idea is simple: pin the guest VM user space buffers and then
let the host NIC driver have the chance to DMA directly into them.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops such as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copy-less data transfer through the guest virtio-net frontend.
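
To give an idea of the host-side usage, here is a minimal user-space
sketch of binding an ethX interface through the new /dev/net/mp
character device from patch 1/3. It assumes the mpassthru.h header is
exported to user space; handing the resulting fd to vhost-net as its
backend is assumed and not shown here:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/mpassthru.h>

/* Illustrative only: bind a host NIC to the mpassthru device.  The
 * returned fd is what a management tool would hand to vhost-net as
 * its backend. */
static int bind_mp(const char *ifname)
{
	struct ifreq ifr;
	int fd = open("/dev/net/mp", O_RDWR);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	if (ioctl(fd, MPASSTHRU_BINDDEV, &ifr) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}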

We also provide multiple submits and asynchronous notification to
vhost-net.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. In a simple netperf
test we saw both bandwidth and CPU usage go up, but the bandwidth
increase is much larger than the CPU usage increase.

What we have not done yet:
	To support GRO
	Performance tuning


* [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
  2010-02-10 11:48 [PATCH 0/3] Provide a zero-copy method on KVM virtio-net Xin Xiaohui
@ 2010-02-10 11:48 ` Xin Xiaohui
  2010-02-10 11:48   ` [PATCH 2/3] Provides multiple submits and asynchronous notifications Xin Xiaohui
  2010-02-10 15:17   ` [PATCH 1/3] A device for zero-copy based on KVM virtio-net Eric Dumazet
  2010-02-10 13:40 ` [PATCH 0/3] Provide a zero-copy method " Arnd Bergmann
  2010-02-11  8:54   ` Xin, Xiaohui
  2 siblings, 2 replies; 11+ messages in thread
From: Xin Xiaohui @ 2010-02-10 11:48 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mingo, mst, jdike; +Cc: Xin Xiaohui, Zhao Yu

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between the guest FE and the host NIC.
It pins the guest user space buffers into host memory and
provides proto_ops such as sendmsg/recvmsg to vhost-net.
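
A rough sketch of the backend side of this contract (this mirrors what
patch 2/3 ends up doing in handle_tx(); the helper below is only an
illustration and not part of the series):

static int mp_backend_xmit(struct socket *sock, struct vhost_virtqueue *vq,
			   unsigned int out)
{
	/* msg_control carries the virtqueue so mp_sendmsg() can queue an
	 * asynchronous completion notifier against it later. */
	struct msghdr msg = {
		.msg_iov     = vq->iov,
		.msg_iovlen  = out,
		.msg_control = vq,
	};

	return sock->ops->sendmsg(NULL, sock, &msg, iov_length(vq->iov, out));
}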

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81@gmail.com>
Signed-off-by: Jeff Dike <jdike@c2.user-mode-linux.org>
---
 drivers/vhost/Kconfig      |    5 +
 drivers/vhost/Makefile     |    2 +
 drivers/vhost/mpassthru.c  | 1178 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/miscdevice.h |    1 +
 include/linux/mpassthru.h  |   17 +
 5 files changed, 1203 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
 	  To compile this driver as a module, choose M here: the module will
 	  be called vhost_net.
 
+config VHOST_PASSTHRU
+	tristate "Zerocopy network driver (EXPERIMENTAL)"
+	depends on VHOST_NET
+	---help---
+	  zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..d8d153f
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1178 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME        "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+#include "vhost.h"
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+	u16     offset;
+	u16     size;
+};
+
+struct page_ctor {
+	struct list_head        readq;
+	int 			w_len;
+	int 			r_len;
+	spinlock_t      	read_lock;
+	atomic_t        	refcnt;
+	struct kmem_cache   	*cache;
+	struct net_device   	*dev;
+	struct netdev_page_ctor ctor;
+	void 			*sendctrl;
+	void 			*recvctrl;
+};
+
+struct page_info {
+	struct list_head    	list;
+	int         		header;
+	/* indicate the actual length of bytes
+	 * sent/received in the user space buffers
+	 */
+	int         		total;
+	int         		offset;
+	struct page     	*pages[MAX_SKB_FRAGS+1];
+	struct skb_frag_struct 	frag[MAX_SKB_FRAGS+1];
+	struct sk_buff      	*skb;
+	struct page_ctor   	*ctor;
+
+	/* The pointer relayed to skb, to indicate
+	 * it's a user space allocated skb or kernel
+	 */
+	struct skb_user_page    user;
+	struct skb_shared_info	ushinfo;
+
+#define INFO_READ      		0
+#define INFO_WRITE     		1
+	unsigned        	flags;
+	unsigned        	pnum;
+
+	/* It's meaningful for receive, means
+	 * the max length allowed
+	 */
+	size_t          	len;
+
+	/* The fields after this are for the backend
+	 * driver, currently vhost-net.
+	 */
+	struct vhost_notifier	notifier;
+	unsigned int    	desc_pos;
+	unsigned int 		log;
+	struct iovec 		hdr[VHOST_NET_MAX_SG];
+	struct iovec 		iov[VHOST_NET_MAX_SG];
+	void 			*ctl;
+};
+
+struct mp_struct {
+	struct mp_file   	*mfile;
+	struct net_device       *dev;
+	struct page_ctor	*ctor;
+	struct socket           socket;
+
+#ifdef MPASSTHRU_DEBUG
+	int debug;
+#endif
+};
+
+struct mp_file {
+	atomic_t count;
+	struct mp_struct *mp;
+	struct net *net;
+};
+
+struct mp_sock {
+	struct sock            	sk;
+	struct mp_struct       	*mp;
+};
+
+/* The main function to allocate user space buffers */
+static struct skb_user_page *page_ctor(struct netdev_page_ctor *page_ctor,
+		struct sk_buff *skb, int npages)
+{
+	int i;
+	unsigned long flags;
+	struct page_ctor *ctor;
+	struct page_info *info = NULL;
+
+	ctor = container_of(page_ctor, struct page_ctor, ctor);
+
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq, struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	if (!info)
+		return NULL;
+
+	for (i = 0; i < info->pnum; i++) {
+		get_page(info->pages[i]);
+		info->frag[i].page = info->pages[i];
+		info->frag[i].page_offset = i ? 0 : info->offset;
+		info->frag[i].size = page_ctor->npages > 1 ? PAGE_SIZE :
+			page_ctor->data_len;
+	}
+	info->skb = skb;
+	info->user.frags = info->frag;
+	info->user.ushinfo = &info->ushinfo;
+	return &info->user;
+}
+
+static struct vhost_notifier *create_vhost_notifier(struct vhost_virtqueue *vq,
+			struct page_info *info, int size);
+
+static void mp_vhost_notifier_dtor(struct vhost_notifier *vnotify)
+{
+	struct page_info *info = (struct page_info *)(vnotify->ctrl);
+	int i;
+
+	for (i = 0; i < info->pnum; i++) {
+		if (i <= skb_shinfo(info->skb)->nr_frags &&
+				info->flags == INFO_WRITE)
+			info->pages[i] = NULL;
+		if (info->pages[i])
+			put_page(info->pages[i]);
+	}
+
+	if (info->flags == INFO_READ) {
+		skb_shinfo(info->skb)->destructor_arg = &info->user;
+		info->skb->destructor = NULL;
+		kfree(info->skb);
+	}
+
+	kmem_cache_free(info->ctor->cache, info);
+
+	return;
+}
+
+/* A helper to clean the skb before the kfree_skb() */
+
+static void page_dtor_prepare(struct page_info *info)
+{
+	if (info->flags == INFO_READ)
+		if (info->skb)
+			info->skb->head = NULL;
+}
+
+/* The callback to destruct the user space buffers or skb */
+static void page_dtor(struct skb_user_page *user)
+{
+	struct page_info *info;
+	struct page_ctor *ctor;
+	struct sock *sk;
+	struct sk_buff *skb;
+	struct vhost_notifier *vnotify;
+	struct vhost_virtqueue *vq = NULL;
+	unsigned long flags;
+	int i;
+
+	if (!user)
+		return;
+	info = container_of(user, struct page_info, user);
+	if (!info)
+		return;
+	ctor = info->ctor;
+	skb = info->skb;
+
+	page_dtor_prepare(info);
+
+	/* If the info->total is 0, make it to be reused */
+	if (!info->total) {
+		spin_lock_irqsave(&ctor->read_lock, flags);
+		list_add(&info->list, &ctor->readq);
+		spin_unlock_irqrestore(&ctor->read_lock, flags);
+		return;
+	}
+
+	/* Receive buffers, should be destructed */
+	if (info->flags == INFO_READ) {
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		info->skb = NULL;
+		return;
+	}
+
+	/* For transmit, we should wait for the DMA finish by hardware.
+	 * Queue the notifier to wake up the backend driver
+	 */
+	vq = (struct vhost_virtqueue *)info->ctl;
+	vnotify = create_vhost_notifier(vq, info, info->total);
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	list_add_tail(&vnotify->list, &vq->notifier);
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+
+	sk = ctor->ctor.sock->sk;
+	sk->sk_write_space(sk);
+
+	return;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+	int rc;
+	struct page_ctor *ctor;
+	struct net_device *dev = mp->dev;
+
+	rcu_read_lock();
+	if (rcu_dereference(mp->ctor)) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+	rcu_read_unlock();
+
+	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
+	if (!ctor)
+		return -ENOMEM;
+	rc = netdev_page_ctor_prep(dev, &ctor->ctor);
+	if (rc)
+		goto fail;
+
+	ctor->cache = kmem_cache_create("skb_page_info",
+			sizeof(struct page_info), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+
+	if (!ctor->cache)
+		goto cache_fail;
+
+	INIT_LIST_HEAD(&ctor->readq);
+	spin_lock_init(&ctor->read_lock);
+
+	ctor->w_len = 0;
+	ctor->r_len = 0;
+
+	dev_hold(dev);
+	ctor->dev = dev;
+	ctor->ctor.ctor = page_ctor;
+	ctor->ctor.sock = &mp->socket;
+	atomic_set(&ctor->refcnt, 1);
+
+	rc = netdev_page_ctor_attach(dev, &ctor->ctor);
+	if (rc)
+		goto fail;
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, ctor);
+
+	/* XXX:Need we do set_offload here ? */
+
+	return 0;
+
+fail:
+	kmem_cache_destroy(ctor->cache);
+cache_fail:
+	kfree(ctor);
+	dev_put(dev);
+
+	return rc;
+}
+
+
+static inline void get_page_ctor(struct page_ctor *ctor)
+{
+       atomic_inc(&ctor->refcnt);
+}
+
+static inline void put_page_ctor(struct page_ctor *ctor)
+{
+	if (atomic_dec_and_test(&ctor->refcnt))
+		kfree(ctor);
+}
+
+struct page_info *info_dequeue(struct page_ctor *ctor)
+{
+	unsigned long flags;
+	struct page_info *info = NULL;
+	spin_lock_irqsave(&ctor->read_lock, flags);
+	if (!list_empty(&ctor->readq)) {
+		info = list_first_entry(&ctor->readq,
+				struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&ctor->read_lock, flags);
+	return info;
+}
+
+static int page_ctor_detach(struct mp_struct *mp)
+{
+	struct page_ctor *ctor;
+	struct page_info *info;
+	int i;
+
+	rcu_read_lock();
+	ctor = rcu_dereference(mp->ctor);
+	rcu_read_unlock();
+
+	if (!ctor)
+		return -ENODEV;
+
+	while ((info = info_dequeue(ctor))) {
+		for (i = 0; i < info->pnum; i++)
+			if (info->pages[i])
+				put_page(info->pages[i]);
+		kmem_cache_free(ctor->cache, info);
+	}
+	kmem_cache_destroy(ctor->cache);
+	netdev_page_ctor_detach(ctor->dev);
+	dev_put(ctor->dev);
+
+	/* locked by mp_mutex */
+	rcu_assign_pointer(mp->ctor, NULL);
+	synchronize_rcu();
+
+	put_page_ctor(ctor);
+
+	return 0;
+}
+
+/* For small user space buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
+		int total)
+{
+	struct page_info *info = kmem_cache_alloc(ctor->cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	memset(info, 0, sizeof(struct page_info));
+	memset(info->pages, 0, sizeof(info->pages));
+
+	info->header = 0;
+	info->total = total;
+	info->skb = NULL;
+	info->user.dtor = page_dtor;
+	info->ctor = ctor;
+	info->flags = INFO_WRITE;
+	info->pnum = 0;
+	return info;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the user space address.
+ */
+static struct page_info *alloc_page_info(struct page_ctor *ctor,
+			struct iovec *iov, int count, struct frag *frags,
+			int npages, int total)
+{
+	int rc;
+	int i, j, n = 0;
+	int len;
+	unsigned long base;
+	struct page_info *info = kmem_cache_alloc(ctor->cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	memset(info, 0, sizeof(struct page_info));
+	memset(info->pages, 0, sizeof(info->pages));
+
+	down_read(&current->mm->mmap_sem);
+	for (i = j = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+
+		if (!len)
+			continue;
+		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+		rc = get_user_pages(current, current->mm, base, n,
+				npages ? 1 : 0, 0, &info->pages[j], NULL);
+		if (rc != n) {
+			up_read(&current->mm->mmap_sem);
+			goto failed;
+		}
+
+		while (n--) {
+			frags[j].offset = base & ~PAGE_MASK;
+			frags[j].size = min_t(int, len,
+					PAGE_SIZE - frags[j].offset);
+			len -= frags[j].size;
+			base += frags[j].size;
+			j++;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+
+#ifdef CONFIG_HIGHMEM
+	if (npages && !(ctor->dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < j; i++) {
+			if (PageHighMem(info->pages[i]))
+				goto failed;
+		}
+	}
+#endif
+
+	info->header = 0;
+	info->total = total;
+	info->skb = NULL;
+	info->user.dtor = page_dtor;
+	info->ctor = ctor;
+	info->pnum = j;
+
+	if (!npages)
+		info->flags = INFO_WRITE;
+	if (info->flags == INFO_READ) {
+		info->user.start = (u8 *)(((unsigned long)
+				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
+				frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD);
+		info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD;
+	}
+	return info;
+
+failed:
+	for (i = 0; i < j; i++)
+		put_page(info->pages[i]);
+
+	kmem_cache_free(ctor->cache, info);
+
+	return NULL;
+}
+
+struct page_ctor *mp_rcu_get_ctor(struct page_ctor *ctor)
+{
+	struct page_ctor *_ctor = NULL;
+
+	rcu_read_lock();
+	_ctor = rcu_dereference(ctor);
+	rcu_read_unlock();
+
+	if (!_ctor) {
+		DBG(KERN_INFO "Device %s cannot do mediate passthru.\n",
+				ctor->dev->name);
+		return NULL;
+	}
+	if (_ctor)
+		get_page_ctor(_ctor);
+	return _ctor;
+}
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(m->msg_control);
+	struct iovec *iov = m->msg_iov;
+	struct page_info *info = NULL;
+	struct frag frags[MAX_SKB_FRAGS];
+	struct sk_buff *skb;
+	int count = m->msg_iovlen;
+	int total = 0, header, n, i, len, rc;
+	unsigned long base;
+
+	ctor = mp_rcu_get_ctor(mp->ctor);
+	if (!ctor)
+		return -ENODEV;
+
+	ctor->sendctrl = vq;
+
+	total = iov_length(iov, count);
+
+	if (total < ETH_HLEN) {
+		put_page_ctor(ctor);
+		return -EINVAL;
+	}
+
+	if (total <= COPY_THRESHOLD)
+		goto copy;
+
+	n = 0;
+	for (i = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		if (!len)
+			continue;
+		n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (n > MAX_SKB_FRAGS) {
+			put_page_ctor(ctor);
+			return -EINVAL;
+		}
+	}
+
+copy:
+	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+	skb = alloc_skb(header + NET_IP_ALIGN, GFP_ATOMIC);
+	if (!skb)
+		goto drop;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	skb_set_network_header(skb, ETH_HLEN);
+
+	memcpy_fromiovec(skb->data, iov, header);
+	skb_put(skb, header);
+	skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+
+	if (header == total) {
+		rc = total;
+		info = alloc_small_page_info(ctor, total);
+	} else {
+		info = alloc_page_info(ctor, iov, count, frags, 0, total);
+		if (info)
+			for (i = 0; info->pages[i]; i++) {
+				skb_add_rx_frag(skb, i, info->pages[i],
+						frags[i].offset, frags[i].size);
+				info->pages[i] = NULL;
+			}
+	}
+	if (info != NULL) {
+		info->desc_pos = vq->head;
+		info->ctl = vq;
+		info->total = total;
+		info->skb = skb;
+		skb_shinfo(skb)->destructor_arg = &info->user;
+		skb->dev = mp->dev;
+		dev_queue_xmit(skb);
+		mp->dev->stats.tx_packets++;
+		mp->dev->stats.tx_bytes += total;
+		put_page_ctor(ctor);
+		return 0;
+	}
+drop:
+	kfree_skb(skb);
+	if (info) {
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(info->ctor->cache, info);
+	}
+	mp->dev->stats.tx_dropped++;
+	put_page_ctor(ctor);
+	return -ENOMEM;
+}
+
+
+static struct vhost_notifier *create_vhost_notifier(struct vhost_virtqueue *vq,
+			struct page_info *info, int size)
+{
+	struct vhost_notifier *vnotify = NULL;
+
+	vnotify = &info->notifier;
+	memset(vnotify, 0, sizeof(struct vhost_notifier));
+	vnotify->vq = vq;
+	vnotify->head = info->desc_pos;
+	vnotify->size = size;
+	vnotify->log = info->log;
+	vnotify->ctrl = (void *)info;
+	vnotify->dtor = mp_vhost_notifier_dtor;
+	return vnotify;
+}
+
+static void mp_recvmsg_notify(struct vhost_virtqueue *vq)
+{
+	struct socket *sock = vq->private_data;
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor = NULL;
+	struct sk_buff *skb = NULL;
+	struct page_info *info = NULL;
+	struct ethhdr *eth;
+	struct vhost_notifier *vnotify = NULL;
+	int len, i;
+	unsigned long flags;
+
+	struct virtio_net_hdr hdr = {
+		.flags = 0,
+		.gso_type = VIRTIO_NET_HDR_GSO_NONE
+	};
+
+	ctor = mp_rcu_get_ctor(mp->ctor);
+	if (!ctor)
+		return;
+
+	while ((skb = skb_dequeue(&sock->sk->sk_receive_queue)) != NULL) {
+		if (skb_shinfo(skb)->destructor_arg) {
+			info = container_of(skb_shinfo(skb)->destructor_arg,
+					struct page_info, user);
+			info->skb = skb;
+			if (skb->len > info->len) {
+				mp->dev->stats.rx_dropped++;
+				DBG(KERN_INFO "Discarded truncated rx packet: "
+					" len %d > %zd\n", skb->len, info->len);
+				info->total = skb->len;
+				goto clean;
+			} else {
+				int i;
+				struct skb_shared_info *gshinfo =
+				(struct skb_shared_info *)(&info->ushinfo);
+				struct skb_shared_info *hshinfo =
+						skb_shinfo(skb);
+
+				if (gshinfo->nr_frags < hshinfo->nr_frags)
+					goto clean;
+				eth = eth_hdr(skb);
+				skb_push(skb, ETH_HLEN);
+
+				hdr.hdr_len = skb_headlen(skb);
+				info->total = skb->len;
+
+				for (i = 0; i < gshinfo->nr_frags; i++)
+					gshinfo->frags[i].size = 0;
+				for (i = 0; i < hshinfo->nr_frags; i++)
+					gshinfo->frags[i].size =
+						hshinfo->frags[i].size;
+				memcpy(skb_shinfo(skb), &info->ushinfo,
+						sizeof(struct skb_shared_info));
+			}
+		} else {
+			/* The skb was composed from kernel buffers
+			 * because user space buffers were not sufficient.
+			 * This case should be rare.
+			 */
+			unsigned long flags;
+			int i;
+			struct skb_shared_info *gshinfo = NULL;
+
+			info = NULL;
+
+			spin_lock_irqsave(&ctor->read_lock, flags);
+			if (!list_empty(&ctor->readq)) {
+				info = list_first_entry(&ctor->readq,
+						struct page_info, list);
+				list_del(&info->list);
+			}
+			spin_unlock_irqrestore(&ctor->read_lock, flags);
+			if (!info) {
+				DBG(KERN_INFO "No user buffer avaliable %p\n",
+									skb);
+				skb_queue_head(&sock->sk->sk_receive_queue,
+									skb);
+				break;
+			}
+			info->skb = skb;
+			/* compute the guest skb frags info */
+			gshinfo = (struct skb_shared_info *)(info->user.start +
+					SKB_DATA_ALIGN(info->user.size));
+
+			if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+				goto clean;
+
+			eth = eth_hdr(skb);
+			skb_push(skb, ETH_HLEN);
+			info->total = skb->len;
+
+			for (i = 0; i < gshinfo->nr_frags; i++)
+				gshinfo->frags[i].size = 0;
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+				gshinfo->frags[i].size =
+					skb_shinfo(skb)->frags[i].size;
+			hdr.hdr_len = min_t(int, skb->len,
+						info->iov[1].iov_len);
+			skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+		}
+
+		len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+								 sizeof hdr);
+		if (len) {
+			DBG(KERN_INFO
+				"Unable to write vnet_hdr at addr %p: %d\n",
+				info->hdr->iov_base, len);
+			goto clean;
+		}
+		vnotify = create_vhost_notifier(vq, info,
+				skb->len + sizeof(hdr));
+
+		spin_lock_irqsave(&vq->notify_lock, flags);
+		list_add_tail(&vnotify->list, &vq->notifier);
+		spin_unlock_irqrestore(&vq->notify_lock, flags);
+		continue;
+
+clean:
+		kfree_skb(skb);
+		for (i = 0; info->pages[i]; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ctor->cache, info);
+	}
+	put_page_ctor(ctor);
+	return;
+}
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len,
+		int flags)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_ctor *ctor;
+	struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(m->msg_control);
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+	int npages, payload;
+	struct page_info *info;
+	struct frag frags[MAX_SKB_FRAGS];
+	unsigned long base;
+	int i, len;
+	unsigned long flag;
+
+	if (!(flags & MSG_DONTWAIT))
+		return -EINVAL;
+
+	ctor = mp_rcu_get_ctor(mp->ctor);
+	if (!ctor)
+		return -EINVAL;
+
+	ctor->recvctrl = vq;
+
+	/* Error detection in case of an invalid user space buffer */
+	if (count > 2 && iov[1].iov_len < ctor->ctor.hdr_len &&
+			mp->dev->features & NETIF_F_SG) {
+		put_page_ctor(ctor);
+		return -EINVAL;
+	}
+
+	npages = ctor->ctor.npages;
+	payload = ctor->ctor.data_len;
+
+	/* If the KVM guest virtio-net FE driver uses the SG feature */
+	if (count > 2) {
+		for (i = 2; i < count; i++) {
+			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+			len = iov[i].iov_len;
+			if (npages == 1)
+				len = min_t(int, len, PAGE_SIZE - base);
+			else if (base)
+				break;
+			payload -= len;
+			if (payload <= 0)
+				goto proceed;
+			if (npages == 1 || (len & ~PAGE_MASK))
+				break;
+		}
+	}
+
+	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+		goto proceed;
+
+	put_page_ctor(ctor);
+	return -EINVAL;
+
+proceed:
+	/* skip the virtnet head */
+	iov++;
+	count--;
+
+	/* Translate address to kernel */
+	info = alloc_page_info(ctor, iov, count, frags, npages, 0);
+	if (!info) {
+		put_page_ctor(ctor);
+		return -ENOMEM;
+	}
+
+	info->len = total_len;
+	info->hdr[0].iov_base = vq->hdr[0].iov_base;
+	info->hdr[0].iov_len = vq->hdr[0].iov_len;
+	info->offset = frags[0].offset;
+	info->desc_pos = vq->head;
+	info->log = vq->_log;
+	info->ctl = NULL;
+
+	iov--;
+	count++;
+
+	memcpy(info->iov, vq->iov, sizeof(struct iovec) * count);
+
+	spin_lock_irqsave(&ctor->read_lock, flag);
+	list_add_tail(&info->list, &ctor->readq);
+	spin_unlock_irqrestore(&ctor->read_lock, flag);
+
+	if (!vq->receiver)
+		vq->receiver = mp_recvmsg_notify;
+
+	put_page_ctor(ctor);
+	return 0;
+}
+
+static void mp_put(struct mp_file *mfile);
+
+static int mp_release(struct socket *sock)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct mp_file *mfile = mp->mfile;
+
+	mp_put(mfile);
+	sock_put(mp->socket.sk);
+	put_net(mfile->net);
+
+	return 0;
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+	.sendmsg = mp_sendmsg,
+	.recvmsg = mp_recvmsg,
+	.release = mp_release,
+};
+
+static struct proto mp_proto = {
+	.name           = "mp",
+	.owner          = THIS_MODULE,
+	.obj_size       = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+	struct mp_file *mfile;
+	cycle_kernel_lock();
+	DBG1(KERN_INFO "mp: mp_chr_open\n");
+
+	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+	if (!mfile)
+		return -ENOMEM;
+	atomic_set(&mfile->count, 0);
+	mfile->mp = NULL;
+	mfile->net = get_net(current->nsproxy->net_ns);
+	file->private_data = mfile;
+	return 0;
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+	int up = 0;
+
+	mp->mfile = NULL;
+
+	/* stop the driver to clean all the user space buffers */
+	if (mp->dev->flags & IFF_UP) {
+		up = 1;
+		mp->dev->netdev_ops->ndo_stop(mp->dev);
+	}
+	page_ctor_detach(mp);
+	if (up)
+		mp->dev->netdev_ops->ndo_open(mp->dev);
+	/* Drop the extra count on the net device */
+	dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+	mutex_lock(&mp_mutex);
+	__mp_detach(mp);
+	mutex_unlock(&mp_mutex);
+}
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+	struct mp_struct *mp = NULL;
+	if (atomic_inc_not_zero(&mfile->count))
+		mp = mfile->mp;
+
+	return mp;
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+	if (atomic_dec_and_test(&mfile->count))
+		mp_detach(mfile->mp);
+}
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	int err;
+
+	netif_tx_lock_bh(mp->dev);
+
+	err = -EINVAL;
+
+	if (mfile->mp)
+		goto out;
+
+	err = -EBUSY;
+	if (mp->mfile)
+		goto out;
+
+	err = 0;
+	mfile->mp = mp;
+	mp->mfile = mfile;
+	mp->socket.file = file;
+	dev_hold(mp->dev);
+	sock_hold(mp->socket.sk);
+	atomic_inc(&mfile->count);
+
+out:
+	netif_tx_unlock_bh(mp->dev);
+	return err;
+}
+
+static void mp_sock_destruct(struct sock *sk)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	kfree(mp);
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+	struct mp_struct *mp = mp_get(mfile);
+
+	if (!mp)
+		return -EINVAL;
+
+	mp_detach(mp);
+	sock_put(mp->socket.sk);
+	mp_put(mfile);
+	return 0;
+}
+
+static void mp_sock_data_ready(struct sock *sk, int len)
+{
+	read_lock(&sk->sk_callback_lock);
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+	read_lock(&sk->sk_callback_lock);
+	if (sk_has_sleeper(sk))
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+	struct net_device *dev;
+	void __user* argp = (void __user *)arg;
+	struct ifreq ifr;
+	struct sock *sk;
+	int ret;
+
+	ret = -EINVAL;
+
+	switch (cmd) {
+	case MPASSTHRU_BINDDEV:
+		ret = -EFAULT;
+		if (copy_from_user(&ifr, argp, sizeof ifr))
+			break;
+
+		ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+		ret = -EBUSY;
+
+		if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
+			break;
+
+		ret = -ENODEV;
+		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+		if (!dev)
+			break;
+
+		mutex_lock(&mp_mutex);
+
+		ret = -EBUSY;
+		mp = mfile->mp;
+		if (mp)
+			goto out;
+
+		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+		if (!mp) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		mp->dev = dev;
+		ret = -ENOMEM;
+
+		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+		if (!sk)
+			goto err_free_mp;
+
+		init_waitqueue_head(&mp->socket.wait);
+		mp->socket.ops = &mp_socket_ops;
+		sock_init_data(&mp->socket, sk);
+		sk->sk_sndbuf = INT_MAX;
+		container_of(sk, struct mp_sock, sk)->mp = mp;
+
+		sk->sk_destruct = mp_sock_destruct;
+		sk->sk_data_ready = mp_sock_data_ready;
+		sk->sk_write_space = mp_sock_write_space;
+
+		ret = mp_attach(mp, file);
+		if (ret < 0)
+			goto err_free_sk;
+		ifr.ifr_flags |= IFF_MPASSTHRU_EXCL;
+		ret = page_ctor_attach(mp);
+out:
+		mutex_unlock(&mp_mutex);
+		break;
+err_free_sk:
+		sk_free(sk);
+err_free_mp:
+		kfree(mp);
+		goto out;
+
+	case MPASSTHRU_UNBINDDEV:
+		ret = do_unbind(mfile);
+		break;
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+static int count;
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp = mp_get(mfile);
+	struct sock *sk;
+	unsigned int mask = 0;
+
+	if (!mp)
+		return POLLERR;
+
+	sk = mp->socket.sk;
+
+	poll_wait(file, &mp->socket.wait, wait);
+
+	if (!skb_queue_empty(&sk->sk_receive_queue) || !count)
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(sk) ||
+		(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+			 sock_writeable(sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	if (mp->dev->reg_state != NETREG_REGISTERED)
+		mask = POLLERR;
+
+	mp_put(mfile);
+	return mask;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+
+	/*
+	 * Ignore return value since an error only means there was nothing to
+	 * do
+	 */
+	do_unbind(mfile);
+
+	put_net(mfile->net);
+	kfree(mfile);
+
+	return 0;
+}
+
+static const struct file_operations mp_fops = {
+	.owner  = THIS_MODULE,
+	.llseek = no_llseek,
+	.poll   = mp_chr_poll,
+	.unlocked_ioctl = mp_chr_ioctl,
+	.open   = mp_chr_open,
+	.release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+	.minor = MPASSTHRU_MINOR,
+	.name = "mp",
+	.nodename = "net/mp",
+	.fops = &mp_fops,
+};
+
+static int mp_init(void)
+{
+	int ret = 0;
+	ret = misc_register(&mp_miscdev);
+	if (ret)
+		printk(KERN_ERR "mp: Can't register misc device\n");
+	return ret;
+}
+
+void mp_cleanup(void)
+{
+	misc_deregister(&mp_miscdev);
+}
+
+/* Get an underlying socket object from mp file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+
+	if (file->f_op != &mp_fops)
+		return ERR_PTR(-EINVAL);
+	mp = mp_get(mfile);
+	if (!mp)
+		return ERR_PTR(-EBADFD);
+	mp_put(mfile);
+	return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_cleanup);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 8b5f7cc..8f5211e 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -31,6 +31,7 @@
 #define FUSE_MINOR		229
 #define KVM_MINOR		232
 #define VHOST_NET_MINOR		233
+#define MPASSTHRU_MINOR		234
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 0000000..7b71365
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,17 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV      _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV    _IOW('M', 214, int)
+
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU		0x0001
+#define IFF_MPASSTHRU_EXCL	0x0002
+
+struct socket *mp_get_socket(struct file *);
+
+#endif /* __MPASSTHRU_H */
-- 
1.5.4.4



* [PATCH 2/3] Provides multiple submits and asynchronous notifications.
  2010-02-10 11:48 ` [PATCH 1/3] A device for zero-copy based " Xin Xiaohui
@ 2010-02-10 11:48   ` Xin Xiaohui
  2010-02-10 11:49     ` [PATCH 3/3] Let host NIC driver to DMA to guest user space Xin Xiaohui
  2010-02-10 15:17   ` [PATCH 1/3] A device for zero-copy based on KVM virtio-net Eric Dumazet
  1 sibling, 1 reply; 11+ messages in thread
From: Xin Xiaohui @ 2010-02-10 11:48 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mingo, mst, jdike; +Cc: Xin Xiaohui

The vhost-net backend now supports only synchronous send/recv
operations. This patch provides multiple submits and asynchronous
notifications, which are needed for the zero-copy case.
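
In outline, an asynchronous backend completes a submitted buffer by
queueing a vhost_notifier on the virtqueue and kicking vhost. The
helper below is only a condensed sketch of what mpassthru's page_dtor()
already does in patch 1/3, not new code in this patch:

static void complete_async_tx(struct socket *sock,
			      struct vhost_virtqueue *vq,
			      struct vhost_notifier *vnotify)
{
	unsigned long flags;

	/* Hand the finished descriptor back; handle_tx() drains the list
	 * in handle_async_tx_events_notify() and adds it to the used ring. */
	spin_lock_irqsave(&vq->notify_lock, flags);
	list_add_tail(&vnotify->list, &vq->notifier);
	spin_unlock_irqrestore(&vq->notify_lock, flags);

	/* Wake the vhost TX handler so the guest gets signalled. */
	sock->sk->sk_write_space(sock->sk);
}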

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
---
 drivers/vhost/net.c   |  145 +++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/vhost/vhost.h |   23 ++++++++
 2 files changed, 164 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..8a85227 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -22,6 +22,7 @@
 #include <linux/if_packet.h>
 #include <linux/if_arp.h>
 #include <linux/if_tun.h>
+#include <linux/mpassthru.h>
 
 #include <net/sock.h>
 
@@ -91,6 +92,10 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 	net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+					struct vhost_virtqueue *vq);
+static void handle_async_tx_events_notify(struct vhost_net *net,
+					struct vhost_virtqueue *vq);
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +129,8 @@ static void handle_tx(struct vhost_net *net)
 		tx_poll_stop(net);
 	hdr_size = vq->hdr_size;
 
+	handle_async_tx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -151,6 +158,12 @@ static void handle_tx(struct vhost_net *net)
 		/* Skip header. TODO: support TSO. */
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+			vq->head = head;
+			msg.msg_control = (void *)vq;
+		}
+
 		len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
@@ -166,6 +179,10 @@ static void handle_tx(struct vhost_net *net)
 			tx_poll_start(net, sock);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		if (err != len)
 			pr_err("Truncated TX packet: "
 			       " len %d != %zd\n", err, len);
@@ -177,6 +194,8 @@ static void handle_tx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_tx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
@@ -206,7 +225,8 @@ static void handle_rx(struct vhost_net *net)
 	int err;
 	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+	if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+			vq->link_state == VHOST_VQ_LINK_SYNC))
 		return;
 
 	use_mm(net->dev.mm);
@@ -217,6 +237,8 @@ static void handle_rx(struct vhost_net *net)
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
+	handle_async_rx_events_notify(net, vq);
+
 	for (;;) {
 		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 					 ARRAY_SIZE(vq->iov),
@@ -245,6 +267,11 @@ static void handle_rx(struct vhost_net *net)
 		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+			vq->head = head;
+			vq->_log = log;
+			msg.msg_control = (void *)vq;
+		}
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
@@ -259,6 +286,10 @@ static void handle_rx(struct vhost_net *net)
 			vhost_discard_vq_desc(vq);
 			break;
 		}
+
+		if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+			continue;
+
 		/* TODO: Should check and handle checksum. */
 		if (err > len) {
 			pr_err("Discarded truncated rx packet: "
@@ -284,10 +315,83 @@ static void handle_rx(struct vhost_net *net)
 		}
 	}
 
+	handle_async_rx_events_notify(net, vq);
+
 	mutex_unlock(&vq->mutex);
 	unuse_mm(net->dev.mm);
 }
 
+struct vhost_notifier *notify_dequeue(struct vhost_virtqueue *vq)
+{
+	struct vhost_notifier *vnotify = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vq->notify_lock, flags);
+	if (!list_empty(&vq->notifier)) {
+		vnotify = list_first_entry(&vq->notifier,
+				struct vhost_notifier, list);
+		list_del(&vnotify->list);
+	}
+	spin_unlock_irqrestore(&vq->notify_lock, flags);
+	return vnotify;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+				struct vhost_virtqueue *vq)
+{
+	struct vhost_notifier *vnotify = NULL;
+	struct vhost_log *vq_log = NULL;
+	int rx_total_len = 0;
+	int log, size;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+	if (vq != &net->dev.vqs[VHOST_NET_VQ_RX])
+		return;
+
+	if (vq->receiver)
+		vq->receiver(vq);
+	vq_log = unlikely(vhost_has_feature(
+				&net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+	while ((vnotify = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				vnotify->head, vnotify->size);
+		log = vnotify->log;
+		size = vnotify->size;
+		rx_total_len += vnotify->size;
+		vnotify->dtor(vnotify);
+		if (unlikely(vq_log))
+			vhost_log_write(vq, vq_log, log, size);
+		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+		struct vhost_virtqueue *vq)
+{
+	struct vhost_notifier *vnotify = NULL;
+	int tx_total_len = 0;
+
+	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+		return;
+	if (vq != &net->dev.vqs[VHOST_NET_VQ_TX])
+		return;
+
+	while ((vnotify = notify_dequeue(vq)) != NULL) {
+		vhost_add_used_and_signal(&net->dev, vq,
+				vnotify->head, 0);
+		tx_total_len += vnotify->size;
+		vnotify->dtor(vnotify);
+		if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+			vhost_poll_queue(&vq->poll);
+			break;
+		}
+	}
+}
+
 static void handle_tx_kick(struct work_struct *work)
 {
 	struct vhost_virtqueue *vq;
@@ -462,7 +566,19 @@ static struct socket *get_tun_socket(int fd)
 	return sock;
 }
 
-static struct socket *get_socket(int fd)
+static struct socket *get_mp_socket(int fd)
+{
+	struct file *file = fget(fd);
+	struct socket *sock;
+	if (!file)
+		return ERR_PTR(-EBADF);
+	sock = mp_get_socket(file);
+	if (IS_ERR(sock))
+		fput(file);
+	return sock;
+}
+
+static struct socket *get_socket(struct vhost_virtqueue *vq, int fd)
 {
 	struct socket *sock;
 	if (fd == -1)
@@ -473,9 +589,26 @@ static struct socket *get_socket(int fd)
 	sock = get_tun_socket(fd);
 	if (!IS_ERR(sock))
 		return sock;
+	sock = get_mp_socket(fd);
+	if (!IS_ERR(sock)) {
+		vq->link_state = VHOST_VQ_LINK_ASYNC;
+		return sock;
+	}
 	return ERR_PTR(-ENOTSOCK);
 }
 
+static void vhost_init_link_state(struct vhost_net *n, int index)
+{
+	struct vhost_virtqueue *vq = n->vqs + index;
+
+	WARN_ON(!mutex_is_locked(&vq->mutex));
+	if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+		vq->receiver = NULL;
+		INIT_LIST_HEAD(&vq->notifier);
+		spin_lock_init(&vq->notify_lock);
+	}
+}
+
 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
@@ -493,12 +626,15 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	}
 	vq = n->vqs + index;
 	mutex_lock(&vq->mutex);
-	sock = get_socket(fd);
+	vq->link_state = VHOST_VQ_LINK_SYNC;
+	sock = get_socket(vq, fd);
 	if (IS_ERR(sock)) {
 		r = PTR_ERR(sock);
 		goto err;
 	}
 
+	vhost_init_link_state(n, index);
+
 	/* start polling new socket */
 	oldsock = vq->private_data;
 	if (sock == oldsock)
@@ -507,8 +643,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	vhost_net_disable_vq(n, vq);
 	rcu_assign_pointer(vq->private_data, sock);
 	vhost_net_enable_vq(n, vq);
-	mutex_unlock(&vq->mutex);
 done:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	if (oldsock) {
 		vhost_net_flush_vq(n, index);
@@ -516,6 +652,7 @@ done:
 	}
 	return r;
 err:
+	mutex_unlock(&vq->mutex);
 	mutex_unlock(&n->dev.mutex);
 	return r;
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index d1f0453..295bffa 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -43,6 +43,22 @@ struct vhost_log {
 	u64 len;
 };
 
+enum vhost_vq_link_state {
+	VHOST_VQ_LINK_SYNC = 	0,
+	VHOST_VQ_LINK_ASYNC = 	1,
+};
+
+/* The structure to notify the virtqueue for async socket */
+struct vhost_notifier {
+	struct list_head list;
+	struct vhost_virtqueue *vq;
+	int head;
+	int size;
+	int log;
+	void *ctrl;
+	void (*dtor)(struct vhost_notifier *);
+};
+
 /* The virtqueue structure describes a queue attached to a device. */
 struct vhost_virtqueue {
 	struct vhost_dev *dev;
@@ -96,6 +112,13 @@ struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log log[VHOST_NET_MAX_SG];
+	/* Differentiate async socket for 0-copy from normal */
+	enum vhost_vq_link_state link_state;
+	int head;
+	int _log;
+	struct list_head notifier;
+	spinlock_t notify_lock;
+	void (*receiver)(struct vhost_virtqueue *);
 };
 
 struct vhost_dev {
-- 
1.5.4.4



* [PATCH 3/3] Let host NIC driver to DMA to guest user space.
  2010-02-10 11:48   ` [PATCH 2/3] Provides multiple submits and asynchronous notifications Xin Xiaohui
@ 2010-02-10 11:49     ` Xin Xiaohui
  0 siblings, 0 replies; 11+ messages in thread
From: Xin Xiaohui @ 2010-02-10 11:49 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mingo, mst, jdike; +Cc: Xin Xiaohui, Zhao Yu

The patch lets the host NIC driver receive user space skbs, so the
driver has the chance to DMA directly into guest user space buffers
through a single ethX interface.
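
A NIC driver only needs to describe its packet-split receive layout
through the new ndo_page_ctor_prep hook for this to work. A minimal,
purely illustrative implementation (the driver name and the values are
made up) might look like:

static int foo_ndo_page_ctor_prep(struct net_device *dev,
				  struct netdev_page_ctor *ctor)
{
	/* Header bytes the driver keeps in its own buffer, plus how many
	 * whole pages of payload one descriptor can DMA into; the values
	 * must satisfy the checks in netdev_page_ctor_prep(). */
	ctor->hdr_len  = 128;
	ctor->data_len = PAGE_SIZE;
	ctor->npages   = 1;
	return 0;
}

/* wired up through the driver's net_device_ops:
 *	.ndo_page_ctor_prep = foo_ndo_page_ctor_prep,
 */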

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81@gmail.com>
Signed-off-by: Jeff Dike <jdike@c2.user-mode-linux.org>
---
 include/linux/netdevice.h |   72 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/skbuff.h    |   32 ++++++++++++++++++--
 net/core/dev.c            |   27 +++++++++++++++++
 net/core/skbuff.c         |   62 +++++++++++++++++++++++++++++++++++----
 4 files changed, 184 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..0de8688 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -486,6 +486,16 @@ struct netdev_queue {
 } ____cacheline_aligned_in_smp;
 
 
+struct netdev_page_ctor	{
+	int		hdr_len;
+	int		data_len;
+	int		npages;
+	unsigned	flags;
+	struct socket	*sock;
+	struct skb_user_page	*(*ctor)(struct netdev_page_ctor *,
+				struct sk_buff *, int);
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -636,6 +646,8 @@ struct net_device_ops {
 	int			(*ndo_fcoe_ddp_done)(struct net_device *dev,
 						     u16 xid);
 #endif
+	int			(*ndo_page_ctor_prep)(struct net_device *dev,
+						struct netdev_page_ctor *ctor);
 };
 
 /*
@@ -916,6 +928,7 @@ struct net_device
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
+	struct netdev_page_ctor		*page_ctor;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -2013,6 +2026,65 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
 		return 0;
 	return dev->ethtool_ops->get_flags(dev);
 }
+
+static inline int netdev_page_ctor_prep(struct net_device *dev,
+		struct netdev_page_ctor *ctor)
+{
+	int rc;
+	int npages, data_len;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	/* needed by packet split */
+	if (ops->ndo_page_ctor_prep) {
+		rc = ops->ndo_page_ctor_prep(dev, ctor);
+		if (rc)
+			return rc;
+	} else {  /* should be temp */
+		ctor->hdr_len = 128;
+		ctor->data_len = 2048;
+		ctor->npages = 1;
+	}
+
+	if (ctor->hdr_len <= 0)
+		goto err;
+
+	npages = ctor->npages;
+	data_len = ctor->data_len;
+	if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+			(data_len < PAGE_SIZE * (npages - 1) ||
+			 data_len > PAGE_SIZE * npages))
+		goto err;
+
+	return 0;
+err:
+	dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+	return -EINVAL;
+}
+
+static inline int netdev_page_ctor_attach(struct net_device *dev,
+		struct netdev_page_ctor *ctor)
+{
+	if (dev->flags & IFF_UP)
+		return -EBUSY;
+
+	if (rcu_dereference(dev->page_ctor))
+		return -EBUSY;
+
+	rcu_assign_pointer(dev->page_ctor, ctor);
+
+	return 0;
+}
+
+static inline void netdev_page_ctor_detach(struct net_device *dev)
+{
+	if (!rcu_dereference(dev->page_ctor))
+		return;
+
+	rcu_assign_pointer(dev->page_ctor, NULL);
+	synchronize_rcu();
+}
+
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..c77837e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
 	void *		destructor_arg;
 };
 
+struct skb_user_page {
+	u8              *start;
+	int             size;
+	struct skb_frag_struct *frags;
+	struct skb_shared_info *ushinfo;
+	void		(*dtor)(struct skb_user_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int fclone,
+				   int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
-	return __alloc_skb(size, priority, 0, -1);
+	return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, -1);
+	return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1509,6 +1517,24 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
 	__free_page(page);
 }
 
+extern struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+			struct sk_buff *skb, int npages);
+
+extern int netdev_use_ps_feature(struct net_device *dev);
+
+static inline struct skb_user_page *netdev_alloc_user_page(
+		struct net_device *dev,
+		struct sk_buff *skb, unsigned int size)
+{
+	struct skb_user_page *user;
+	int npages = (size < PAGE_SIZE) ? 1 : (size / PAGE_SIZE);
+
+	user = netdev_alloc_user_pages(dev, skb, npages);
+	if (likely(user))
+		return user;
+	return NULL;
+}
+
 /**
  *	skb_clone_writable - is the header of a clone writable
  *	@skb: buffer to check
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cf..9d2c2ba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2265,6 +2265,27 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
+static inline struct sk_buff *handle_user_space_buf(struct sk_buff *skb,
+					struct packet_type **pt_prev,
+					int *ret, struct net_device *orig_dev)
+{
+	struct netdev_page_ctor *ctor = NULL;
+	struct sock *sk = NULL;
+
+	if (skb->dev)
+		ctor = skb->dev->page_ctor;
+	if (!ctor)
+		return skb;
+
+	sk = ctor->sock->sk;
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	sk->sk_data_ready(sk, skb->len);
+	return NULL;
+}
+
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -2342,6 +2363,9 @@ int netif_receive_skb(struct sk_buff *skb)
 		goto out;
 ncls:
 #endif
+	skb = handle_user_space_buf(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
 
 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
@@ -2455,6 +2479,9 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (skb_is_gso(skb) || skb_has_frags(skb))
 		goto normal;
 
+	if (skb->dev && skb->dev->page_ctor)
+		goto normal;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 80a9616..40461d5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -170,12 +170,13 @@ EXPORT_SYMBOL(skb_under_panic);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int fclone, int node, struct net_device *dev)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	struct skb_user_page *user = NULL;
 
 	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +186,22 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		goto out;
 
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+
+	if (!dev || !dev->page_ctor) { /* Legacy alloc func */
+		data = kmalloc_node_track_caller(
+				size + sizeof(struct skb_shared_info),
+				gfp_mask, node);
+	} else { /* Allocation may come from the device's page constructor */
+		user = netdev_alloc_user_page(dev, skb, size);
+		if (!user)
+			data = kmalloc_node_track_caller(
+				size + sizeof(struct skb_shared_info),
+				gfp_mask, node);
+		else {
+			data = user->start;
+			size = SKB_DATA_ALIGN(user->size);
+		}
+	}
 	if (!data)
 		goto nodata;
 
@@ -208,6 +223,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	skb->mac_header = ~0U;
 #endif
 
+	if (user)
+		memcpy(user->ushinfo, skb_shinfo(skb),
+				sizeof(struct skb_shared_info));
+
 	/* make sure we initialize shinfo sequentially */
 	shinfo = skb_shinfo(skb);
 	atomic_set(&shinfo->dataref, 1);
@@ -231,6 +250,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
+
+	shinfo->destructor_arg = user;
+
 out:
 	return skb;
 nodata:
@@ -259,7 +281,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node, dev);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -278,6 +300,27 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_user_page *netdev_alloc_user_pages(struct net_device *dev,
+			struct sk_buff *skb, int npages)
+{
+	struct netdev_page_ctor *ctor;
+	struct skb_user_page *user = NULL;
+
+	rcu_read_lock();
+	ctor = rcu_dereference(dev->page_ctor);
+	if (!ctor)
+		goto out;
+
+	BUG_ON(npages > ctor->npages);
+
+	user = ctor->ctor(ctor, skb, npages);
+out:
+	rcu_read_unlock();
+
+	return user;
+}
+EXPORT_SYMBOL(netdev_alloc_user_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		int size)
 {
@@ -338,6 +381,8 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+	struct skb_user_page *user = skb_shinfo(skb)->destructor_arg;
+
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
@@ -349,7 +394,8 @@ static void skb_release_data(struct sk_buff *skb)
 
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
-
+		if (skb->dev && skb->dev->page_ctor && user && user->dtor)
+			user->dtor(user);
 		kfree(skb->head);
 	}
 }
@@ -503,8 +549,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (skb_shared(skb) || skb_cloned(skb))
 		return 0;
 
-	skb_release_head_state(skb);
+	if (skb->dev && skb->dev->page_ctor)
+		return 0;
+
 	shinfo = skb_shinfo(skb);
+
+	skb_release_head_state(skb);
 	atomic_set(&shinfo->dataref, 1);
 	shinfo->nr_frags = 0;
 	shinfo->gso_size = 0;
-- 
1.5.4.4



* Re: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
  2010-02-10 11:48 [PATCH 0/3] Provide a zero-copy method on KVM virtio-net Xin Xiaohui
  2010-02-10 11:48 ` [PATCH 1/3] A device for zero-copy based " Xin Xiaohui
@ 2010-02-10 13:40 ` Arnd Bergmann
  2010-02-11  7:40   ` Xin, Xiaohui
  2010-02-11  8:54   ` Xin, Xiaohui
  2 siblings, 1 reply; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-10 13:40 UTC (permalink / raw)
  To: Xin Xiaohui; +Cc: netdev, kvm, linux-kernel, mingo, mst, jdike

On Wednesday 10 February 2010, Xin Xiaohui wrote:
> The idea is simple: pin the guest VM user space buffers and then
> let the host NIC driver have the chance to DMA directly into them.
> The patches are based on the vhost-net backend driver. We add a device
> which provides proto_ops such as sendmsg/recvmsg to vhost-net to
> send/recv directly to/from the NIC driver. A KVM guest that uses the
> vhost-net backend may bind any ethX interface on the host side to
> get copy-less data transfer through the guest virtio-net frontend.
> 
> We also provide multiple submits and asynchronous notification to
> vhost-net.

This does a lot of things that I had planned for macvtap. It's
great to hear that you have made this much progress.

However, I'd hope that we could combine this with the macvtap driver,
which would give us zero-copy transfer capability both with and
without vhost, as well as (tx at least) when using multiple guests
on a macvlan setup.

For transmit, it should be fairly straightforward to hook up
your zero-copy method and the vhost-net interface into the
macvtap driver.

You have simplified the receive path significantly by assuming
that the entire netdev can receive into a single guest, right?
I'm assuming that the idea is to allow VMDq adapters to simply
show up as separate adapters and have the driver handle this
in a hardware specific way.
My plan for this was to instead move support for VMDq into the
macvlan driver so we can transparently use VMDq on hardware where
available, including zero-copy receives, but fall back to software
operation on non-VMDq hardware.

	Arnd


* Re: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
  2010-02-10 11:48 ` [PATCH 1/3] A device for zero-copy based " Xin Xiaohui
  2010-02-10 11:48   ` [PATCH 2/3] Provides multiple submits and asynchronous notifications Xin Xiaohui
@ 2010-02-10 15:17   ` Eric Dumazet
  2010-02-11  5:33     ` Xin, Xiaohui
  1 sibling, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2010-02-10 15:17 UTC (permalink / raw)
  To: Xin Xiaohui; +Cc: netdev, kvm, linux-kernel, mingo, mst, jdike, Zhao Yu

On Wednesday 10 February 2010 at 19:48 +0800, Xin Xiaohui wrote:
> Add a device to utilize the vhost-net backend driver for
> copy-less data transfer between the guest FE and the host NIC.
> It pins the guest user space buffers into host memory and
> provides proto_ops such as sendmsg/recvmsg to vhost-net.
> 
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Zhao Yu <yzhao81@gmail.com>
> Signed-off-by: Jeff Dike <jdike@c2.user-mode-linux.org>


> +static int page_ctor_attach(struct mp_struct *mp)
> +{
> +	int rc;
> +	struct page_ctor *ctor;
> +	struct net_device *dev = mp->dev;
> +
> +	rcu_read_lock();
> +	if (rcu_dereference(mp->ctor)) {
> +		rcu_read_unlock();
> +		return -EBUSY;
> +	}
> +	rcu_read_unlock();

Strange read locking here, for an obvious writer role.
What do you really want to do?
If writers are serialized by mp_mutex, you don't need this
rcu_read_lock()/rcu_read_unlock() stuff.

> +
> +	ctor = kzalloc(sizeof(*ctor), GFP_KERNEL);
> +	if (!ctor)
> +		return -ENOMEM;
> +	rc = netdev_page_ctor_prep(dev, &ctor->ctor);
> +	if (rc)
> +		goto fail;
> +
> +	ctor->cache = kmem_cache_create("skb_page_info",
> +			sizeof(struct page_info), 0,
> +			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);


SLAB_PANIC here means: crash the whole system in case of error.
This is not what you want in a driver.

> +
> +	if (!ctor->cache)
> +		goto cache_fail;
> +
> +	INIT_LIST_HEAD(&ctor->readq);
> +	spin_lock_init(&ctor->read_lock);
> +
> +	ctor->w_len = 0;
> +	ctor->r_len = 0;
> +
> +	dev_hold(dev);
> +	ctor->dev = dev;
> +	ctor->ctor.ctor = page_ctor;
> +	ctor->ctor.sock = &mp->socket;
> +	atomic_set(&ctor->refcnt, 1);
> +
> +	rc = netdev_page_ctor_attach(dev, &ctor->ctor);
> +	if (rc)
> +		goto fail;
> +
> +	/* locked by mp_mutex */
> +	rcu_assign_pointer(mp->ctor, ctor);
> +
> +	/* XXX:Need we do set_offload here ? */
> +
> +	return 0;
> +
> +fail:
> +	kmem_cache_destroy(ctor->cache);
> +cache_fail:
> +	kfree(ctor);
> +	dev_put(dev);
> +
> +	return rc;
> +}
> +
> +
> +static inline void get_page_ctor(struct page_ctor *ctor)
> +{
> +       atomic_inc(&ctor->refcnt);
> +}
> +
> +static inline void put_page_ctor(struct page_ctor *ctor)
> +{
> +	if (atomic_dec_and_test(&ctor->refcnt))
> +		kfree(ctor);

Are you sure an RCU grace period is not needed before freeing?
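
One way to guarantee a grace period is call_rcu() (a sketch; it assumes
struct page_ctor gains a struct rcu_head member named rcu):

static void page_ctor_free_rcu(struct rcu_head *head)
{
	struct page_ctor *ctor = container_of(head, struct page_ctor, rcu);

	kfree(ctor);
}

static inline void put_page_ctor(struct page_ctor *ctor)
{
	if (atomic_dec_and_test(&ctor->refcnt))
		call_rcu(&ctor->rcu, page_ctor_free_rcu);
}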


> +
> +static int page_ctor_detach(struct mp_struct *mp)
> +{
> +	struct page_ctor *ctor;
> +	struct page_info *info;
> +	int i;
> +
> +	rcu_read_lock();
> +	ctor = rcu_dereference(mp->ctor);
> +	rcu_read_unlock();

Strange locking again here


> +
> +	if (!ctor)
> +		return -ENODEV;
> +
> +	while ((info = info_dequeue(ctor))) {
> +		for (i = 0; i < info->pnum; i++)
> +			if (info->pages[i])
> +				put_page(info->pages[i]);
> +		kmem_cache_free(ctor->cache, info);
> +	}
> +	kmem_cache_destroy(ctor->cache);
> +	netdev_page_ctor_detach(ctor->dev);
> +	dev_put(ctor->dev);
> +
> +	/* locked by mp_mutex */
> +	rcu_assign_pointer(mp->ctor, NULL);
> +	synchronize_rcu();
> +
> +	put_page_ctor(ctor);
> +
> +	return 0;
> +}
> +
> +/* For small user space buffers transmit, we don't need to call
> + * get_user_pages().
> + */
> +static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
> +		int total)
> +{
> +	struct page_info *info = kmem_cache_alloc(ctor->cache, GFP_KERNEL);
kmem_cache_zalloc()? (see the sketch after this function)
> +
> +	if (!info)
> +		return NULL;
> +	memset(info, 0, sizeof(struct page_info));
> +	memset(info->pages, 0, sizeof(info->pages));

Redundant memset(): the whole structure was already cleared one line above.

> +
> +	info->header = 0;
already cleared
> +	info->total = total;
> +	info->skb = NULL;
already cleared 
> 
> +	info->user.dtor = page_dtor;
> +	info->ctor = ctor;
> +	info->flags = INFO_WRITE;
> +	info->pnum = 0;
already cleared 
> 
> +	return info;
> +}
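
Putting the kmem_cache_zalloc() suggestion together, the function could
shrink to something like this (a sketch, using the field names from the
quoted patch):

static struct page_info *alloc_small_page_info(struct page_ctor *ctor,
		int total)
{
	/* zalloc clears the whole struct, including info->pages[] */
	struct page_info *info = kmem_cache_zalloc(ctor->cache, GFP_KERNEL);

	if (!info)
		return NULL;

	/* only the non-zero fields need to be set explicitly */
	info->total = total;
	info->user.dtor = page_dtor;
	info->ctor = ctor;
	info->flags = INFO_WRITE;

	return info;
}
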
> +
> +/* The main function to transform the guest user space address
> + * to host kernel address via get_user_pages(). Thus the hardware
> + * can do DMA directly to the user space address.
> + */
> +static struct page_info *alloc_page_info(struct page_ctor *ctor,
> +			struct iovec *iov, int count, struct frag *frags,
> +			int npages, int total)
> +{
> +	int rc;
> +	int i, j, n = 0;
> +	int len;
> +	unsigned long base;
> +	struct page_info *info = kmem_cache_alloc(ctor->cache, GFP_KERNEL);
kmem_cache_zalloc() ? 
> 
> +
> +	if (!info)
> +		return NULL;
> +	memset(info, 0, sizeof(struct page_info));
kmem_cache_zalloc() ?
> +	memset(info->pages, 0, sizeof(info->pages));
already cleared 
> 
> +
> +	down_read(&current->mm->mmap_sem);
> +	for (i = j = 0; i < count; i++) {
> +		base = (unsigned long)iov[i].iov_base;
> +		len = iov[i].iov_len;
> +
> +		if (!len)
> +			continue;
> +		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> +
> +		rc = get_user_pages(current, current->mm, base, n,
> +				npages ? 1 : 0, 0, &info->pages[j], NULL);
> +		if (rc != n) {
> +			up_read(&current->mm->mmap_sem);
> +			goto failed;
> +		}
> +
> +		while (n--) {
> +			frags[j].offset = base & ~PAGE_MASK;
> +			frags[j].size = min_t(int, len,
> +					PAGE_SIZE - frags[j].offset);
> +			len -= frags[j].size;
> +			base += frags[j].size;
> +			j++;
> +		}
> +	}
> +	up_read(&current->mm->mmap_sem);
> +
> +#ifdef CONFIG_HIGHMEM
> +	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
> +		for (i = 0; i < j; i++) {
> +			if (PageHighMem(info->pages[i]))
> +				goto failed;
> +		}
> +	}
> +#endif
> +
> +	info->header = 0;
> +	info->total = total;
> +	info->skb = NULL;
> +	info->user.dtor = page_dtor;
> +	info->ctor = ctor;
> +	info->pnum = j;
> +
> +	if (!npages)
> +		info->flags = INFO_WRITE;
> +	if (info->flags == INFO_READ) {
> +		info->user.start = (u8 *)(((unsigned long)
> +				(pfn_to_kaddr(page_to_pfn(info->pages[0]))) +
> +				frags[0].offset) - NET_IP_ALIGN - NET_SKB_PAD);
> +		info->user.size = iov[0].iov_len + NET_IP_ALIGN + NET_SKB_PAD;
> +	}
> +	return info;
> +
> +failed:
> +	for (i = 0; i < j; i++)
> +		put_page(info->pages[i]);
> +
> +	kmem_cache_free(ctor->cache, info);
> +
> +	return NULL;
> +}
> +
> +struct page_ctor *mp_rcu_get_ctor(struct page_ctor *ctor)
> +{
> +	struct page_ctor *_ctor = NULL;
> +
> +	rcu_read_lock();
> +	_ctor = rcu_dereference(ctor);
> +	rcu_read_unlock();
Strange locking. After rcu_read_unlock() you have no guarantee that _ctor
points to something that has not been freed.
> +
> +	if (!_ctor) {
> +		DBG(KERN_INFO "Device %s cannot do mediate passthru.\n",
> +				ctor->dev->name);
> +		return NULL;
> +	}
> +	if (_ctor)
redundant test
> +		get_page_ctor(_ctor);
> +	return _ctor;
> +}
> +

I stopped my review at this point. Please check your RCU usage. It is
not sufficient to hold the RCU read lock just to fetch the pointer; you
must also hold the lock while using the object itself, or take a reference
on the object before releasing the RCU lock, to make sure the object won't
disappear under you...

for example:

rcu_read_lock();
ptr = rcu_dereference(...);
if (ptr)
	atomic_inc(&ptr->refcnt);
rcu_read_unlock();
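
Applied to the helper above, the pattern could look like this (a sketch; it
assumes the function takes the mp_struct, so that the RCU-protected mp->ctor
pointer is what gets dereferenced):

struct page_ctor *mp_rcu_get_ctor(struct mp_struct *mp)
{
	struct page_ctor *ctor;

	rcu_read_lock();
	ctor = rcu_dereference(mp->ctor);
	if (ctor)
		get_page_ctor(ctor);	/* take the reference before dropping the RCU lock */
	rcu_read_unlock();

	return ctor;
}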







^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 1/3] A device for zero-copy based on KVM virtio-net.
  2010-02-10 15:17   ` [PATCH 1/3] A device for zero-copy based on KVM virtio-net Eric Dumazet
@ 2010-02-11  5:33     ` Xin, Xiaohui
  0 siblings, 0 replies; 11+ messages in thread
From: Xin, Xiaohui @ 2010-02-11  5:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, kvm, linux-kernel, mingo, mst, jdike, Zhao Yu

Eric,
Thanks. I will look into that. But don't stop there. 
Please comment more. :-)

Thanks
Xiaohui

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
  2010-02-10 13:40 ` [PATCH 0/3] Provide a zero-copy method " Arnd Bergmann
@ 2010-02-11  7:40   ` Xin, Xiaohui
  2010-02-11 13:25     ` Arnd Bergmann
  0 siblings, 1 reply; 11+ messages in thread
From: Xin, Xiaohui @ 2010-02-11  7:40 UTC (permalink / raw)
  To: Arnd Bergmann; +Cc: netdev, kvm, linux-kernel, mingo, mst, jdike

>>On Wednesday 10 February 2010, Xin Xiaohui wrote:
> >The idea is simple, just to pin the guest VM user space and then
> >let the host NIC driver have the chance to DMA directly to it.
> >The patches are based on the vhost-net backend driver. We add a device
> >which provides proto_ops as sendmsg/recvmsg to vhost-net to
> >send/recv directly to/from the NIC driver. A KVM guest that uses the
> >vhost-net backend may bind any ethX interface on the host side to
> >get copyless data transfer through the guest virtio-net frontend.
>> 
>> We provide multiple submits and asynchronous notification to
> >vhost-net too.

>This does a lot of things that I had planned for macvtap. It's
>great to hear that you have made this much progress.
>
>However, I'd hope that we could combine this with the macvtap driver,
>which would give us zero-copy transfer capability both with and
>without vhost, as well as (tx at least) when using multiple guests
>on a macvlan setup.

You mean the zero-copy can work with the macvtap driver without vhost?
Could you give me some detailed info about your macvtap driver and the
relationship between vhost and macvtap, so that I can get a clear picture?

>For transmit, it should be fairly straightforward to hook up
>your zero-copy method and the vhost-net interface into the
>macvtap driver.
>
>You have simplified the receive path significantly by assuming
>that the entire netdev can receive into a single guest, right?

Yes.

>I'm assuming that the idea is to allow VMDq adapters to simply
>show up as separate adapters and have the driver handle this
>in a hardware specific way.

Does the VMDq driver do so now?

>My plan for this was to instead move support for VMDq into the
>macvlan driver so we can transparently use VMDq on hardware where
>available, including zero-copy receives, but fall back to software
>operation on non-VMDq hardware.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
  2010-02-10 11:48 [PATCH 0/3] Provide a zero-copy method on KVM virtio-net Xin Xiaohui
@ 2010-02-11  8:54   ` Xin, Xiaohui
  2010-02-10 13:40 ` [PATCH 0/3] Provide a zero-copy method " Arnd Bergmann
  2010-02-11  8:54   ` Xin, Xiaohui
  2 siblings, 0 replies; 11+ messages in thread
From: Xin, Xiaohui @ 2010-02-11  8:54 UTC (permalink / raw)
  To: Xin, Xiaohui, netdev, kvm, linux-kernel, mingo, mst, jdike

I will be on vacation during 2/13~2/20, so email replies to your comments may
be very slow or missing. But please don't hesitate to comment more, and I will
address them after the vacation. :-)

Thanks
Xiaohui
-----Original Message-----
From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On Behalf Of Xin Xiaohui
Sent: Wednesday, February 10, 2010 7:49 PM
To: netdev@vger.kernel.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; mst@redhat.com; jdike@c2.user-mode-linux.org
Subject: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.

The idea is simple, just to pin the guest VM user space and then
let the host NIC driver have the chance to DMA directly to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer through the guest virtio-net frontend.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found that both bandwidth and CPU % go up,
but the bandwidth increase is much larger than the CPU % increase.

What we have not done yet:
	To support GRO
	Performance tuning
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
@ 2010-02-11  8:54   ` Xin, Xiaohui
  0 siblings, 0 replies; 11+ messages in thread
From: Xin, Xiaohui @ 2010-02-11  8:54 UTC (permalink / raw)
  To: Xin, Xiaohui, netdev, kvm, linux-kernel@vger.kernel.org

I will be on vacation during 2/13~2/20, so email replies to your comments may
be very slow or missing. But please don't hesitate to comment more, and I will
address them after the vacation. :-)

Thanks
Xiaohui
-----Original Message-----
From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On Behalf Of Xin Xiaohui
Sent: Wednesday, February 10, 2010 7:49 PM
To: netdev@vger.kernel.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; mst@redhat.com; jdike@c2.user-mode-linux.org
Subject: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.

The idea is simple, just to pin the guest VM user space and then
let the host NIC driver have the chance to DMA directly to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer through the guest virtio-net frontend.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found that both bandwidth and CPU % go up,
but the bandwidth increase is much larger than the CPU % increase.

What we have not done yet:
	To support GRO
	Performance tuning
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Provide a zero-copy method on KVM virtio-net.
  2010-02-11  7:40   ` Xin, Xiaohui
@ 2010-02-11 13:25     ` Arnd Bergmann
  0 siblings, 0 replies; 11+ messages in thread
From: Arnd Bergmann @ 2010-02-11 13:25 UTC (permalink / raw)
  To: Xin, Xiaohui; +Cc: netdev, kvm, linux-kernel, mingo, mst, jdike

On Thursday 11 February 2010, Xin, Xiaohui wrote:
> >This does a lot of things that I had planned for macvtap. It's
> >great to hear that you have made this much progress.
> >
> >However, I'd hope that we could combine this with the macvtap driver,
> >which would give us zero-copy transfer capability both with and
> >without vhost, as well as (tx at least) when using multiple guests
> >on a macvlan setup.
> 
> You mean the zero-copy can work with the macvtap driver without vhost?
> Could you give me some detailed info about your macvtap driver and the
> relationship between vhost and macvtap, so that I can get a clear picture?

macvtap provides a user interface that is largely compatible with
the tun/tap driver, and can be used in place of that from qemu.
Vhost-net currently interfaces with tun/tap, but not yet with macvtap,
which is easy enough to add and already on my list.

The underlying code is macvlan, which is a driver that virtualizes
network adapters in software, giving you multiple net_device instances
for a real NIC, each of them with their own MAC address.

In order to do zero-copy transmit with macvtap, the idea is to
add a nonblocking version of the aio_write() function that works
a lot like your transmit function.

For receive, the hardware does not currently know which guest
is supposed to get any frame coming in from the outside. Adding
zero-copy receive requires interaction with the device driver
and hardware capabilities to separate traffic by inbound MAC
address into separate buffers per VM.

> >I'm assuming that the idea is to allow VMDq adapters to simply
> >show up as separate adapters and have the driver handle this
> >in a hardware specific way.
> 
> Does the VMDq driver do so now?

I don't think anyone has published a VMDq capable driver so far.
I was just assuming that you were working on one.

	Arnd

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2010-02-11 13:25 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-10 11:48 [PATCH 0/3] Provide a zero-copy method on KVM virtio-net Xin Xiaohui
2010-02-10 11:48 ` [PATCH 1/3] A device for zero-copy based " Xin Xiaohui
2010-02-10 11:48   ` [PATCH 2/3] Provides multiple submits and asynchronous notifications Xin Xiaohui
2010-02-10 11:49     ` [PATCH 3/3] Let host NIC driver to DMA to guest user space Xin Xiaohui
2010-02-10 15:17   ` [PATCH 1/3] A device for zero-copy based on KVM virtio-net Eric Dumazet
2010-02-11  5:33     ` Xin, Xiaohui
2010-02-10 13:40 ` [PATCH 0/3] Provide a zero-copy method " Arnd Bergmann
2010-02-11  7:40   ` Xin, Xiaohui
2010-02-11 13:25     ` Arnd Bergmann
2010-02-11  8:54 ` Xin, Xiaohui
2010-02-11  8:54   ` Xin, Xiaohui
