* [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
@ 2009-11-20  6:15 Shirley Ma
  2009-11-20  6:19 ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2009-11-20  6:15 UTC (permalink / raw)
  To: Michael S. Tsirkin, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

This patch is generated against the 2.6 git tree. I didn't break it up
since it implements a single piece of functionality. Please review it.

Thanks
Shirley

Signed-off-by: Shirley Ma <xma@us.ibm.com>
------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9e002f..6fb788b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,33 +80,48 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 	return (struct skb_vnet_hdr *)skb->cb;
 }
 
-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
 {
-	page->private = (unsigned long)vi->pages;
+	struct page *npage = (struct page *)page->private;
+
+	if (!npage)
+		page->private = (unsigned long)vi->pages;
+	else {
+		/* give a page list */
+		while (npage) {
+			if (npage->private == (unsigned long)0) {
+				npage->private = (unsigned long)vi->pages;
+				break;
+			}
+			npage = (struct page *)npage->private;
+		}
+	}
 	vi->pages = page;
 }
 
-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
-{
-	unsigned int i;
-
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		give_a_page(vi, skb_shinfo(skb)->frags[i].page);
-	skb_shinfo(skb)->nr_frags = 0;
-	skb->data_len = 0;
-}
-
 static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 {
 	struct page *p = vi->pages;
 
-	if (p)
+	if (p) {
 		vi->pages = (struct page *)p->private;
-	else
+		/* use private to chain big packets */
+		p->private = (unsigned long)0;
+	} else
 		p = alloc_page(gfp_mask);
 	return p;
 }
 
+void virtio_free_pages(void *buf)
+{
+	struct page *page = (struct page *)buf;
+
+	while (page) {
+		__free_pages(page, 0);
+		page = (struct page *)page->private;
+	}
+}
+
 static void skb_xmit_done(struct virtqueue *svq)
 {
 	struct virtnet_info *vi = svq->vdev->priv;
@@ -118,12 +133,36 @@ static void skb_xmit_done(struct virtqueue *svq)
 	netif_wake_queue(vi->dev);
 }
 
-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
+static int set_skb_frags(struct sk_buff *skb, struct page *page,
+				int offset, int len)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+	skb_frag_t *f;
+
+	i = skb_shinfo(skb)->nr_frags;
+	f = &skb_shinfo(skb)->frags[i];
+	f->page = page;
+	f->page_offset = offset;
+
+	if (len > (PAGE_SIZE - f->page_offset))
+		f->size = PAGE_SIZE - f->page_offset;
+	else
+		f->size = len;
+
+	skb_shinfo(skb)->nr_frags++;
+	skb->data_len += f->size;
+	skb->len += f->size;
+
+	len -= f->size;
+	return len;
+}
+
+static void receive_skb(struct net_device *dev, void *buf,
 			unsigned len)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
-	int err;
+	struct skb_vnet_hdr *hdr;
+	struct sk_buff *skb;
 	int i;
 
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
@@ -132,39 +171,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 		goto drop;
 	}
 
-	if (vi->mergeable_rx_bufs) {
-		unsigned int copy;
-		char *p = page_address(skb_shinfo(skb)->frags[0].page);
+	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+		skb = (struct sk_buff *)buf;
+
+		__skb_unlink(skb, &vi->recv);
+
+		hdr = skb_vnet_hdr(skb);
+		len -= sizeof(hdr->hdr);
+		skb_trim(skb, len);
+	} else {
+		struct page *page = (struct page *)buf;
+		int copy, hdr_len, num_buf, offset;
+		char *p;
+
+		p = page_address(page);
 
-		if (len > PAGE_SIZE)
-			len = PAGE_SIZE;
-		len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
+		if (unlikely(!skb)) {
+			dev->stats.rx_dropped++;
+			return;
+		}
+		skb_reserve(skb, NET_IP_ALIGN);
+		hdr = skb_vnet_hdr(skb);
 
-		memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
-		p += sizeof(hdr->mhdr);
+		if (vi->mergeable_rx_bufs) {
+			hdr_len = sizeof(hdr->mhdr);
+			memcpy(&hdr->mhdr, p, hdr_len);
+			num_buf = hdr->mhdr.num_buffers;
+			offset = hdr_len;
+			if (len > PAGE_SIZE)
+				len = PAGE_SIZE;
+		} else {
+			/* big packtes 6 bytes alignment between virtio_net
+			 * header and data */
+			hdr_len = sizeof(hdr->hdr);
+			memcpy(&hdr->hdr, p, hdr_len);
+			offset = hdr_len + 6;
+		}
+
+		p += offset;
 
+		len -= hdr_len;
 		copy = len;
 		if (copy > skb_tailroom(skb))
 			copy = skb_tailroom(skb);
-
 		memcpy(skb_put(skb, copy), p, copy);
 
 		len -= copy;
 
-		if (!len) {
-			give_a_page(vi, skb_shinfo(skb)->frags[0].page);
-			skb_shinfo(skb)->nr_frags--;
-		} else {
-			skb_shinfo(skb)->frags[0].page_offset +=
-				sizeof(hdr->mhdr) + copy;
-			skb_shinfo(skb)->frags[0].size = len;
-			skb->data_len += len;
-			skb->len += len;
+		if (!len)
+			give_pages(vi, page);
+		else {
+			len = set_skb_frags(skb, page, copy + offset, len);
+			/* process big packets */
+			while (len > 0) {
+				page = (struct page *)page->private;
+				if (!page)
+					break;
+				len = set_skb_frags(skb, page, 0, len);
+			}
+			if (page && page->private)
+				give_pages(vi, (struct page *)page->private);
 		}
 
-		while (--hdr->mhdr.num_buffers) {
-			struct sk_buff *nskb;
-
+		/* process mergeable buffers */
+		while (vi->mergeable_rx_bufs && --num_buf) {
 			i = skb_shinfo(skb)->nr_frags;
 			if (i >= MAX_SKB_FRAGS) {
 				pr_debug("%s: packet too long %d\n", dev->name,
@@ -173,41 +244,20 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 				goto drop;
 			}
 
-			nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
-			if (!nskb) {
+			page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+			if (!page) {
 				pr_debug("%s: rx error: %d buffers missing\n",
 					 dev->name, hdr->mhdr.num_buffers);
 				dev->stats.rx_length_errors++;
 				goto drop;
 			}
 
-			__skb_unlink(nskb, &vi->recv);
-			vi->num--;
-
-			skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
-			skb_shinfo(nskb)->nr_frags = 0;
-			kfree_skb(nskb);
-
 			if (len > PAGE_SIZE)
 				len = PAGE_SIZE;
 
-			skb_shinfo(skb)->frags[i].size = len;
-			skb_shinfo(skb)->nr_frags++;
-			skb->data_len += len;
-			skb->len += len;
-		}
-	} else {
-		len -= sizeof(hdr->hdr);
-
-		if (len <= MAX_PACKET_LEN)
-			trim_pages(vi, skb);
+			set_skb_frags(skb, page, 0, len);
 
-		err = pskb_trim(skb, len);
-		if (err) {
-			pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
-				 len, err);
-			dev->stats.rx_dropped++;
-			goto drop;
+			vi->num--;
 		}
 	}
 
@@ -271,107 +321,105 @@ drop:
 	dev_kfree_skb(skb);
 }
 
-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 {
-	struct sk_buff *skb;
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
-	int num, err, i;
+	int err = 0;
 	bool oom = false;
 
 	sg_init_table(sg, 2+MAX_SKB_FRAGS);
 	do {
-		struct skb_vnet_hdr *hdr;
-
-		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb)) {
-			oom = true;
-			break;
-		}
-
-		skb_reserve(skb, NET_IP_ALIGN);
-		skb_put(skb, MAX_PACKET_LEN);
-
-		hdr = skb_vnet_hdr(skb);
-		sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
-
-		if (vi->big_packets) {
-			for (i = 0; i < MAX_SKB_FRAGS; i++) {
-				skb_frag_t *f = &skb_shinfo(skb)->frags[i];
-				f->page = get_a_page(vi, gfp);
-				if (!f->page)
-					break;
-
-				f->page_offset = 0;
-				f->size = PAGE_SIZE;
-
-				skb->data_len += PAGE_SIZE;
-				skb->len += PAGE_SIZE;
-
-				skb_shinfo(skb)->nr_frags++;
+		/* allocate skb for MAX_PACKET_LEN len */
+		if (!vi->big_packets && !vi->mergeable_rx_bufs) {
+			struct skb_vnet_hdr *hdr;
+			struct sk_buff *skb;
+
+			skb = netdev_alloc_skb(vi->dev,
+					       MAX_PACKET_LEN + NET_IP_ALIGN);
+			if (unlikely(!skb)) {
+				oom = true;
+				break;
 			}
-		}
-
-		num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
-		skb_queue_head(&vi->recv, skb);
-
-		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
-		if (err < 0) {
-			skb_unlink(skb, &vi->recv);
-			trim_pages(vi, skb);
-			kfree_skb(skb);
-			break;
-		}
-		vi->num++;
-	} while (err >= num);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	vi->rvq->vq_ops->kick(vi->rvq);
-	return !oom;
-}
-
-/* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
-{
-	struct sk_buff *skb;
-	struct scatterlist sg[1];
-	int err;
-	bool oom = false;
 
-	if (!vi->mergeable_rx_bufs)
-		return try_fill_recv_maxbufs(vi, gfp);
+			skb_reserve(skb, NET_IP_ALIGN);
+			skb_put(skb, MAX_PACKET_LEN);
 
-	do {
-		skb_frag_t *f;
+			hdr = skb_vnet_hdr(skb);
+			sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
 
-		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb)) {
-			oom = true;
-			break;
-		}
-
-		skb_reserve(skb, NET_IP_ALIGN);
-
-		f = &skb_shinfo(skb)->frags[0];
-		f->page = get_a_page(vi, gfp);
-		if (!f->page) {
-			oom = true;
-			kfree_skb(skb);
-			break;
-		}
+			skb_to_sgvec(skb, sg+1, 0, skb->len);
+			skb_queue_head(&vi->recv, skb);
 
-		f->page_offset = 0;
-		f->size = PAGE_SIZE;
-
-		skb_shinfo(skb)->nr_frags++;
+			err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+			if (err < 0) {
+				skb_unlink(skb, &vi->recv);
+				kfree_skb(skb);
+				break;
+			}
 
-		sg_init_one(sg, page_address(f->page), PAGE_SIZE);
-		skb_queue_head(&vi->recv, skb);
+		} else {
+			struct page *first_page = NULL;
+			struct page *page;
+			int i = MAX_SKB_FRAGS + 2;
+			char *p;
+
+			/*
+			 * chain pages for big packets, allocate skb
+			 * late for both big packets and mergeable
+			 * buffers
+			 */
+more:			page = get_a_page(vi, gfp);
+			if (!page) {
+				if (first_page)
+					give_pages(vi, first_page);
+				oom = true;
+				break;
+			}
 
-		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
-		if (err < 0) {
-			skb_unlink(skb, &vi->recv);
-			kfree_skb(skb);
-			break;
+			p = page_address(page);
+			if (vi->mergeable_rx_bufs) {
+				sg_init_one(sg, p, PAGE_SIZE);
+				err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0,
+							       1, page);
+				if (err < 0) {
+					give_pages(vi, page);
+					break;
+				}
+			} else {
+				int hdr_len = sizeof(struct virtio_net_hdr);
+
+				/*
+				 * allocate MAX_SKB_FRAGS + 1 pages for
+				 * big packets
+				 */
+				page->private = (unsigned long)first_page;
+				first_page = page;
+				if (--i == 1) {
+					int offset = hdr_len + 6;
+
+					/*
+					 * share one page between virtio_net
+					 * header and data, and reserve 6 bytes
+					 * for alignment
+					 */
+					sg_set_buf(sg, p, hdr_len);
+					sg_set_buf(sg+1, p + offset,
+						   PAGE_SIZE - offset);
+					err = vi->rvq->vq_ops->add_buf(vi->rvq,
+							sg, 0,
+							MAX_SKB_FRAGS + 2,
+							first_page);
+					if (err < 0) {
+						give_pages(vi, first_page);
+						break;
+					}
+
+				} else {
+					sg_set_buf(&sg[i], p, PAGE_SIZE);
+					goto more;
+				}
+			}
 		}
 		vi->num++;
 	} while (err > 0);
@@ -411,14 +459,13 @@ static void refill_work(struct work_struct *work)
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
-	struct sk_buff *skb = NULL;
+	void *buf = NULL;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
-		__skb_unlink(skb, &vi->recv);
-		receive_skb(vi->dev, skb, len);
+	       (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+		receive_skb(vi->dev, buf, len);
 		vi->num--;
 		received++;
 	}
@@ -959,6 +1006,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
 	struct sk_buff *skb;
+	int freed;
 
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
@@ -970,11 +1018,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
 	}
 	__skb_queue_purge(&vi->send);
 
-	BUG_ON(vi->num != 0);
-
 	unregister_netdev(vi->dev);
 	cancel_delayed_work_sync(&vi->refill);
 
+	if (vi->mergeable_rx_bufs || vi->big_packets) {
+		freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
+						     virtio_free_pages);
+		vi->num -= freed;
+	}
+
+	BUG_ON(vi->num != 0);
+
 	vdev->config->del_vqs(vi->vdev);
 
 	while (vi->pages)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fbd2ecd..aec7fe7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
 	return true;
 }
 
+static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	int freed = 0;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring.num; i++) {
+		if (vq->data[i]) {
+			/* detach_buf clears data, so grab it now. */
+			ret = vq->data[i];
+			detach_buf(vq, i);
+			callback(ret);
+			freed++;
+		}
+	}
+
+	END_USE(vq);
+	return freed;
+}
+
 irqreturn_t vring_interrupt(int irq, void *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
 	.kick = vring_kick,
 	.disable_cb = vring_disable_cb,
 	.enable_cb = vring_enable_cb,
+	.destroy_buf = vring_destroy_buf,
 };
 
 struct virtqueue *vring_new_virtqueue(unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 057a2e0..7b1e86c 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -71,6 +71,7 @@ struct virtqueue_ops {
 
 	void (*disable_cb)(struct virtqueue *vq);
 	bool (*enable_cb)(struct virtqueue *vq);
+	int (*destroy_buf)(struct virtqueue *vq, void (*callback)(void *));
 };
 
 /**




* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-20  6:15 [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net Shirley Ma
@ 2009-11-20  6:19 ` Eric Dumazet
  2009-11-20 16:08   ` Shirley Ma
  2009-11-20 16:21   ` Shirley Ma
  0 siblings, 2 replies; 20+ messages in thread
From: Eric Dumazet @ 2009-11-20  6:19 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Michael S. Tsirkin, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

Shirley Ma wrote:
> This patch is generated against the 2.6 git tree. I didn't break it up
> since it implements a single piece of functionality. Please review it.
> 
> Thanks
> Shirley
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ------
>  
> +void virtio_free_pages(void *buf)
> +{
> +	struct page *page = (struct page *)buf;
> +
> +	while (page) {
> +		__free_pages(page, 0);
> +		page = (struct page *)page->private;
> +	}
> +}
> +

Interesting use after free :)


* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-20  6:19 ` Eric Dumazet
@ 2009-11-20 16:08   ` Shirley Ma
  2009-11-20 16:21   ` Shirley Ma
  1 sibling, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2009-11-20 16:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Michael S. Tsirkin, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> > +void virtio_free_pages(void *buf)
> > +{
> > +     struct page *page = (struct page *)buf;
> > +
> > +     while (page) {
> > +             __free_pages(page, 0);
> > +             page = (struct page *)page->private;
> > +     }
> > +}
> > +
> 
> Interesting use after free :)

Good catch. This code runs during virtio_net removal. I ran virtio_net
removal many times and didn't hit any panic :(. Fixed it as below:

void virtio_free_pages(void *buf)
{
	struct page *page = (struct page *)buf;
	struct page *npage;

	while (page) {
		npage = (struct page *)page->private;
		__free_pages(page, 0);
		page = npage;
	}
}

Thanks
Shirley




* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-20  6:19 ` Eric Dumazet
  2009-11-20 16:08   ` Shirley Ma
@ 2009-11-20 16:21   ` Shirley Ma
  2009-11-23  1:08     ` Rusty Russell
  2009-11-23  9:43     ` Michael S. Tsirkin
  1 sibling, 2 replies; 20+ messages in thread
From: Shirley Ma @ 2009-11-20 16:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Michael S. Tsirkin, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> Interesting use after free :)

Thanks for catching the stupid mistake. This is the updated patch for
review.

Signed-off-by: Shirley Ma (xma@us.ibm.com)
------

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b9e002f..5699bd3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -80,33 +80,50 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
 	return (struct skb_vnet_hdr *)skb->cb;
 }
 
-static void give_a_page(struct virtnet_info *vi, struct page *page)
+static void give_pages(struct virtnet_info *vi, struct page *page)
 {
-	page->private = (unsigned long)vi->pages;
+	struct page *npage = (struct page *)page->private;
+
+	if (!npage)
+		page->private = (unsigned long)vi->pages;
+	else {
+		/* give a page list */
+		while (npage) {
+			if (npage->private == (unsigned long)0) {
+				npage->private = (unsigned long)vi->pages;
+				break;
+			}
+			npage = (struct page *)npage->private;
+		}
+	}
 	vi->pages = page;
 }
 
-static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
-{
-	unsigned int i;
-
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		give_a_page(vi, skb_shinfo(skb)->frags[i].page);
-	skb_shinfo(skb)->nr_frags = 0;
-	skb->data_len = 0;
-}
-
 static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 {
 	struct page *p = vi->pages;
 
-	if (p)
+	if (p) {
 		vi->pages = (struct page *)p->private;
-	else
+		/* use private to chain big packets */
+		p->private = (unsigned long)0;
+	} else
 		p = alloc_page(gfp_mask);
 	return p;
 }
 
+void virtio_free_pages(void *buf)
+{
+	struct page *page = (struct page *)buf;
+	struct page *npage;
+
+	while (page) {
+		npage = (struct page *)page->private;
+		__free_pages(page, 0);
+		page = npage;
+	}
+}
+
 static void skb_xmit_done(struct virtqueue *svq)
 {
 	struct virtnet_info *vi = svq->vdev->priv;
@@ -118,12 +135,36 @@ static void skb_xmit_done(struct virtqueue *svq)
 	netif_wake_queue(vi->dev);
 }
 
-static void receive_skb(struct net_device *dev, struct sk_buff *skb,
+static int set_skb_frags(struct sk_buff *skb, struct page *page,
+				int offset, int len)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+	skb_frag_t *f;
+
+	i = skb_shinfo(skb)->nr_frags;
+	f = &skb_shinfo(skb)->frags[i];
+	f->page = page;
+	f->page_offset = offset;
+
+	if (len > (PAGE_SIZE - f->page_offset))
+		f->size = PAGE_SIZE - f->page_offset;
+	else
+		f->size = len;
+
+	skb_shinfo(skb)->nr_frags++;
+	skb->data_len += f->size;
+	skb->len += f->size;
+
+	len -= f->size;
+	return len;
+}
+
+static void receive_skb(struct net_device *dev, void *buf,
 			unsigned len)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
-	int err;
+	struct skb_vnet_hdr *hdr;
+	struct sk_buff *skb;
 	int i;
 
 	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
@@ -132,39 +173,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 		goto drop;
 	}
 
-	if (vi->mergeable_rx_bufs) {
-		unsigned int copy;
-		char *p = page_address(skb_shinfo(skb)->frags[0].page);
+	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
+		skb = (struct sk_buff *)buf;
+
+		__skb_unlink(skb, &vi->recv);
+
+		hdr = skb_vnet_hdr(skb);
+		len -= sizeof(hdr->hdr);
+		skb_trim(skb, len);
+	} else {
+		struct page *page = (struct page *)buf;
+		int copy, hdr_len, num_buf, offset;
+		char *p;
+
+		p = page_address(page);
 
-		if (len > PAGE_SIZE)
-			len = PAGE_SIZE;
-		len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
+		if (unlikely(!skb)) {
+			dev->stats.rx_dropped++;
+			return;
+		}
+		skb_reserve(skb, NET_IP_ALIGN);
+		hdr = skb_vnet_hdr(skb);
 
-		memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
-		p += sizeof(hdr->mhdr);
+		if (vi->mergeable_rx_bufs) {
+			hdr_len = sizeof(hdr->mhdr);
+			memcpy(&hdr->mhdr, p, hdr_len);
+			num_buf = hdr->mhdr.num_buffers;
+			offset = hdr_len;
+			if (len > PAGE_SIZE)
+				len = PAGE_SIZE;
+		} else {
+			/* big packtes 6 bytes alignment between virtio_net
+			 * header and data */
+			hdr_len = sizeof(hdr->hdr);
+			memcpy(&hdr->hdr, p, hdr_len);
+			offset = hdr_len + 6;
+		}
+
+		p += offset;
 
+		len -= hdr_len;
 		copy = len;
 		if (copy > skb_tailroom(skb))
 			copy = skb_tailroom(skb);
-
 		memcpy(skb_put(skb, copy), p, copy);
 
 		len -= copy;
 
-		if (!len) {
-			give_a_page(vi, skb_shinfo(skb)->frags[0].page);
-			skb_shinfo(skb)->nr_frags--;
-		} else {
-			skb_shinfo(skb)->frags[0].page_offset +=
-				sizeof(hdr->mhdr) + copy;
-			skb_shinfo(skb)->frags[0].size = len;
-			skb->data_len += len;
-			skb->len += len;
+		if (!len)
+			give_pages(vi, page);
+		else {
+			len = set_skb_frags(skb, page, copy + offset, len);
+			/* process big packets */
+			while (len > 0) {
+				page = (struct page *)page->private;
+				if (!page)
+					break;
+				len = set_skb_frags(skb, page, 0, len);
+			}
+			if (page && page->private)
+				give_pages(vi, (struct page *)page->private);
 		}
 
-		while (--hdr->mhdr.num_buffers) {
-			struct sk_buff *nskb;
-
+		/* process mergeable buffers */
+		while (vi->mergeable_rx_bufs && --num_buf) {
 			i = skb_shinfo(skb)->nr_frags;
 			if (i >= MAX_SKB_FRAGS) {
 				pr_debug("%s: packet too long %d\n", dev->name,
@@ -173,41 +246,20 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
 				goto drop;
 			}
 
-			nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
-			if (!nskb) {
+			page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
+			if (!page) {
 				pr_debug("%s: rx error: %d buffers missing\n",
 					 dev->name, hdr->mhdr.num_buffers);
 				dev->stats.rx_length_errors++;
 				goto drop;
 			}
 
-			__skb_unlink(nskb, &vi->recv);
-			vi->num--;
-
-			skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
-			skb_shinfo(nskb)->nr_frags = 0;
-			kfree_skb(nskb);
-
 			if (len > PAGE_SIZE)
 				len = PAGE_SIZE;
 
-			skb_shinfo(skb)->frags[i].size = len;
-			skb_shinfo(skb)->nr_frags++;
-			skb->data_len += len;
-			skb->len += len;
-		}
-	} else {
-		len -= sizeof(hdr->hdr);
-
-		if (len <= MAX_PACKET_LEN)
-			trim_pages(vi, skb);
+			set_skb_frags(skb, page, 0, len);
 
-		err = pskb_trim(skb, len);
-		if (err) {
-			pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
-				 len, err);
-			dev->stats.rx_dropped++;
-			goto drop;
+			vi->num--;
 		}
 	}
 
@@ -271,107 +323,105 @@ drop:
 	dev_kfree_skb(skb);
 }
 
-static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 {
-	struct sk_buff *skb;
 	struct scatterlist sg[2+MAX_SKB_FRAGS];
-	int num, err, i;
+	int err = 0;
 	bool oom = false;
 
 	sg_init_table(sg, 2+MAX_SKB_FRAGS);
 	do {
-		struct skb_vnet_hdr *hdr;
-
-		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb)) {
-			oom = true;
-			break;
-		}
-
-		skb_reserve(skb, NET_IP_ALIGN);
-		skb_put(skb, MAX_PACKET_LEN);
-
-		hdr = skb_vnet_hdr(skb);
-		sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
-
-		if (vi->big_packets) {
-			for (i = 0; i < MAX_SKB_FRAGS; i++) {
-				skb_frag_t *f = &skb_shinfo(skb)->frags[i];
-				f->page = get_a_page(vi, gfp);
-				if (!f->page)
-					break;
-
-				f->page_offset = 0;
-				f->size = PAGE_SIZE;
-
-				skb->data_len += PAGE_SIZE;
-				skb->len += PAGE_SIZE;
-
-				skb_shinfo(skb)->nr_frags++;
+		/* allocate skb for MAX_PACKET_LEN len */
+		if (!vi->big_packets && !vi->mergeable_rx_bufs) {
+			struct skb_vnet_hdr *hdr;
+			struct sk_buff *skb;
+
+			skb = netdev_alloc_skb(vi->dev,
+					       MAX_PACKET_LEN + NET_IP_ALIGN);
+			if (unlikely(!skb)) {
+				oom = true;
+				break;
 			}
-		}
-
-		num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
-		skb_queue_head(&vi->recv, skb);
-
-		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
-		if (err < 0) {
-			skb_unlink(skb, &vi->recv);
-			trim_pages(vi, skb);
-			kfree_skb(skb);
-			break;
-		}
-		vi->num++;
-	} while (err >= num);
-	if (unlikely(vi->num > vi->max))
-		vi->max = vi->num;
-	vi->rvq->vq_ops->kick(vi->rvq);
-	return !oom;
-}
-
-/* Returns false if we couldn't fill entirely (OOM). */
-static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
-{
-	struct sk_buff *skb;
-	struct scatterlist sg[1];
-	int err;
-	bool oom = false;
 
-	if (!vi->mergeable_rx_bufs)
-		return try_fill_recv_maxbufs(vi, gfp);
+			skb_reserve(skb, NET_IP_ALIGN);
+			skb_put(skb, MAX_PACKET_LEN);
 
-	do {
-		skb_frag_t *f;
+			hdr = skb_vnet_hdr(skb);
+			sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
 
-		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb)) {
-			oom = true;
-			break;
-		}
-
-		skb_reserve(skb, NET_IP_ALIGN);
-
-		f = &skb_shinfo(skb)->frags[0];
-		f->page = get_a_page(vi, gfp);
-		if (!f->page) {
-			oom = true;
-			kfree_skb(skb);
-			break;
-		}
+			skb_to_sgvec(skb, sg+1, 0, skb->len);
+			skb_queue_head(&vi->recv, skb);
 
-		f->page_offset = 0;
-		f->size = PAGE_SIZE;
-
-		skb_shinfo(skb)->nr_frags++;
+			err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
+			if (err < 0) {
+				skb_unlink(skb, &vi->recv);
+				kfree_skb(skb);
+				break;
+			}
 
-		sg_init_one(sg, page_address(f->page), PAGE_SIZE);
-		skb_queue_head(&vi->recv, skb);
+		} else {
+			struct page *first_page = NULL;
+			struct page *page;
+			int i = MAX_SKB_FRAGS + 2;
+			char *p;
+
+			/*
+			 * chain pages for big packets, allocate skb
+			 * late for both big packets and mergeable
+			 * buffers
+			 */
+more:			page = get_a_page(vi, gfp);
+			if (!page) {
+				if (first_page)
+					give_pages(vi, first_page);
+				oom = true;
+				break;
+			}
 
-		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
-		if (err < 0) {
-			skb_unlink(skb, &vi->recv);
-			kfree_skb(skb);
-			break;
+			p = page_address(page);
+			if (vi->mergeable_rx_bufs) {
+				sg_init_one(sg, p, PAGE_SIZE);
+				err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0,
+							       1, page);
+				if (err < 0) {
+					give_pages(vi, page);
+					break;
+				}
+			} else {
+				int hdr_len = sizeof(struct virtio_net_hdr);
+
+				/*
+				 * allocate MAX_SKB_FRAGS + 1 pages for
+				 * big packets
+				 */
+				page->private = (unsigned long)first_page;
+				first_page = page;
+				if (--i == 1) {
+					int offset = hdr_len + 6;
+
+					/*
+					 * share one page between virtio_net
+					 * header and data, and reserve 6 bytes
+					 * for alignment
+					 */
+					sg_set_buf(sg, p, hdr_len);
+					sg_set_buf(sg+1, p + offset,
+						   PAGE_SIZE - offset);
+					err = vi->rvq->vq_ops->add_buf(vi->rvq,
+							sg, 0,
+							MAX_SKB_FRAGS + 2,
+							first_page);
+					if (err < 0) {
+						give_pages(vi, first_page);
+						break;
+					}
+
+				} else {
+					sg_set_buf(&sg[i], p, PAGE_SIZE);
+					goto more;
+				}
+			}
 		}
 		vi->num++;
 	} while (err > 0);
@@ -411,14 +461,13 @@ static void refill_work(struct work_struct *work)
 static int virtnet_poll(struct napi_struct *napi, int budget)
 {
 	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
-	struct sk_buff *skb = NULL;
+	void *buf = NULL;
 	unsigned int len, received = 0;
 
 again:
 	while (received < budget &&
-	       (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
-		__skb_unlink(skb, &vi->recv);
-		receive_skb(vi->dev, skb, len);
+	       (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
+		receive_skb(vi->dev, buf, len);
 		vi->num--;
 		received++;
 	}
@@ -959,6 +1008,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
 	struct sk_buff *skb;
+	int freed;
 
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
@@ -970,11 +1020,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
 	}
 	__skb_queue_purge(&vi->send);
 
-	BUG_ON(vi->num != 0);
-
 	unregister_netdev(vi->dev);
 	cancel_delayed_work_sync(&vi->refill);
 
+	if (vi->mergeable_rx_bufs || vi->big_packets) {
+		freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
+						     virtio_free_pages);
+		vi->num -= freed;
+	}
+
+	BUG_ON(vi->num != 0);
+
 	vdev->config->del_vqs(vi->vdev);
 
 	while (vi->pages)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index fbd2ecd..aec7fe7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
 	return true;
 }
 
+static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+	void *ret;
+	unsigned int i;
+	int freed = 0;
+
+	START_USE(vq);
+
+	for (i = 0; i < vq->vring.num; i++) {
+		if (vq->data[i]) {
+			/* detach_buf clears data, so grab it now. */
+			ret = vq->data[i];
+			detach_buf(vq, i);
+			callback(ret);
+			freed++;
+		}
+	}
+
+	END_USE(vq);
+	return freed;
+}
+
 irqreturn_t vring_interrupt(int irq, void *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
 	.kick = vring_kick,
 	.disable_cb = vring_disable_cb,
 	.enable_cb = vring_enable_cb,
+	.destroy_buf = vring_destroy_buf,
 };
 
 struct virtqueue *vring_new_virtqueue(unsigned int num,
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 057a2e0..7b1e86c 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -71,6 +71,7 @@ struct virtqueue_ops {
 
 	void (*disable_cb)(struct virtqueue *vq);
 	bool (*enable_cb)(struct virtqueue *vq);
+	int (*destroy_buf)(struct virtqueue *vq, void (*callback)(void *));
 };
 
 /**






* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-20 16:21   ` Shirley Ma
@ 2009-11-23  1:08     ` Rusty Russell
  2009-11-23 16:07       ` Shirley Ma
  2009-11-23 19:01       ` Shirley Ma
  2009-11-23  9:43     ` Michael S. Tsirkin
  1 sibling, 2 replies; 20+ messages in thread
From: Rusty Russell @ 2009-11-23  1:08 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Eric Dumazet, Michael S. Tsirkin, Avi Kivity, netdev, kvm, linux-kernel

On Sat, 21 Nov 2009 02:51:41 am Shirley Ma wrote:
> Signed-off-by: Shirley Ma (xma@us.ibm.com)

Hi Shirley,

   This patch seems like a good idea, but it needs some work.  As this is
the first time I've received a patch from you, I will go through it in extra
detail.  (As virtio maintainer, it's my responsibility to include this patch,
or not).

> -static void give_a_page(struct virtnet_info *vi, struct page *page)
> +static void give_pages(struct virtnet_info *vi, struct page *page)
>  {
> -	page->private = (unsigned long)vi->pages;
> +	struct page *npage = (struct page *)page->private;
> +
> +	if (!npage)
> +		page->private = (unsigned long)vi->pages;
> +	else {
> +		/* give a page list */
> +		while (npage) {
> +			if (npage->private == (unsigned long)0) {
> +				npage->private = (unsigned long)vi->pages;
> +				break;
> +			}
> +			npage = (struct page *)npage->private;
> +		}
> +	}
>  	vi->pages = page;

The loop should cover both cases of the if branch.  Either way, we want
to find the last page in the list so we can set its ->private pointer to
vi->pages.

Also, the "== (unsigned long)0" is a little verbose.

How about:
	struct page *end;

	/* Find end of list, sew whole thing into vi->pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)vi->pages;
	vi->pages = page;

> +void virtio_free_pages(void *buf)

This is a terrible name; it's specific to virtio_net, plus it should be
static.  Just "free_pages" should be sufficient here I think.

> +{
> +	struct page *page = (struct page *)buf;
> +	struct page *npage;
> +
> +	while (page) {
> +		npage = (struct page *)page->private;
> +		__free_pages(page, 0);
> +		page = npage;
> +	}

This smells a lot like a for loop to me?

	struct page *i, *next;

	for (i = buf; i; i = next) {
		next = (struct page *)i->private;
		__free_pages(i, 0);
	}

> +static int set_skb_frags(struct sk_buff *skb, struct page *page,
> +				int offset, int len)

A better name might be "skb_add_frag()"?

> +static void receive_skb(struct net_device *dev, void *buf,
>  			unsigned len)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> -	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
> -	int err;
> +	struct skb_vnet_hdr *hdr;
> +	struct sk_buff *skb;
>  	int i;
>  
>  	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
> @@ -132,39 +173,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
>  		goto drop;
>  	}
>  
> -	if (vi->mergeable_rx_bufs) {
> -		unsigned int copy;
> -		char *p = page_address(skb_shinfo(skb)->frags[0].page);
> +	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
> +		skb = (struct sk_buff *)buf;

This cast is unnecessary, but a comment would be nice:
	/* Simple case: the pointer is the 1514-byte skb */

> +	} else {

And a matching comment here:

	/* The pointer is a chain of pages. */

> +		struct page *page = (struct page *)buf;
> +		int copy, hdr_len, num_buf, offset;
> +		char *p;
> +
> +		p = page_address(page);
>  
> -		if (len > PAGE_SIZE)
> -			len = PAGE_SIZE;
> -		len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
> +		if (unlikely(!skb)) {
> +			dev->stats.rx_dropped++;

Does this mean we leak all those pages?  Shouldn't we give_pages() here?
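
Something like this, perhaps (just a sketch, reusing the names from your patch):

	if (unlikely(!skb)) {
		dev->stats.rx_dropped++;
		give_pages(vi, page);	/* hand the whole page chain back */
		return;
	}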

> +			return;
> +		}
> +		skb_reserve(skb, NET_IP_ALIGN);
> +		hdr = skb_vnet_hdr(skb);
>  
> -		memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
> -		p += sizeof(hdr->mhdr);
> +		if (vi->mergeable_rx_bufs) {
> +			hdr_len = sizeof(hdr->mhdr);
> +			memcpy(&hdr->mhdr, p, hdr_len);
> +			num_buf = hdr->mhdr.num_buffers;
> +			offset = hdr_len;
> +			if (len > PAGE_SIZE)
> +				len = PAGE_SIZE;
> +		} else {
> +			/* big packtes 6 bytes alignment between virtio_net
> +			 * header and data */
> +			hdr_len = sizeof(hdr->hdr);
> +			memcpy(&hdr->hdr, p, hdr_len);
> +			offset = hdr_len + 6;

Really?  I can't see where the current driver does this 6 byte offset.  There
should be a header, then the packet...

Ah, I see below that you set things up this way.  The better way to do this
is to create a new structure to use in various places.

	struct padded_vnet_hdr {
		struct virtio_net_hdr hdr;
		/* This padding makes our packet 16 byte aligned */
		char padding[6];
	};

However, I question whether making it 16 byte is the right thing: the
ethernet header is 14 bytes long, so don't we want 8 bytes of padding?

> +		}

I think you can move the memcpy outside the if, like so:

	memcpy(hdr, p, hdr_len);

> +		if (!len)
> +			give_pages(vi, page);
> +		else {
> +			len = set_skb_frags(skb, page, copy + offset, len);
> +			/* process big packets */
> +			while (len > 0) {
> +				page = (struct page *)page->private;
> +				if (!page)
> +					break;
> +				len = set_skb_frags(skb, page, 0, len);
> +			}
> +			if (page && page->private)
> +				give_pages(vi, (struct page *)page->private);
>  		}

I can't help but think this statement should be one loop somehow.  Something
like:

	offset += copy;

	while (len) {
		len = skb_add_frag(skb, page, offset, len);
		page = (struct page *)page->private;
		offset = 0;
	}
	if (page)
		give_pages(vi, page);

> -static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
> +/* Returns false if we couldn't fill entirely (OOM). */
> +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)

The result of trying to merge all the cases here is messy.  I'd split it
inside the loop into: add_recvbuf_small(vi, gfp), add_recvbuf_big(vi, gfp)
and add_recvbuf_mergeable(vi, gfp).

It'll also be easier for me to review each case then, as well as getting
rid of the big packets if we decide to do that later.  You could even do
that split as a separate patch, before this one.
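
As a rough sketch of the shape I have in mind (helper names as above; I'm
assuming each helper returns -ENOMEM on allocation failure and otherwise
passes back add_buf()'s return value):

	static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
	{
		int err;
		bool oom = false;

		do {
			if (vi->mergeable_rx_bufs)
				err = add_recvbuf_mergeable(vi, gfp);
			else if (vi->big_packets)
				err = add_recvbuf_big(vi, gfp);
			else
				err = add_recvbuf_small(vi, gfp);

			oom = err == -ENOMEM;
			if (err < 0)
				break;
			vi->num++;
		} while (err > 0);

		if (unlikely(vi->num > vi->max))
			vi->max = vi->num;
		vi->rvq->vq_ops->kick(vi->rvq);
		return !oom;
	}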

> @@ -970,11 +1020,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
>  	}
>  	__skb_queue_purge(&vi->send);
>  
> -	BUG_ON(vi->num != 0);
> -
>  	unregister_netdev(vi->dev);
>  	cancel_delayed_work_sync(&vi->refill);
>  
> +	if (vi->mergeable_rx_bufs || vi->big_packets) {
> +		freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
> +						     virtio_free_pages);
> +		vi->num -= freed;
> +	}
> +
> +	BUG_ON(vi->num != 0);
> +

destroy_buf should really be called destroy_bufs().  And I don't think you
use the vi->recv skb list in this case at all, so the loop which purges those
should be under an "else {" of this branch.
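
Roughly (sketch only, using the destroy_bufs/free_pages names from above):

	if (vi->mergeable_rx_bufs || vi->big_packets) {
		vi->num -= vi->rvq->vq_ops->destroy_bufs(vi->rvq, free_pages);
	} else {
		/* Only the small-packet case queues skbs on vi->recv. */
		while ((skb = __skb_dequeue(&vi->recv)) != NULL) {
			kfree_skb(skb);
			vi->num--;
		}
	}
	BUG_ON(vi->num != 0);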

> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index fbd2ecd..aec7fe7 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
>  	return true;
>  }
>  
> +static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))

This new parameter should be introduced as a separate patch.  It should also be
called destroy_bufs.  It also returns an unsigned int.  I would call the callback
something a little more informative, such as "destroy"; here and in the header.
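
I.e. in the header, something like:

	unsigned int (*destroy_bufs)(struct virtqueue *vq,
				     void (*destroy)(void *buf));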

> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	int freed = 0;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring.num; i++) {
> +		if (vq->data[i]) {
> +			/* detach_buf clears data, so grab it now. */
> +			ret = vq->data[i];
> +			detach_buf(vq, i);
> +			callback(ret);

ret is a bad name for this; how about buf?

Overall, the patch looks good.  But it would have been nicer if it were
split into several parts: cleanups, new infrastructure, then the actual
allocation change.

Thanks!
Rusty.


* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-20 16:21   ` Shirley Ma
  2009-11-23  1:08     ` Rusty Russell
@ 2009-11-23  9:43     ` Michael S. Tsirkin
  2009-11-23 16:18       ` Shirley Ma
  1 sibling, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-23  9:43 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Eric Dumazet, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

On Fri, Nov 20, 2009 at 08:21:41AM -0800, Shirley Ma wrote:
> On Fri, 2009-11-20 at 07:19 +0100, Eric Dumazet wrote:
> > Interesting use after free :)
> 
> Thanks for catching the stupid mistake. This is the updated patch for
> review.
> 
> Signed-off-by: Shirley Ma (xma@us.ibm.com)

some style comments. addressing them will make it
easier to review actual content.

> ------
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b9e002f..5699bd3 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -80,33 +80,50 @@ static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
>  	return (struct skb_vnet_hdr *)skb->cb;
>  }
>  
> -static void give_a_page(struct virtnet_info *vi, struct page *page)
> +static void give_pages(struct virtnet_info *vi, struct page *page)
>  {
> -	page->private = (unsigned long)vi->pages;
> +	struct page *npage = (struct page *)page->private;
> +
> +	if (!npage)
> +		page->private = (unsigned long)vi->pages;
> +	else {
> +		/* give a page list */
> +		while (npage) {
> +			if (npage->private == (unsigned long)0) {

should be !npage->private
and nesting is too deep here:
this is cleaner in a give_a_page subroutine
as it was.

> +				npage->private = (unsigned long)vi->pages;
> +				break;
> +			}
> +			npage = (struct page *)npage->private;
> +		}
> +	}
>  	vi->pages = page;
>  }
>  
> -static void trim_pages(struct virtnet_info *vi, struct sk_buff *skb)
> -{
> -	unsigned int i;
> -
> -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> -		give_a_page(vi, skb_shinfo(skb)->frags[i].page);
> -	skb_shinfo(skb)->nr_frags = 0;
> -	skb->data_len = 0;
> -}
> -
>  static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)

so in short, we are constantly walking a linked list here.

>  {
>  	struct page *p = vi->pages;
>  
> -	if (p)
> +	if (p) {
>  		vi->pages = (struct page *)p->private;
> -	else
> +		/* use private to chain big packets */

packets? or pages?

> +		p->private = (unsigned long)0;

the comment is not really helpful:
you say you use private to chain but 0 does not
chain anything. You also do not need the cast to long?

> +	} else
>  		p = alloc_page(gfp_mask);
>  	return p;
>  }
>  
> +void virtio_free_pages(void *buf)
> +{
> +	struct page *page = (struct page *)buf;
> +	struct page *npage;
> +
> +	while (page) {
> +		npage = (struct page *)page->private;
> +		__free_pages(page, 0);
> +		page = npage;
> +	}
> +}
> +
>  static void skb_xmit_done(struct virtqueue *svq)
>  {
>  	struct virtnet_info *vi = svq->vdev->priv;
> @@ -118,12 +135,36 @@ static void skb_xmit_done(struct virtqueue *svq)
>  	netif_wake_queue(vi->dev);
>  }
>  
> -static void receive_skb(struct net_device *dev, struct sk_buff *skb,
> +static int set_skb_frags(struct sk_buff *skb, struct page *page,
> +				int offset, int len)
> +{
> +	int i = skb_shinfo(skb)->nr_frags;
> +	skb_frag_t *f;
> +
> +	i = skb_shinfo(skb)->nr_frags;
> +	f = &skb_shinfo(skb)->frags[i];
> +	f->page = page;
> +	f->page_offset = offset;
> +
> +	if (len > (PAGE_SIZE - f->page_offset))

brackets around math are not needed.

> +		f->size = PAGE_SIZE - f->page_offset;
> +	else
> +		f->size = len;
> +
> +	skb_shinfo(skb)->nr_frags++;
> +	skb->data_len += f->size;
> +	skb->len += f->size;
> +
> +	len -= f->size;
> +	return len;
> +}
> +
> +static void receive_skb(struct net_device *dev, void *buf,
>  			unsigned len)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> -	struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
> -	int err;
> +	struct skb_vnet_hdr *hdr;
> +	struct sk_buff *skb;
>  	int i;
>  
>  	if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {
> @@ -132,39 +173,71 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
>  		goto drop;
>  	}
>  
> -	if (vi->mergeable_rx_bufs) {
> -		unsigned int copy;
> -		char *p = page_address(skb_shinfo(skb)->frags[0].page);
> +	if (!vi->mergeable_rx_bufs && !vi->big_packets) {
> +		skb = (struct sk_buff *)buf;
> +
> +		__skb_unlink(skb, &vi->recv);
> +
> +		hdr = skb_vnet_hdr(skb);
> +		len -= sizeof(hdr->hdr);
> +		skb_trim(skb, len);
> +	} else {
> +		struct page *page = (struct page *)buf;
> +		int copy, hdr_len, num_buf, offset;
> +		char *p;
> +
> +		p = page_address(page);
>  
> -		if (len > PAGE_SIZE)
> -			len = PAGE_SIZE;
> -		len -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
> +		if (unlikely(!skb)) {
> +			dev->stats.rx_dropped++;
> +			return;
> +		}
> +		skb_reserve(skb, NET_IP_ALIGN);
> +		hdr = skb_vnet_hdr(skb);
>  
> -		memcpy(&hdr->mhdr, p, sizeof(hdr->mhdr));
> -		p += sizeof(hdr->mhdr);
> +		if (vi->mergeable_rx_bufs) {
> +			hdr_len = sizeof(hdr->mhdr);

space and no brackets after sizeof.

> +			memcpy(&hdr->mhdr, p, hdr_len);
> +			num_buf = hdr->mhdr.num_buffers;
> +			offset = hdr_len;
> +			if (len > PAGE_SIZE)
> +				len = PAGE_SIZE;
> +		} else {
> +			/* big packtes 6 bytes alignment between virtio_net

typo

> +			 * header and data */

please think of a way to get rid of magic constants like 6 and 2
here and elsewhere.

> +			hdr_len = sizeof(hdr->hdr);
> +			memcpy(&hdr->hdr, p, hdr_len);
> +			offset = hdr_len + 6;
> +		}
> +
> +		p += offset;
>  
> +		len -= hdr_len;
>  		copy = len;
>  		if (copy > skb_tailroom(skb))
>  			copy = skb_tailroom(skb);
> -
>  		memcpy(skb_put(skb, copy), p, copy);
>  
>  		len -= copy;
>  
> -		if (!len) {
> -			give_a_page(vi, skb_shinfo(skb)->frags[0].page);
> -			skb_shinfo(skb)->nr_frags--;
> -		} else {
> -			skb_shinfo(skb)->frags[0].page_offset +=
> -				sizeof(hdr->mhdr) + copy;
> -			skb_shinfo(skb)->frags[0].size = len;
> -			skb->data_len += len;
> -			skb->len += len;
> +		if (!len)
> +			give_pages(vi, page);
> +		else {
> +			len = set_skb_frags(skb, page, copy + offset, len);
> +			/* process big packets */
> +			while (len > 0) {
> +				page = (struct page *)page->private;
> +				if (!page)
> +					break;
> +				len = set_skb_frags(skb, page, 0, len);
> +			}
> +			if (page && page->private)
> +				give_pages(vi, (struct page *)page->private);
>  		}
>  
> -		while (--hdr->mhdr.num_buffers) {
> -			struct sk_buff *nskb;
> -
> +		/* process mergeable buffers */
> +		while (vi->mergeable_rx_bufs && --num_buf) {
>  			i = skb_shinfo(skb)->nr_frags;
>  			if (i >= MAX_SKB_FRAGS) {
>  				pr_debug("%s: packet too long %d\n", dev->name,
> @@ -173,41 +246,20 @@ static void receive_skb(struct net_device *dev, struct sk_buff *skb,
>  				goto drop;
>  			}
>  
> -			nskb = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
> -			if (!nskb) {
> +			page = vi->rvq->vq_ops->get_buf(vi->rvq, &len);
> +			if (!page) {
>  				pr_debug("%s: rx error: %d buffers missing\n",
>  					 dev->name, hdr->mhdr.num_buffers);
>  				dev->stats.rx_length_errors++;
>  				goto drop;
>  			}
>  
> -			__skb_unlink(nskb, &vi->recv);
> -			vi->num--;
> -
> -			skb_shinfo(skb)->frags[i] = skb_shinfo(nskb)->frags[0];
> -			skb_shinfo(nskb)->nr_frags = 0;
> -			kfree_skb(nskb);
> -
>  			if (len > PAGE_SIZE)
>  				len = PAGE_SIZE;
>  
> -			skb_shinfo(skb)->frags[i].size = len;
> -			skb_shinfo(skb)->nr_frags++;
> -			skb->data_len += len;
> -			skb->len += len;
> -		}
> -	} else {
> -		len -= sizeof(hdr->hdr);
> -
> -		if (len <= MAX_PACKET_LEN)
> -			trim_pages(vi, skb);
> +			set_skb_frags(skb, page, 0, len);
>  
> -		err = pskb_trim(skb, len);
> -		if (err) {
> -			pr_debug("%s: pskb_trim failed %i %d\n", dev->name,
> -				 len, err);
> -			dev->stats.rx_dropped++;
> -			goto drop;
> +			vi->num--;
>  		}
>  	}
>  
> @@ -271,107 +323,105 @@ drop:
>  	dev_kfree_skb(skb);
>  }
>  
> -static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
> +/* Returns false if we couldn't fill entirely (OOM). */
> +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
>  {
> -	struct sk_buff *skb;
>  	struct scatterlist sg[2+MAX_SKB_FRAGS];
> -	int num, err, i;
> +	int err = 0;
>  	bool oom = false;
>  
>  	sg_init_table(sg, 2+MAX_SKB_FRAGS);
>  	do {
> -		struct skb_vnet_hdr *hdr;
> -
> -		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
> -		if (unlikely(!skb)) {
> -			oom = true;
> -			break;
> -		}
> -
> -		skb_reserve(skb, NET_IP_ALIGN);
> -		skb_put(skb, MAX_PACKET_LEN);
> -
> -		hdr = skb_vnet_hdr(skb);
> -		sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
> -
> -		if (vi->big_packets) {
> -			for (i = 0; i < MAX_SKB_FRAGS; i++) {
> -				skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> -				f->page = get_a_page(vi, gfp);
> -				if (!f->page)
> -					break;
> -
> -				f->page_offset = 0;
> -				f->size = PAGE_SIZE;
> -
> -				skb->data_len += PAGE_SIZE;
> -				skb->len += PAGE_SIZE;
> -
> -				skb_shinfo(skb)->nr_frags++;
> +		/* allocate skb for MAX_PACKET_LEN len */
> +		if (!vi->big_packets && !vi->mergeable_rx_bufs) {
> +			struct skb_vnet_hdr *hdr;
> +			struct sk_buff *skb;
> +
> +			skb = netdev_alloc_skb(vi->dev,
> +					       MAX_PACKET_LEN + NET_IP_ALIGN);
> +			if (unlikely(!skb)) {
> +				oom = true;
> +				break;
>  			}
> -		}
> -
> -		num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;
> -		skb_queue_head(&vi->recv, skb);
> -
> -		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb);
> -		if (err < 0) {
> -			skb_unlink(skb, &vi->recv);
> -			trim_pages(vi, skb);
> -			kfree_skb(skb);
> -			break;
> -		}
> -		vi->num++;
> -	} while (err >= num);
> -	if (unlikely(vi->num > vi->max))
> -		vi->max = vi->num;
> -	vi->rvq->vq_ops->kick(vi->rvq);
> -	return !oom;
> -}
> -
> -/* Returns false if we couldn't fill entirely (OOM). */
> -static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
> -{
> -	struct sk_buff *skb;
> -	struct scatterlist sg[1];
> -	int err;
> -	bool oom = false;
>  
> -	if (!vi->mergeable_rx_bufs)
> -		return try_fill_recv_maxbufs(vi, gfp);
> +			skb_reserve(skb, NET_IP_ALIGN);
> +			skb_put(skb, MAX_PACKET_LEN);
>  
> -	do {
> -		skb_frag_t *f;
> +			hdr = skb_vnet_hdr(skb);
> +			sg_set_buf(sg, &hdr->hdr, sizeof(hdr->hdr));
>  
> -		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
> -		if (unlikely(!skb)) {
> -			oom = true;
> -			break;
> -		}
> -
> -		skb_reserve(skb, NET_IP_ALIGN);
> -
> -		f = &skb_shinfo(skb)->frags[0];
> -		f->page = get_a_page(vi, gfp);
> -		if (!f->page) {
> -			oom = true;
> -			kfree_skb(skb);
> -			break;
> -		}
> +			skb_to_sgvec(skb, sg+1, 0, skb->len);
> +			skb_queue_head(&vi->recv, skb);
>  
> -		f->page_offset = 0;
> -		f->size = PAGE_SIZE;
> -
> -		skb_shinfo(skb)->nr_frags++;
> +			err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 2, skb);
> +			if (err < 0) {
> +				skb_unlink(skb, &vi->recv);
> +				kfree_skb(skb);
> +				break;
> +			}
>  
> -		sg_init_one(sg, page_address(f->page), PAGE_SIZE);
> -		skb_queue_head(&vi->recv, skb);
> +		} else {
> +			struct page *first_page = NULL;
> +			struct page *page;
> +			int i = MAX_SKB_FRAGS + 2;

replace MAX_SKB_FRAGS + 2 with something symbolic? We have it in 2 places now.
And comment.
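
e.g. (the name is only an example):

	/* sg entries for the big-packet case: one for the header, one for
	 * the rest of the first page, plus MAX_SKB_FRAGS more whole pages */
	#define VIRTNET_MAX_RX_SG (MAX_SKB_FRAGS + 2)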

> +			char *p;
> +
> +			/*
> +			 * chain pages for big packets, allocate skb
> +			 * late for both big packets and mergeable
> +			 * buffers
> +			 */
> +more:			page = get_a_page(vi, gfp);


terrible goto-based loop. Move the stuff into a subfunction; it will be much
more manageable. And convert this to a simple for loop.


> +			if (!page) {
> +				if (first_page)
> +					give_pages(vi, first_page);
> +				oom = true;
> +				break;
> +			}
>  
> -		err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, 1, skb);
> -		if (err < 0) {
> -			skb_unlink(skb, &vi->recv);
> -			kfree_skb(skb);
> -			break;
> +			p = page_address(page);
> +			if (vi->mergeable_rx_bufs) {
> +				sg_init_one(sg, p, PAGE_SIZE);
> +				err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0,
> +							       1, page);
> +				if (err < 0) {
> +					give_pages(vi, page);
> +					break;
> +				}
> +			} else {
> +				int hdr_len = sizeof(struct virtio_net_hdr);
> +
> +				/*
> +				 * allocate MAX_SKB_FRAGS + 1 pages for
> +				 * big packets
> +				 */

and here it is MAX_SKB_FRAGS + 1

> +				page->private = (unsigned long)first_page;
> +				first_page = page;
> +				if (--i == 1) {

this is pretty hairy ... does it have to be this way?
What you are trying to do here
is fill a buffer with pages, in a loop, with the first one
using a partial page, and then add it.
Is that it?
So please code this in a straightforward manner.
It should be as simple as:

	offset = XXX;
	for (i = 0; i < MAX_SKB_FRAGS + 2; ++i) {
		sg_set_buf(sg + i, p + offset, PAGE_SIZE - offset);
		offset = 0;
	}

	err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, MAX_SKB_FRAGS + 2, first_page);

> +					int offset = hdr_len + 6;
> +
> +					/*
> +					 * share one page between virtio_net
> +					 * header and data, and reserve 6 bytes
> +					 * for alignment
> +					 */
> +					sg_set_buf(sg, p, hdr_len);
> +					sg_set_buf(sg+1, p + offset,

space around +
sg + 1 here is same as &sg[i] in fact?

> +						   PAGE_SIZE - offset);
> +					err = vi->rvq->vq_ops->add_buf(vi->rvq,
> +							sg, 0,
> +							MAX_SKB_FRAGS + 2,
> +							first_page);
> +					if (err < 0) {
> +						give_pages(vi, first_page);
> +						break;
> +					}
> +
> +				} else {
> +					sg_set_buf(&sg[i], p, PAGE_SIZE);
> +					goto more;
> +				}
> +			}
>  		}
>  		vi->num++;
>  	} while (err > 0);
> @@ -411,14 +461,13 @@ static void refill_work(struct work_struct *work)
>  static int virtnet_poll(struct napi_struct *napi, int budget)
>  {
>  	struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
> -	struct sk_buff *skb = NULL;
> +	void *buf = NULL;
>  	unsigned int len, received = 0;
>  
>  again:
>  	while (received < budget &&
> -	       (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
> -		__skb_unlink(skb, &vi->recv);
> -		receive_skb(vi->dev, skb, len);
> +	       (buf = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) {
> +		receive_skb(vi->dev, buf, len);
>  		vi->num--;
>  		received++;
>  	}
> @@ -959,6 +1008,7 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
>  {
>  	struct virtnet_info *vi = vdev->priv;
>  	struct sk_buff *skb;
> +	int freed;
>  
>  	/* Stop all the virtqueues. */
>  	vdev->config->reset(vdev);
> @@ -970,11 +1020,17 @@ static void __devexit virtnet_remove(struct virtio_device *vdev)
>  	}
>  	__skb_queue_purge(&vi->send);
>  
> -	BUG_ON(vi->num != 0);
> -
>  	unregister_netdev(vi->dev);
>  	cancel_delayed_work_sync(&vi->refill);

I think we must flush here, otherwise refill might be in progress.

>  
> +	if (vi->mergeable_rx_bufs || vi->big_packets) {
> +		freed = vi->rvq->vq_ops->destroy_buf(vi->rvq,
> +						     virtio_free_pages);
> +		vi->num -= freed;
> +	}
> +
> +	BUG_ON(vi->num != 0);
> +
>  	vdev->config->del_vqs(vi->vdev);
>  
>  	while (vi->pages)
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index fbd2ecd..aec7fe7 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -334,6 +334,29 @@ static bool vring_enable_cb(struct virtqueue *_vq)
>  	return true;
>  }
>  
> +static int vring_destroy_buf(struct virtqueue *_vq, void (*callback)(void *))
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	int freed = 0;
> +
> +	START_USE(vq);
> +
> +	for (i = 0; i < vq->vring.num; i++) {
> +		if (vq->data[i]) {
> +			/* detach_buf clears data, so grab it now. */
> +			ret = vq->data[i];
> +			detach_buf(vq, i);
> +			callback(ret);
> +			freed++;
> +		}
> +	}
> +
> +	END_USE(vq);
> +	return freed;
> +}
> +
>  irqreturn_t vring_interrupt(int irq, void *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);

virtio ring bits really must be a separate patch.

> @@ -360,6 +383,7 @@ static struct virtqueue_ops vring_vq_ops = {
>  	.kick = vring_kick,
>  	.disable_cb = vring_disable_cb,
>  	.enable_cb = vring_enable_cb,
> +	.destroy_buf = vring_destroy_buf,

not sure what a good name is, but destroy_buf is not it.

>  };
>  
>  struct virtqueue *vring_new_virtqueue(unsigned int num,
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 057a2e0..7b1e86c 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -71,6 +71,7 @@ struct virtqueue_ops {
>  
>  	void (*disable_cb)(struct virtqueue *vq);
>  	bool (*enable_cb)(struct virtqueue *vq);
> +	int (*destroy_buf)(struct virtqueue *vq, void (*callback)(void *));

callback -> destructor?

>  };
>  
>  /**
> 


* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23  1:08     ` Rusty Russell
@ 2009-11-23 16:07       ` Shirley Ma
  2009-11-23 22:24         ` Rusty Russell
  2009-11-23 19:01       ` Shirley Ma
  1 sibling, 1 reply; 20+ messages in thread
From: Shirley Ma @ 2009-11-23 16:07 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric Dumazet, Michael S. Tsirkin, Avi Kivity, netdev, kvm, linux-kernel

On Mon, 2009-11-23 at 11:38 +1030, Rusty Russell wrote:
> How about:
>         struct page *end;
> 
>         /* Find end of list, sew whole thing into vi->pages. */
>         for (end = page; end->private; end = (struct page
> *)end->private);
>         end->private = (unsigned long)vi->pages;
>         vi->pages = page;

Yes, this looks nicer.

> > +void virtio_free_pages(void *buf)
> 
> This is a terrible name; it's specific to virtio_net, plus it should
> be
> static.  Just "free_pages" should be sufficient here I think.

> 
> This smells a lot like a for loop to me?
> 
>         struct page *i, *next;
> 
>         for (i = buf; i; i = next) {
>                 next = (struct page *)i->private;
>                 __free_pages(i, 0);
>         }
Agree, will change it.

> > +static int set_skb_frags(struct sk_buff *skb, struct page *page,
> > +                             int offset, int len)
> 
> A better name might be "skb_add_frag()"?
OK.

> > +             skb = (struct sk_buff *)buf;
> This cast is unnecessary, but a comment would be nice:
>         /* Simple case: the pointer is the 1514-byte skb */
> 
> > +     } else {

Without this cast there is a compile warning. 

> And a matching comment here:
> 
>         /* The pointer is a chain of pages. */
> 
OK.

> > +             if (unlikely(!skb)) {
> > +                     dev->stats.rx_dropped++;
> 
> Does this mean we leak all those pages?  Shouldn't we give_pages()
> here?

Yes, it should call give_pages here.


> > +                     offset = hdr_len + 6;
> 
> Really?  I can't see where the current driver does this 6 byte offset.
> There
> should be a header, then the packet...
> Ah, I see below that you set things up this way.  The better way to do
> this
> is to create a new structure to use in various places.
> 
>         struct padded_vnet_hdr {
>                 struct virtio_net_hdr hdr;
>                 /* This padding makes our packet 16 byte aligned */
>                 char padding[6];
>         };
> 
> However, I question whether making it 16 byte is the right thing: the
> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?

Because QEMU requires the 10-byte virtio_net_hdr to be a separate
element, one page is shared between the 10-byte virtio_net_hdr and the
rest of the data, so I put a 6-byte offset between the two buffers. I
didn't look into why QEMU uses a separate buffer for the virtio_net_hdr.

> > +             }
> 
> I think you can move the memcpy outside the if, like so:
> 
>         memcpy(&hdr, p, hdr_len);

I didn't move it out because num_buf = hdr->mhdr.num_buffers needs the
header copied first.

> > +             if (!len)
> > +                     give_pages(vi, page);
> > +             else {
> > +                     len = set_skb_frags(skb, page, copy + offset,
> len);
> > +                     /* process big packets */
> > +                     while (len > 0) {
> > +                             page = (struct page *)page->private;
> > +                             if (!page)
> > +                                     break;
> > +                             len = set_skb_frags(skb, page, 0,
> len);
> > +                     }
> > +                     if (page && page->private)
> > +                             give_pages(vi, (struct page
> *)page->private);
> >               }
> 
> I can't help but think this statement should be one loop somehow.
> Something
> like:
> 
>         offset += copy;
> 
>         while (len) {
>                 len = skb_add_frag(skb, page, offset, len);
>                 page = (struct page *)page->private;
>                 offset = 0;
>         }
>         if (page)
>                 give_pages(vi, page);

I was a little worried about QEMU messing up len (I saw code in the
mergeable-buffer path checking len > PAGE_SIZE), so I put the page check
inside the loop. I will move it outside if checking len is enough.
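
Something like this, then, keeping a page guard in the loop condition (a
sketch, assuming skb_add_frag() returns the remaining length the way
set_skb_frags() does now):

	offset += copy;
	while (len && page) {
		len = skb_add_frag(skb, page, offset, len);
		page = (struct page *)page->private;
		offset = 0;
	}
	if (page)
		give_pages(vi, page);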

> > -static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t
> gfp)
> > +/* Returns false if we couldn't fill entirely (OOM). */
> > +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
> 
> The result of trying to merge all the cases here is messy.  I'd split
> it
> inside the loop into: add_recvbuf_small(vi, gfp), add_recvbuf_big(vi,
> gfp)
> and add_recvbuf_mergeable(vi, gfp).
> 
> It'll also be easier for me to review each case then, as well as
> getting
> rid of the big packets if we decide to do that later.  You could even
> do
> that split as a separate patch, before this one.

Ok, will separate it.
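
For example, a minimal sketch of the split dispatch (assuming each
add_recvbuf_* helper returns the ring capacity left on success and a
negative errno on OOM):

	/* Returns false if we couldn't fill entirely (OOM). */
	static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
	{
		int err;

		for (;;) {
			if (vi->mergeable_rx_bufs)
				err = add_recvbuf_mergeable(vi, gfp);
			else if (vi->big_packets)
				err = add_recvbuf_big(vi, gfp);
			else
				err = add_recvbuf_small(vi, gfp);
			if (err < 0)
				return false;	/* OOM */
			vi->num++;
			if (err == 0)		/* no room left in the ring */
				break;
		}
		vi->rvq->vq_ops->kick(vi->rvq);
		return true;
	}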

> destroy_buf should really be called destroy_bufs().  And I don't think
> you
> use the vi->recv skb list in this case at all, so the loop which
> purges those
> should be under an "else {" of this branch.

Yes.

> This new parameter should be introduced as a separate patch.  It
> should also be
> called destroy_bufs.  It also returns an unsigned int.  I would call
> the callback
> something a little more informative, such as "destroy"; here and in
> the header.

Ok.

> ret is a bad name for this; how about buf?
Sure.

> Overall, the patch looks good.  But it would have been nicer if it
> were
> split into several parts: cleanups, new infrastructure, then the
> actual
> allocation change.

I will try to separate this patch.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23  9:43     ` Michael S. Tsirkin
@ 2009-11-23 16:18       ` Shirley Ma
  0 siblings, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2009-11-23 16:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Eric Dumazet, Avi Kivity, Rusty Russell, netdev, kvm, linux-kernel

On Mon, 2009-11-23 at 11:43 +0200, Michael S. Tsirkin wrote:

> should be !npage->private
> and nesting is too deep here:
> this is cleaner in a give_a_page subroutine
> as it was.

This will be addressed together with Rusty's comments.


> > +		/* use private to chain big packets */
> 
> packets? or pages?

Will change the comment to say it chains pages for big packets.

> > +		p->private = (unsigned long)0;
> 
> the comment is not really helpful:
> you say you use private to chain but 0 does not
> chain anything. You also do not need the cast to long?

Ok.

> > +	if (len > (PAGE_SIZE - f->page_offset))
> 
> brackets around math are not needed.

OK.

> typo
> 
> > +			 * header and data */
Got it.

> please think of a way to get rid of magic constants like 6 and 2
> here and elsewhere.

Will do.

> replace MAX_SKB_FRAGS + 2 with something symbolic? We have it in 2 palces now.
> And comment.

Ok, I can change it.
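
Something like this, perhaps (the name and comment are illustrative
only):

	/* maximum sg entries used per receive buffer */
	#define VIRTNET_MAX_SG (MAX_SKB_FRAGS + 2)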

> terrible goto based loop
> move stuff into subfunction, it will be much
> more manageable, and convert this to a simple
> for loop.

Will change it to different functions based on Rusty's comment.

> and here it is MAX_SKB_FRAGS + 1

I think I should use MAX_SKB_FRAGS + 2 instead. Right now I only use
MAX_SKB_FRAGS + 1 entries but allocate + 2.

> > +				page->private = (unsigned long)first_page;
> > +				first_page = page;
> > +				if (--i == 1) {
> 
> this is pretty hairy ... has to be this way?
> What you are trying to do here
> is fill buffer with pages, in a loop, with first one
> using a partial page, and then add it.
> Is that it?
Yes.

> So please code this in a straight forward manner.
> it should be as simple as:

> 	offset = XXX
> 	for (i = 0; i < MAX_SKB_FRAGS + 2; ++i) {
> 		
> 		sg_set_buf(sg + i, p + offset, PAGE_SIZE - offset);
> 		offset = 0;
> 
> 	}
OK, that looks neater.
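
A slightly fuller sketch of that loop for the big-packet case,
allocating and chaining one page per sg entry (the separate header
element and error handling are elided; first_offset is only an
illustrative name for the partial-page offset discussed above, and vi,
gfp and sg come from the surrounding function):

	struct page *page, *first_page = NULL;
	unsigned int offset = first_offset;
	int i;

	for (i = 0; i < MAX_SKB_FRAGS + 2; i++) {
		page = get_a_page(vi, gfp);
		if (!page)
			break;		/* OOM handling elided */
		/* chain the pages through page->private, as in the patch */
		page->private = (unsigned long)first_page;
		first_page = page;
		sg_set_buf(&sg[i], page_address(page) + offset,
			   PAGE_SIZE - offset);
		offset = 0;		/* only the first entry is partial */
	}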

> space around +
> sg + 1 here is same as &sg[i] in fact?

Ok.

> callback -> destructor?

Ok.

I will integrate these comments with Rusty's and resubmit the patch set.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23  1:08     ` Rusty Russell
  2009-11-23 16:07       ` Shirley Ma
@ 2009-11-23 19:01       ` Shirley Ma
  1 sibling, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2009-11-23 19:01 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric Dumazet, Michael S. Tsirkin, Avi Kivity, netdev, kvm, linux-kernel

Hello Rusty,

On Mon, 2009-11-23 at 11:38 +1030, Rusty Russell wrote:
> Overall, the patch looks good.  But it would have been nicer if it
> were
> split into several parts: cleanups, new infrastructure, then the
> actual
> allocation change.

I have split the patch into a set: cleanups, new infrastructure, and the
actual allocation change, with add_recvbuf_big, add_recvbuf_small and
add_recvbuf_mergeable per your suggestion, and likewise recv_big,
recv_small and recv_mergeable on the receive side.

I hope you will agree with it. I am testing the patch set now and will
submit it soon, after finishing all the test cases.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23 16:07       ` Shirley Ma
@ 2009-11-23 22:24         ` Rusty Russell
  2009-11-23 23:27           ` Shirley Ma
  2009-11-24 11:37           ` Michael S. Tsirkin
  0 siblings, 2 replies; 20+ messages in thread
From: Rusty Russell @ 2009-11-23 22:24 UTC (permalink / raw)
  To: Shirley Ma
  Cc: Eric Dumazet, Michael S. Tsirkin, Avi Kivity, netdev, kvm,
	linux-kernel, Hollis Blanchard

On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > +             skb = (struct sk_buff *)buf;
> > This cast is unnecessary, but a comment would be nice:
> 
> Without this cast there is a compile warning. 

Hi Shirley,

   Looks like buf is a void *, so no cast should be necessary.  But I could
be reading the patch wrong.

> > However, I question whether making it 16 byte is the right thing: the
> > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> 
> Because in QEMU it requires 10 bytes header in a separately, so one page
> is used to share between virtio_net_hdr header which is 10 bytes head
> and rest of data. So I put 6 bytes offset here between two buffers. I
> didn't look at the reason why a seperate buf is used for virtio_net_hdr
> in QEMU.

It's a qemu bug.  It insists the header be an element in the scatterlist by
itself.  Unfortunately we have to accommodate it.

However, there's no reason for the merged rxbuf and big packet layout to be
the same: for the big packet case you should layout as follows:

#define BIG_PACKET_PAD (NET_SKB_PAD - sizeof(struct virtio_net_hdr) + NET_IP_ALIGN)
struct big_packet_page {
	struct virtio_net_hdr hdr;
	char pad[BIG_PACKET_PAD];
	/* Actual packet data starts here */
	unsigned char data[PAGE_SIZE - BIG_PACKET_PAD - sizeof(struct virtio_net_hdr)];
};

Then use this type as the template for registering the sg list for the
big packet case.
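
For illustration, the first two sg entries set up from that template
would look roughly like this (a sketch; page is the freshly allocated
page, and the remaining whole pages follow as before):

	struct big_packet_page *bp = page_address(page);

	/* qemu insists the header is an element by itself */
	sg_set_buf(&sg[0], &bp->hdr, sizeof(bp->hdr));
	/* packet data starts after the pad */
	sg_set_buf(&sg[1], bp->data, sizeof(bp->data));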

> > I think you can move the memcpy outside the if, like so:
> > 
> >         memcpy(&hdr, p, hdr_len);
> 
> I didn't move it out, because num_buf = hdr->mhdr.num_buffers;

Yes, that has to stay inside, but the memcpy can move out.  It's just a bit
neater to have more common code.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23 22:24         ` Rusty Russell
@ 2009-11-23 23:27           ` Shirley Ma
  2009-11-24 11:37           ` Michael S. Tsirkin
  1 sibling, 0 replies; 20+ messages in thread
From: Shirley Ma @ 2009-11-23 23:27 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric Dumazet, Michael S. Tsirkin, Avi Kivity, netdev, kvm,
	linux-kernel, Hollis Blanchard

On Tue, 2009-11-24 at 08:54 +1030, Rusty Russell wrote:
> #define BIG_PACKET_PAD (NET_SKB_PAD - sizeof(struct virtio_net_hdr) +
> NET_IP_ALIGN)
> struct big_packet_page {
>         struct virtio_net_hdr hdr;
>         char pad[BIG_PACKET_PAD];
>         /* Actual packet data starts here */
>         unsigned char data[PAGE_SIZE - BIG_PACKET_PAD - sizeof(struct
> virtio_net_hdr)];
> };

The padding was used for the QEMU userspace buffer copy: skb data is
copied from the page for GOOD_COPY_LEN bytes, and the skb data is
reserved with NET_IP_ALIGN.

If we add the padding as above, will the userspace copy start at
NET_SKB_PAD + NET_IP_ALIGN? I am a little confused here.

> Then use this type as the template for registering the sg list for the
> big packet case.
> 
> > > I think you can move the memcpy outside the if, like so:
> > > 
> > >         memcpy(&hdr, p, hdr_len);
> > 
> > I didn't move it out, because num_buf = hdr->mhdr.num_buffers;
> 
> Yes, that has to stay inside, but the memcpy can move out.  It's just
> a bit
> neater to have more common code.

num_buf is assigned right after the memcpy, so we couldn't move the
memcpy out.

Anyway, I separated mergeable buffers and big packets in the new patch
to make it clearer; it will be easy to modify when we drop big-packet
support in the future.

Thanks
Shirley


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-23 22:24         ` Rusty Russell
  2009-11-23 23:27           ` Shirley Ma
@ 2009-11-24 11:37           ` Michael S. Tsirkin
  2009-11-24 14:36             ` Anthony Liguori
  2009-11-25  0:12             ` Rusty Russell
  1 sibling, 2 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-24 11:37 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel,
	Hollis Blanchard

On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > +             skb = (struct sk_buff *)buf;
> > > This cast is unnecessary, but a comment would be nice:
> > 
> > Without this cast there is a compile warning. 
> 
> Hi Shirley,
> 
>    Looks like buf is a void *, so no cast should be necessary.  But I could
> be reading the patch wrong.
> 
> > > However, I question whether making it 16 byte is the right thing: the
> > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> > 
> > Because in QEMU it requires 10 bytes header in a separately, so one page
> > is used to share between virtio_net_hdr header which is 10 bytes head
> > and rest of data. So I put 6 bytes offset here between two buffers. I
> > didn't look at the reason why a seperate buf is used for virtio_net_hdr
> > in QEMU.
> 
> It's a qemu bug.  It insists the header be an element in the scatterlist by
> itself.  Unfortunately we have to accommodate it.

We do?  Let's just fix this?
All we have to do is replace memcpy with proper iovec walk, correct?
Something like the following (untested) patch?  It's probably not too
late to put this in the next qemu release...

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>


diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 2f147e5..06c5148 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -434,26 +434,59 @@ static int iov_fill(struct iovec *iov, int iovcnt, const void *buf, int count)
     return offset;
 }
 
+static int iov_skip(struct iovec *iov, int iovcnt, int count)
+{
+    int offset, i;
+
+    offset = i = 0;
+    while (offset < count && i < iovcnt) {
+        int len = MIN(iov[i].iov_len, count - offset);
+	iov[i].iov_base += len;
+	iov[i].iov_len -= len;
+        offset += len;
+        i++;
+    }
+
+    return offset;
+}
+
+static int iov_copy(struct iovec *to, struct iovec *from, int iovcnt, int count)
+{
+    int offset, i;
+
+    offset = i = 0;
+    while (offset < count && i < iovcnt) {
+        int len = MIN(from[i].iov_len, count - offset);
+	to[i].iov_base = from[i].iov_base;
+	to[i].iov_len = from[i].iov_len;
+        offset += len;
+        i++;
+    }
+
+    return i;
+}
+
 static int receive_header(VirtIONet *n, struct iovec *iov, int iovcnt,
                           const void *buf, size_t size, size_t hdr_len)
 {
-    struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)iov[0].iov_base;
+    struct virtio_net_hdr hdr = {};
     int offset = 0;
 
-    hdr->flags = 0;
-    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+    hdr.flags = 0;
+    hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
 
     if (n->has_vnet_hdr) {
-        memcpy(hdr, buf, sizeof(*hdr));
-        offset = sizeof(*hdr);
-        work_around_broken_dhclient(hdr, buf + offset, size - offset);
+        memcpy(&hdr, buf, sizeof hdr);
+        offset = sizeof hdr;
+        work_around_broken_dhclient(&hdr, buf + offset, size - offset);
     }
 
+    iov_fill(iov, iovcnt, &hdr, sizeof hdr);
+
     /* We only ever receive a struct virtio_net_hdr from the tapfd,
      * but we may be passing along a larger header to the guest.
      */
-    iov[0].iov_base += hdr_len;
-    iov[0].iov_len  -= hdr_len;
+    iov_skip(iov, iovcnt, hdr_len);
 
     return offset;
 }
@@ -514,7 +547,8 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
 static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_t size)
 {
     VirtIONet *n = vc->opaque;
-    struct virtio_net_hdr_mrg_rxbuf *mhdr = NULL;
+    struct iovec mhdr[VIRTQUEUE_MAX_SIZE];
+    int mhdrcnt = 0;
     size_t hdr_len, offset, i;
 
     if (!virtio_net_can_receive(n->vc))
@@ -552,16 +586,13 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_
             exit(1);
         }
 
-        if (!n->mergeable_rx_bufs && elem.in_sg[0].iov_len != hdr_len) {
-            fprintf(stderr, "virtio-net header not in first element\n");
-            exit(1);
-        }
-
         memcpy(&sg, &elem.in_sg[0], sizeof(sg[0]) * elem.in_num);
 
         if (i == 0) {
-            if (n->mergeable_rx_bufs)
-                mhdr = (struct virtio_net_hdr_mrg_rxbuf *)sg[0].iov_base;
+            if (n->mergeable_rx_bufs) {
+                mhdrcnt = iov_copy(mhdr, sg, elem.in_num,
+                                   sizeof(struct virtio_net_hdr_mrg_rxbuf));
+            }
 
             offset += receive_header(n, sg, elem.in_num,
                                      buf + offset, size - offset, hdr_len);
@@ -579,8 +610,12 @@ static ssize_t virtio_net_receive(VLANClientState *vc, const uint8_t *buf, size_
         offset += len;
     }
 
-    if (mhdr)
-        mhdr->num_buffers = i;
+    if (mhdrcnt) {
+        uint16_t num = i;
+        iov_skip(mhdr, mhdrcnt,
+                 offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers));
+        iov_fill(mhdr, mhdrcnt, &num, sizeof num);
+    }
 
     virtqueue_flush(n->rx_vq, i);
     virtio_notify(&n->vdev, n->rx_vq);
@@ -627,20 +662,19 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
             sizeof(struct virtio_net_hdr);
 
-        if (out_num < 1 || out_sg->iov_len != hdr_len) {
-            fprintf(stderr, "virtio-net header not in first element\n");
+        if (out_num < 1) {
+            fprintf(stderr, "virtio-net: no output element\n");
             exit(1);
         }
 
         /* ignore the header if GSO is not supported */
         if (!n->has_vnet_hdr) {
-            out_num--;
-            out_sg++;
+            iov_skip(out_sg, out_num, hdr_len);
             len += hdr_len;
         } else if (n->mergeable_rx_bufs) {
             /* tapfd expects a struct virtio_net_hdr */
             hdr_len -= sizeof(struct virtio_net_hdr);
-            out_sg->iov_len -= hdr_len;
+            iov_skip(out_sg, out_num, hdr_len);
             len += hdr_len;
         }
 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-24 11:37           ` Michael S. Tsirkin
@ 2009-11-24 14:36             ` Anthony Liguori
  2009-11-24 16:04               ` Michael S. Tsirkin
                                 ` (2 more replies)
  2009-11-25  0:12             ` Rusty Russell
  1 sibling, 3 replies; 20+ messages in thread
From: Anthony Liguori @ 2009-11-24 14:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm,
	linux-kernel, Hollis Blanchard

Michael S. Tsirkin wrote:
> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>   
>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>     
>>>>> +             skb = (struct sk_buff *)buf;
>>>>>           
>>>> This cast is unnecessary, but a comment would be nice:
>>>>         
>>> Without this cast there is a compile warning. 
>>>       
>> Hi Shirley,
>>
>>    Looks like buf is a void *, so no cast should be necessary.  But I could
>> be reading the patch wrong.
>>
>>     
>>>> However, I question whether making it 16 byte is the right thing: the
>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>         
>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>> in QEMU.
>>>       
>> It's a qemu bug.  It insists the header be an element in the scatterlist by
>> itself.  Unfortunately we have to accommodate it.
>>     
>
> We do?  Let's just fix this?
>   

So does lguest.  It's been that way since the beginning.  Fixing this 
would result in breaking older guests.

We really need to introduce a feature bit if we want to change this.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-24 14:36             ` Anthony Liguori
@ 2009-11-24 16:04               ` Michael S. Tsirkin
  2009-11-24 16:04               ` Michael S. Tsirkin
  2009-11-25  0:15               ` Rusty Russell
  2 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-24 16:04 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Rusty Russell, Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm,
	linux-kernel, Hollis Blanchard

On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>   
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>     
>>>>>> +             skb = (struct sk_buff *)buf;
>>>>>>           
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>         
>>>> Without this cast there is a compile warning.       
>>> Hi Shirley,
>>>
>>>    Looks like buf is a void *, so no cast should be necessary.  But I could
>>> be reading the patch wrong.
>>>
>>>     
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>         
>>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>>> in QEMU.
>>>>       
>>> It's a qemu bug.  It insists the header be an element in the scatterlist by
>>> itself.  Unfortunately we have to accommodate it.
>>>     
>>
>> We do?  Let's just fix this?
>>   
>
> So does lguest.  It's been that way since the beginning.  Fixing this  
> would result in breaking older guests.

The patch you are replying to fixes this in a way that does not break older guests.

> We really need to introduce a feature bit if we want to change this.

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-24 14:36             ` Anthony Liguori
  2009-11-24 16:04               ` Michael S. Tsirkin
@ 2009-11-24 16:04               ` Michael S. Tsirkin
  2009-11-25  0:15               ` Rusty Russell
  2 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-24 16:04 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Rusty Russell, Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm,
	linux-kernel, Hollis Blanchard

On Tue, Nov 24, 2009 at 08:36:32AM -0600, Anthony Liguori wrote:
> Michael S. Tsirkin wrote:
>> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
>>   
>>> On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
>>>     
>>>>>> +             skb = (struct sk_buff *)buf;
>>>>>>           
>>>>> This cast is unnecessary, but a comment would be nice:
>>>>>         
>>>> Without this cast there is a compile warning.       
>>> Hi Shirley,
>>>
>>>    Looks like buf is a void *, so no cast should be necessary.  But I could
>>> be reading the patch wrong.
>>>
>>>     
>>>>> However, I question whether making it 16 byte is the right thing: the
>>>>> ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
>>>>>         
>>>> Because in QEMU it requires 10 bytes header in a separately, so one page
>>>> is used to share between virtio_net_hdr header which is 10 bytes head
>>>> and rest of data. So I put 6 bytes offset here between two buffers. I
>>>> didn't look at the reason why a seperate buf is used for virtio_net_hdr
>>>> in QEMU.
>>>>       
>>> It's a qemu bug.  It insists the header be an element in the scatterlist by
>>> itself.  Unfortunately we have to accommodate it.
>>>     
>>
>> We do?  Let's just fix this?
>>   
>
> So does lguest.

It does? All I see it doing is writev/readv,
and that passes things to tap, which handles
this correctly.


>  It's been that way since the beginning.  Fixing this  
> would result in breaking older guests.

If you look at my patch, it handles old guests just fine :).

> We really need to introduce a feature bit if we want to change this.

I am not sure I agree: we can't add feature bits
for all bugs, can we?

-- 
MST

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-24 11:37           ` Michael S. Tsirkin
  2009-11-24 14:36             ` Anthony Liguori
@ 2009-11-25  0:12             ` Rusty Russell
  2009-11-25  9:15               ` Michael S. Tsirkin
  1 sibling, 1 reply; 20+ messages in thread
From: Rusty Russell @ 2009-11-25  0:12 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel,
	Hollis Blanchard

On Tue, 24 Nov 2009 10:07:54 pm Michael S. Tsirkin wrote:
> On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> > On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > > +             skb = (struct sk_buff *)buf;
> > > > This cast is unnecessary, but a comment would be nice:
> > > 
> > > Without this cast there is a compile warning. 
> > 
> > Hi Shirley,
> > 
> >    Looks like buf is a void *, so no cast should be necessary.  But I could
> > be reading the patch wrong.
> > 
> > > > However, I question whether making it 16 byte is the right thing: the
> > > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> > > 
> > > Because in QEMU it requires 10 bytes header in a separately, so one page
> > > is used to share between virtio_net_hdr header which is 10 bytes head
> > > and rest of data. So I put 6 bytes offset here between two buffers. I
> > > didn't look at the reason why a seperate buf is used for virtio_net_hdr
> > > in QEMU.
> > 
> > It's a qemu bug.  It insists the header be an element in the scatterlist by
> > itself.  Unfortunately we have to accommodate it.
> 
> We do?  Let's just fix this?
> All we have to do is replace memcpy with proper iovec walk, correct?
> Something like the followng (untested) patch?  It's probably not too
> late to put this in the next qemu release...

You might want to implement a more generic helper which does:

	/* Return pointer into iovec if we can, otherwise copy into buf */
	void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
	{
		unsigned int i;
		void *p;

		if (likely(iov_cnt && iov[0].iov_len >= len)) {
			/* Nice contiguous chunk. */
			void *p = iov[0].iov_base;
			iov[i].iov_base += len;
			iov[i].iov_len -= len;
			return p;
		}

		p = buf;
		for (i = 0; i < iov_cnt; i++) {
			size_t this_len = min(len, iov[i].iov_len);
			memcpy(p, iov[i].iov_base, this_len);
			len -= this_len;
			iov[i].iov_base += len;
			iov[i].iov_len -= len;
			if (len == 0)
				return buf;
		}
		/* BTW, we screwed your iovec. */
		return NULL;
	}

Then use it in all the virtio drivers...
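
(One fix to the sketch above: the contiguous-chunk branch indexes iov[i]
before i is set, and the copy loop should advance by this_len and move p
along.  A corrected version, still just a sketch and using MIN as in the
qemu patch above, might be:)

	/* Return pointer into iovec if we can, otherwise copy into buf */
	void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
	{
		char *p;
		int i;

		if (iovcnt && iov[0].iov_len >= len) {
			/* Nice contiguous chunk. */
			p = iov[0].iov_base;
			iov[0].iov_base = p + len;
			iov[0].iov_len -= len;
			return p;
		}

		p = buf;
		for (i = 0; i < iovcnt && len; i++) {
			size_t this_len = MIN(len, iov[i].iov_len);

			memcpy(p, iov[i].iov_base, this_len);
			iov[i].iov_base = (char *)iov[i].iov_base + this_len;
			iov[i].iov_len -= this_len;
			p += this_len;
			len -= this_len;
		}
		/* Short iovec: caller gets NULL (and a consumed iovec). */
		return len ? NULL : buf;
	}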

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-24 14:36             ` Anthony Liguori
  2009-11-24 16:04               ` Michael S. Tsirkin
  2009-11-24 16:04               ` Michael S. Tsirkin
@ 2009-11-25  0:15               ` Rusty Russell
  2 siblings, 0 replies; 20+ messages in thread
From: Rusty Russell @ 2009-11-25  0:15 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, Shirley Ma, Eric Dumazet, Avi Kivity, netdev,
	kvm, linux-kernel, Hollis Blanchard

On Wed, 25 Nov 2009 01:06:32 am Anthony Liguori wrote:
> So does lguest.  It's been that way since the beginning.  Fixing this 
> would result in breaking older guests.

Agreed, we can't "fix" it in the guests, but it's a wart.  That's why I
haven't bothered fixing it, but if someone else wants to I'll cheer all the
way.

lguest did it because I knew I could fix lguest any time; it was a bad mistake
and I will now fix lguest :)

> We really need to introduce a feature bit if we want to change this.

I don't think it's worth it.  But the spec does say that the implementation
should not rely on the framing (I think there's a note that current
implementations are buggy though, so you should frame it separately anyway).

That way future devices can get it right, at least.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-25  0:12             ` Rusty Russell
@ 2009-11-25  9:15               ` Michael S. Tsirkin
  2009-11-25 10:20                 ` Rusty Russell
  0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-25  9:15 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel,
	Hollis Blanchard

On Wed, Nov 25, 2009 at 10:42:06AM +1030, Rusty Russell wrote:
> On Tue, 24 Nov 2009 10:07:54 pm Michael S. Tsirkin wrote:
> > On Tue, Nov 24, 2009 at 08:54:23AM +1030, Rusty Russell wrote:
> > > On Tue, 24 Nov 2009 02:37:01 am Shirley Ma wrote:
> > > > > > +             skb = (struct sk_buff *)buf;
> > > > > This cast is unnecessary, but a comment would be nice:
> > > > 
> > > > Without this cast there is a compile warning. 
> > > 
> > > Hi Shirley,
> > > 
> > >    Looks like buf is a void *, so no cast should be necessary.  But I could
> > > be reading the patch wrong.
> > > 
> > > > > However, I question whether making it 16 byte is the right thing: the
> > > > > ethernet header is 14 bytes long, so don't we want 8 bytes of padding?
> > > > 
> > > > Because in QEMU it requires 10 bytes header in a separately, so one page
> > > > is used to share between virtio_net_hdr header which is 10 bytes head
> > > > and rest of data. So I put 6 bytes offset here between two buffers. I
> > > > didn't look at the reason why a seperate buf is used for virtio_net_hdr
> > > > in QEMU.
> > > 
> > > It's a qemu bug.  It insists the header be an element in the scatterlist by
> > > itself.  Unfortunately we have to accommodate it.
> > 
> > We do?  Let's just fix this?
> > All we have to do is replace memcpy with proper iovec walk, correct?
> > Something like the followng (untested) patch?  It's probably not too
> > late to put this in the next qemu release...
> 
> You might want to implement a more generic helper which does:
> 
> 	/* Return pointer into iovec if we can, otherwise copy into buf */
> 	void *pull_iovec(struct iovec *iov, int iovcnt, void *buf, size_t len)
> 	{
> 		unsigned int i;
> 		void *p;
> 
> 		if (likely(iov_cnt && iov[0].iov_len >= len)) {
> 			/* Nice contiguous chunk. */
> 			void *p = iov[0].iov_base;
> 			iov[i].iov_base += len;
> 			iov[i].iov_len -= len;
> 			return p;
> 		}
> 
> 		p = buf;
> 		for (i = 0; i < iov_cnt; i++) {
> 			size_t this_len = min(len, iov[i].iov_len);
> 			memcpy(p, iov[i].iov_base, this_len);
> 			len -= this_len;
> 			iov[i].iov_base += len;
> 			iov[i].iov_len -= len;
> 			if (len == 0)
> 				return buf;
> 		}
> 		/* BTW, we screwed your iovec. */
> 		return NULL;
> 	}
> 
> Then use it in all the virtio drivers...

Hmm, is it really worth it to save a header copy if it's linear?  We are
going to access it anyway, and it fits into one cacheline nicely.  On
the other hand we have more code, making life harder for the compiler
and the processor.

> Thanks!
> Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-25  9:15               ` Michael S. Tsirkin
@ 2009-11-25 10:20                 ` Rusty Russell
  2009-11-25 11:40                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 20+ messages in thread
From: Rusty Russell @ 2009-11-25 10:20 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel,
	Hollis Blanchard

On Wed, 25 Nov 2009 07:45:30 pm Michael S. Tsirkin wrote:
> Hmm, is it really worth it to save a header copy if it's linear?  We are
> going to access it anyway, and it fits into one cacheline nicely.  On
> the other hand we have more code making life harder for compiler and
> processor.

Not sure: I think there would be many places where it would be useful.

We do a similar thing in the kernel to inspect non-linear packets, and
it's served us well.

Cheers,
Rusty.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net
  2009-11-25 10:20                 ` Rusty Russell
@ 2009-11-25 11:40                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2009-11-25 11:40 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Shirley Ma, Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel,
	Hollis Blanchard

On Wed, Nov 25, 2009 at 08:50:21PM +1030, Rusty Russell wrote:
> On Wed, 25 Nov 2009 07:45:30 pm Michael S. Tsirkin wrote:
> > Hmm, is it really worth it to save a header copy if it's linear?  We are
> > going to access it anyway, and it fits into one cacheline nicely.  On
> > the other hand we have more code making life harder for compiler and
> > processor.
> 
> Not sure: I think there would be many places where it would be useful.
> 
> We do a similar thing in the kernel to inspect non-linear packets, and
> it's served us well.

You mean this gives a measurable speedup? Okay ...

> Cheers,
> Rusty.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2009-11-25 11:43 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-20  6:15 [PATCH 1/1] Defer skb allocation for both mergeable buffers and big packets in virtio_net Shirley Ma
2009-11-20  6:19 ` Eric Dumazet
2009-11-20 16:08   ` Shirley Ma
2009-11-20 16:21   ` Shirley Ma
2009-11-23  1:08     ` Rusty Russell
2009-11-23 16:07       ` Shirley Ma
2009-11-23 22:24         ` Rusty Russell
2009-11-23 23:27           ` Shirley Ma
2009-11-24 11:37           ` Michael S. Tsirkin
2009-11-24 14:36             ` Anthony Liguori
2009-11-24 16:04               ` Michael S. Tsirkin
2009-11-24 16:04               ` Michael S. Tsirkin
2009-11-25  0:15               ` Rusty Russell
2009-11-25  0:12             ` Rusty Russell
2009-11-25  9:15               ` Michael S. Tsirkin
2009-11-25 10:20                 ` Rusty Russell
2009-11-25 11:40                   ` Michael S. Tsirkin
2009-11-23 19:01       ` Shirley Ma
2009-11-23  9:43     ` Michael S. Tsirkin
2009-11-23 16:18       ` Shirley Ma
