linux-kernel.vger.kernel.org archive mirror
* [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors
@ 2012-08-28 13:04 Sasha Levin
  2012-08-28 13:04 ` [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible Sasha Levin
  2012-08-28 13:20 ` [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Michael S. Tsirkin
  0 siblings, 2 replies; 29+ messages in thread
From: Sasha Levin @ 2012-08-28 13:04 UTC (permalink / raw)
  To: rusty, mst; +Cc: virtualization, linux-kernel, avi, kvm, Sasha Levin

Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will use indirect
descriptors even if we have plenty of space in the ring. This means that
we take a performance hit at all times due to the overhead of creating
indirect descriptors.

Instead, switch to indirect descriptors only once the number of free
descriptors in the ring drops below a configurable threshold.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 drivers/block/virtio_blk.c          |  4 ++++
 drivers/char/hw_random/virtio-rng.c |  4 ++++
 drivers/char/virtio_console.c       |  4 ++++
 drivers/net/virtio_net.c            |  4 ++++
 drivers/virtio/virtio_balloon.c     |  4 ++++
 drivers/virtio/virtio_ring.c        | 21 +++++++++++++++------
 include/linux/virtio.h              |  1 +
 net/9p/trans_virtio.c               |  4 ++++
 8 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2edfb5c..13b8ae9 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -22,6 +22,9 @@ static DEFINE_IDA(vd_index_ida);
 
 struct workqueue_struct *virtblk_wq;
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 struct virtio_blk
 {
 	struct virtio_device *vdev;
@@ -735,6 +738,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vq(vblk);
 	if (err)
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 5708299..02d8421 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -25,6 +25,9 @@
 #include <linux/virtio_rng.h>
 #include <linux/module.h>
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 static struct virtqueue *vq;
 static unsigned int data_avail;
 static DECLARE_COMPLETION(have_data);
@@ -93,6 +96,7 @@ static int probe_common(struct virtio_device *vdev)
 	int err;
 
 	/* We expect a single virtqueue. */
+	vdev->indirect_thresh = indirect_thresh;
 	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
 	if (IS_ERR(vq))
 		return PTR_ERR(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index e88f843..fc14e7f 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -39,6 +39,9 @@
 #include <linux/module.h>
 #include "../tty/hvc/hvc_console.h"
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /*
  * This is a global struct for storing common data for all the devices
  * this driver handles.
@@ -1887,6 +1890,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 				       max_nr_ports),
 			      &portdev->config.max_nr_ports) == 0)
 		multiport = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(portdev);
 	if (err < 0) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index cbf8b06..64a8321 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -34,6 +34,9 @@ static bool csum = true, gso = true;
 module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /* FIXME: MTU in config. */
 #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -1128,6 +1131,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(vi);
 	if (err)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0908e60..fce7347 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -35,6 +35,9 @@
  */
 #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 struct virtio_balloon
 {
 	struct virtio_device *vdev;
@@ -356,6 +359,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	init_waitqueue_head(&vb->acked);
 	vb->vdev = vdev;
 	vb->need_stats_update = 0;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(vb);
 	if (err)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5aa43c3..99a64a7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -87,8 +87,11 @@ struct vring_virtqueue
 	/* Other side has made a mess, don't try any more. */
 	bool broken;
 
-	/* Host supports indirect buffers */
-	bool indirect;
+	/*
+	 * Minimum number of free descriptors the ring must have for us
+	 * to keep using direct descriptors; below this, go indirect
+	 */
+	unsigned int indirect_thresh;
 
 	/* Host publishes avail event idx */
 	bool event;
@@ -216,9 +219,12 @@ int virtqueue_add_buf(struct virtqueue *_vq,
 	}
 #endif
 
-	/* If the host supports indirect descriptor tables, and we have multiple
-	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && (out + in) > 1 && vq->num_free) {
+	/*
+	 * If the host supports indirect descriptor tables, we have multiple
+	 * buffers, and free ring space is below the threshold, go indirect.
+	 */
+	if ((out + in) > 1 && vq->num_free &&
+		(vq->num_free < vq->indirect_thresh)) {
 		head = vring_add_indirect(vq, sg, out, in, gfp);
 		if (likely(head >= 0))
 			goto add_head;
@@ -647,13 +653,16 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->broken = false;
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
+	vq->indirect_thresh = 0;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
 	vq->last_add_time_valid = false;
 #endif
 
-	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
+	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+		vq->indirect_thresh = vdev->indirect_thresh;
+
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
 	/* No callback?  Tell other side not to bother us. */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index a1ba8bb..48bc457 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -69,6 +69,7 @@ struct virtio_device {
 	/* Note that this is a Linux set_bit-style bitmap. */
 	unsigned long features[1];
 	void *priv;
+	unsigned int indirect_thresh;
 };
 
 #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 35b8911..fc93962 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -52,6 +52,9 @@
 
 #define VIRTQUEUE_NUM	128
 
+static unsigned int indirect_thresh;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /* a single mutex to manage channel initialization and attachment */
 static DEFINE_MUTEX(virtio_9p_lock);
 static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
@@ -501,6 +504,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 	chan->vdev = vdev;
 
 	/* We expect one virtqueue, for requests. */
+	vdev->indirect_thresh = indirect_thresh;
 	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
 	if (IS_ERR(chan->vq)) {
 		err = PTR_ERR(chan->vq);
-- 
1.7.12

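In isolation, the check patch 1 introduces boils down to the sketch below
(the wrapper function is illustrative; the field names follow the patch):

	/* Sketch: go indirect only for multi-descriptor requests once
	 * free ring space has dropped below the configured threshold. */
	static bool use_indirect(const struct vring_virtqueue *vq,
				 unsigned int out, unsigned int in)
	{
		return (out + in) > 1 && vq->num_free &&
		       vq->num_free < vq->indirect_thresh;
	}

Since indirect_thresh defaults to 0 (and stays 0 when
VIRTIO_RING_F_INDIRECT_DESC is not negotiated), the comparison is never
true unless a threshold is explicitly configured, e.g. with
"modprobe virtio_net indirect_thresh=16" in the guest.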


* [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:04 [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Sasha Levin
@ 2012-08-28 13:04 ` Sasha Levin
  2012-08-28 13:20   ` Michael S. Tsirkin
  2012-08-29 15:38   ` Michael S. Tsirkin
  2012-08-28 13:20 ` [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Michael S. Tsirkin
  1 sibling, 2 replies; 29+ messages in thread
From: Sasha Levin @ 2012-08-28 13:04 UTC (permalink / raw)
  To: rusty, mst; +Cc: virtualization, linux-kernel, avi, kvm, Sasha Levin

Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
use indirect descriptors and allocate them using a simple
kmalloc().

This patch adds a cache which will allow indirect buffers under
a configurable size to be allocated from that cache instead.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---

Changes in v2:

 - Correctly free indirect buffers.

 drivers/block/virtio_blk.c          |  4 ++++
 drivers/char/hw_random/virtio-rng.c |  4 ++++
 drivers/char/virtio_console.c       |  4 ++++
 drivers/net/virtio_net.c            |  4 ++++
 drivers/virtio/virtio_balloon.c     |  4 ++++
 drivers/virtio/virtio_ring.c        | 28 ++++++++++++++++++++++++----
 include/linux/virtio.h              |  1 +
 net/9p/trans_virtio.c               |  5 +++++
 8 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 13b8ae9..7f670af 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -25,6 +25,9 @@ struct workqueue_struct *virtblk_wq;
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 struct virtio_blk
 {
 	struct virtio_device *vdev;
@@ -739,6 +742,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vq(vblk);
 	if (err)
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 02d8421..8475ece 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -28,6 +28,9 @@
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 static struct virtqueue *vq;
 static unsigned int data_avail;
 static DECLARE_COMPLETION(have_data);
@@ -97,6 +100,7 @@ static int probe_common(struct virtio_device *vdev)
 
 	/* We expect a single virtqueue. */
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
 	if (IS_ERR(vq))
 		return PTR_ERR(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index fc14e7f..c6f24a7 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -42,6 +42,9 @@
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /*
  * This is a global struct for storing common data for all the devices
  * this driver handles.
@@ -1891,6 +1894,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 			      &portdev->config.max_nr_ports) == 0)
 		multiport = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(portdev);
 	if (err < 0) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 64a8321..c091efd 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -37,6 +37,9 @@ module_param(gso, bool, 0444);
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /* FIXME: MTU in config. */
 #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -1132,6 +1135,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(vi);
 	if (err)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index fce7347..ccf87db 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -38,6 +38,9 @@
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 struct virtio_balloon
 {
 	struct virtio_device *vdev;
@@ -360,6 +363,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->vdev = vdev;
 	vb->need_stats_update = 0;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(vb);
 	if (err)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 99a64a7..e8b9c54 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -93,6 +93,10 @@ struct vring_virtqueue
 	 */
 	unsigned int indirect_thresh;
 
+	/* Indirect tables of up to this many descriptors come from the cache */
+	unsigned int indirect_alloc_thresh;
+	struct kmem_cache *indirect_cache;
+
 	/* Host publishes avail event idx */
 	bool event;
 
@@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	unsigned head;
 	int i;
 
-	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+	if ((out + in) <= vq->indirect_alloc_thresh)
+		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
+	else
+		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
 	if (!desc)
 		return -ENOMEM;
 
@@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
 	i = head;
 
 	/* Free the indirect table */
-	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
-		kfree(phys_to_virt(vq->vring.desc[i].addr));
+	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
+		if (vq->vring.desc[i].len > vq->indirect_alloc_thresh)
+			kfree(phys_to_virt(vq->vring.desc[i].addr));
+		else
+			kmem_cache_free(vq->indirect_cache,
+					phys_to_virt(vq->vring.desc[i].addr));
+	}
 
 	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
 		i = vq->vring.desc[i].next;
@@ -654,14 +666,20 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
 	vq->indirect_thresh = 0;
+	vq->indirect_alloc_thresh = 0;
+	vq->indirect_cache = NULL;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
 	vq->last_add_time_valid = false;
 #endif
 
-	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
 		vq->indirect_thresh = vdev->indirect_thresh;
+		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
+		if (vq->indirect_alloc_thresh)
+			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh], 0);
+	}
 
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
@@ -685,6 +703,8 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 void vring_del_virtqueue(struct virtqueue *vq)
 {
 	list_del(&vq->list);
+	if (to_vvq(vq)->indirect_cache)
+		kmem_cache_destroy(to_vvq(vq)->indirect_cache);
 	kfree(to_vvq(vq));
 }
 EXPORT_SYMBOL_GPL(vring_del_virtqueue);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 48bc457..3261c02 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -70,6 +70,7 @@ struct virtio_device {
 	unsigned long features[1];
 	void *priv;
 	unsigned int indirect_thresh;
+	unsigned int indirect_alloc_thresh;
 };
 
 #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index fc93962..3044c86 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -55,6 +55,9 @@
 static unsigned int indirect_thresh;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /* a single mutex to manage channel initialization and attachment */
 static DEFINE_MUTEX(virtio_9p_lock);
 static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
@@ -505,6 +508,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 
 	/* We expect one virtqueue, for requests. */
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
+
 	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
 	if (IS_ERR(chan->vq)) {
 		err = PTR_ERR(chan->vq);
-- 
1.7.12

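For readers less familiar with the slab API this patch builds on, a
minimal sketch of the cache lifecycle involved (the cache name and the
size of 16 descriptors are illustrative only):

	struct kmem_cache *cache;
	struct vring_desc *desc;

	/* One cache of fixed-size objects, each big enough for a
	 * 16-entry indirect table. */
	cache = kmem_cache_create("vring_indirect_sketch",
				  16 * sizeof(struct vring_desc), 0, 0, NULL);
	if (!cache)
		return -ENOMEM;
	desc = kmem_cache_alloc(cache, GFP_ATOMIC);	/* fast freelist pop */
	if (desc)
		kmem_cache_free(cache, desc);	/* back to the cache, not the page allocator */
	kmem_cache_destroy(cache);

The point versus kmalloc() is that every object in the cache has the
same size, so allocation skips the general-purpose size-class lookup.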


* Re: [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors
  2012-08-28 13:04 [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Sasha Levin
  2012-08-28 13:04 ` [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible Sasha Levin
@ 2012-08-28 13:20 ` Michael S. Tsirkin
  1 sibling, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-28 13:20 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Tue, Aug 28, 2012 at 03:04:02PM +0200, Sasha Levin wrote:
> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will use indirect
> descriptors even if we have plenty of space in the ring. This means that
> we take a performance hit at all times due to the overhead of creating
> indirect descriptors.
> 
> Instead, switch to indirect descriptors only once the number of free
> descriptors in the ring drops below a configurable threshold.
> 
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>

I imagine this helps performance? Any numbers?

> [...]


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:04 ` [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible Sasha Levin
@ 2012-08-28 13:20   ` Michael S. Tsirkin
  2012-08-28 13:35     ` Sasha Levin
  2012-08-29 15:38   ` Michael S. Tsirkin
  1 sibling, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-28 13:20 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
> use indirect descriptors and allocate them using a simple
> kmalloc().
> 
> This patch adds a cache which will allow indirect buffers under
> a configurable size to be allocated from that cache instead.
> 
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>

I imagine this helps performance? Any numbers?

> [...]


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:20   ` Michael S. Tsirkin
@ 2012-08-28 13:35     ` Sasha Levin
  2012-08-29 11:07       ` Michael S. Tsirkin
  2012-09-06  0:02       ` Rusty Russell
  0 siblings, 2 replies; 29+ messages in thread
From: Sasha Levin @ 2012-08-28 13:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/28/2012 03:20 PM, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
>> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
>> use indirect descriptors and allocate them using a simple
>> kmalloc().
>>
>> This patch adds a cache which will allow indirect buffers under
>> a configurable size to be allocated from that cache instead.
>>
>> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> 
> I imagine this helps performance? Any numbers?

I ran benchmarks on the original RFC; I've re-tested it now and got similar
numbers to the original ones (virtio-net using vhost-net, thresh=16):

Before:
	Recv   Send    Send
	Socket Socket  Message  Elapsed
	Size   Size    Size     Time     Throughput
	bytes  bytes   bytes    secs.    10^6bits/sec

	 87380  16384  16384    10.00    4512.12

After:
	Recv   Send    Send
	Socket Socket  Message  Elapsed
	Size   Size    Size     Time     Throughput
	bytes  bytes   bytes    secs.    10^6bits/sec

	 87380  16384  16384    10.00    5399.18


Thanks,
Sasha


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:35     ` Sasha Levin
@ 2012-08-29 11:07       ` Michael S. Tsirkin
  2012-08-29 15:03         ` Sasha Levin
  2012-09-06  0:02       ` Rusty Russell
  1 sibling, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 11:07 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Tue, Aug 28, 2012 at 03:35:00PM +0200, Sasha Levin wrote:
> On 08/28/2012 03:20 PM, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
> >> [...]
> > 
> > I imagine this helps performance? Any numbers?
> 
> > I ran benchmarks on the original RFC; I've re-tested it now and got similar
> numbers to the original ones (virtio-net using vhost-net, thresh=16):
> 
> Before:
> 	Recv   Send    Send
> 	Socket Socket  Message  Elapsed
> 	Size   Size    Size     Time     Throughput
> 	bytes  bytes   bytes    secs.    10^6bits/sec
> 
> 	 87380  16384  16384    10.00    4512.12
> 
> After:
> 	Recv   Send    Send
> 	Socket Socket  Message  Elapsed
> 	Size   Size    Size     Time     Throughput
> 	bytes  bytes   bytes    secs.    10^6bits/sec
> 
> 	 87380  16384  16384    10.00    5399.18
> 
> 
> Thanks,
> Sasha

This is with both patches 1 + 2?
Sorry could you please also test what happens if you apply
- just patch 1
- just patch 2

Thanks!


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 11:07       ` Michael S. Tsirkin
@ 2012-08-29 15:03         ` Sasha Levin
  2012-08-29 15:14           ` Michael S. Tsirkin
  2012-08-29 15:38           ` Michael S. Tsirkin
  0 siblings, 2 replies; 29+ messages in thread
From: Sasha Levin @ 2012-08-29 15:03 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/29/2012 01:07 PM, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 03:35:00PM +0200, Sasha Levin wrote:
> >> [...]
> 
> This is with both patches 1 + 2?
> Sorry could you please also test what happens if you apply
> - just patch 1
> - just patch 2
> 
> Thanks!

Sure thing!

I've also re-run it on an IBM server-type host instead of my laptop. Here are
the results:

Vanilla kernel:

MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
() port 0 AF_INET
enable_enobufs failed: getprotobyname
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    7922.72

Patch 1, with threshold=16:

MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
() port 0 AF_INET
enable_enobufs failed: getprotobyname
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    8415.07

Patch 2:

MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
() port 0 AF_INET
enable_enobufs failed: getprotobyname
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    8931.05


Note that these are simple tests with netperf listening on one end and a simple
'netperf -H [host]' within the guest. If there are other tests which may be
interesting please let me know.
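
For reference, the setup described amounts to roughly the following,
with netserver already running on the receiving end (TCP_STREAM and the
10-second duration are netperf defaults, matching the tables above):

	guest$ netperf -H 192.168.33.1 -t TCP_STREAM -l 10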


Thanks,
Sasha



* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 15:03         ` Sasha Levin
@ 2012-08-29 15:14           ` Michael S. Tsirkin
  2012-08-30 10:34             ` Sasha Levin
  2012-08-29 15:38           ` Michael S. Tsirkin
  1 sibling, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 15:14 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
> On 08/29/2012 01:07 PM, Michael S. Tsirkin wrote:
> > [...]
> > 
> > This is with both patches 1 + 2?
> > Sorry could you please also test what happens if you apply
> > - just patch 1
> > - just patch 2
> > 
> > Thanks!
> 
> Sure thing!
> 
> I've also re-run it on an IBM server-type host instead of my laptop. Here are
> the results:
> 
> Vanilla kernel:
> 
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> () port 0 AF_INET
> enable_enobufs failed: getprotobyname
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
> 
>  87380  16384  16384    10.00    7922.72
> 
> Patch 1, with threshold=16:

OK so let us set it to 16 for virtio-net by default then?
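
In virtio_net.c that would presumably just be the initializer, with the
module parameter kept as an override (sketch of the suggestion, not an
actual patch):

	static unsigned int indirect_thresh = 16;
	module_param(indirect_thresh, uint, S_IRUGO);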


> [...]
> 
> Note that these are simple tests with netperf listening on one end and a simple
> 'netperf -H [host]' within the guest. If there are other tests which may be
> interesting please let me know.
> 
> 
> Thanks,
> Sasha

Checking that host CPU utilization did not jump would be nice.
E.g. measure BW/host CPU.
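
netperf can report that directly; for instance (illustrative
invocation), -c and -C add local and remote CPU utilization plus
service-demand columns alongside the throughput:

	guest$ netperf -H 192.168.33.1 -t TCP_STREAM -l 10 -c -C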

-- 
MST


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:04 ` [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible Sasha Levin
  2012-08-28 13:20   ` Michael S. Tsirkin
@ 2012-08-29 15:38   ` Michael S. Tsirkin
  2012-08-29 17:14     ` Sasha Levin
  1 sibling, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 15:38 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
> use indirect descriptors and allocate them using a simple
> kmalloc().
> 
> This patch adds a cache which will allow indirect buffers under
> a configurable size to be allocated from that cache instead.
> 
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>

The API is ugly - how does the driver know what to set?
Is this a typical request size? Then let's call it that.

Also this is really per VQ, right?

What is a good default for net? I guess max sg?

> [...]
> @@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
>  	unsigned head;
>  	int i;
>  
> -	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
> +	if ((out + in) <= vq->indirect_alloc_thresh)
> +		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
> +	else
> +		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
>  	if (!desc)
>  		return -ENOMEM;

So this means that drivers need to know how large each
descriptor is to avoid trying to allocate 32Mbyte chunks :)

>  
> @@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>  	i = head;
>  
>  	/* Free the indirect table */
> -	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> -		kfree(phys_to_virt(vq->vring.desc[i].addr));
> +	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
> +		if (vq->vring.desc[i].len > vq->indirect_alloc_thresh)

This looks wrong. indirect_alloc_thresh is in entries but len is in
bytes, isn't it?
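
A sketch of a consistent, entries-based check on the free path
(illustrative, not the posted patch; note sizeof(struct vring_desc) is
16, so comparing a byte length against an entry count is off by 16x):

	/* desc[i].len is in bytes; convert to an entry count before
	 * comparing against the threshold used on the alloc path. */
	unsigned int entries = vq->vring.desc[i].len /
			       sizeof(struct vring_desc);

	if (entries <= vq->indirect_alloc_thresh)
		kmem_cache_free(vq->indirect_cache,
				phys_to_virt(vq->vring.desc[i].addr));
	else
		kfree(phys_to_virt(vq->vring.desc[i].addr));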


> +			kfree(phys_to_virt(vq->vring.desc[i].addr));
> +		else
> +			kmem_cache_free(vq->indirect_cache,
> +					phys_to_virt(vq->vring.desc[i].addr));
> +	}
> [...]
> -	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
> +	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
>  		vq->indirect_thresh = vdev->indirect_thresh;
> +		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
> +		if (vq->indirect_alloc_thresh)
> +			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh],
> 0);

I am not a purist but this line looks way too long.
Also - no need to check cache creation succeeded?
On failure - disable caching?

Also - let's check that values are sane before passing them on?
They come from user after all.

Also - should not threshold be per VQ? E.g. for -net we do not
need the cache for RX unless in legacy big packet mode.
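
One shape the creation path could take along those lines (sketch only;
the clamp and the fallback policy are illustrative, not part of the
posted patch):

	/* Clamp the user-supplied value, and fall back to plain
	 * kmalloc() for everything if cache creation fails. */
	if (vq->indirect_alloc_thresh > vq->vring.num)
		vq->indirect_alloc_thresh = vq->vring.num;
	if (vq->indirect_alloc_thresh) {
		vq->indirect_cache = kmem_cache_create("vring_ind",
				vq->indirect_alloc_thresh *
				sizeof(struct vring_desc), 0, 0, NULL);
		if (!vq->indirect_cache)
			vq->indirect_alloc_thresh = 0;
	}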

> [...]


* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 15:03         ` Sasha Levin
  2012-08-29 15:14           ` Michael S. Tsirkin
@ 2012-08-29 15:38           ` Michael S. Tsirkin
  2012-08-29 16:50             ` Sasha Levin
  1 sibling, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 15:38 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
> On 08/29/2012 01:07 PM, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 03:35:00PM +0200, Sasha Levin wrote:
> >> On 08/28/2012 03:20 PM, Michael S. Tsirkin wrote:
> >>> On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
> >>>> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
> >>>> use indirect descriptors and allocate them using a simple
> >>>> kmalloc().
> >>>>
> >>>> This patch adds a cache which will allow indirect buffers under
> >>>> a configurable size to be allocated from that cache instead.
> >>>>
> >>>> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> >>>
> >>> I imagine this helps performance? Any numbers?
> >>
> >> I ran benchmarks on the original RFC, I've re-tested it now and got similar
> >> numbers to the original ones (virtio-net using vhost-net, thresh=16):
> >>
> >> Before:
> >> 	Recv   Send    Send
> >> 	Socket Socket  Message  Elapsed
> >> 	Size   Size    Size     Time     Throughput
> >> 	bytes  bytes   bytes    secs.    10^6bits/sec
> >>
> >> 	 87380  16384  16384    10.00    4512.12
> >>
> >> After:
> >> 	Recv   Send    Send
> >> 	Socket Socket  Message  Elapsed
> >> 	Size   Size    Size     Time     Throughput
> >> 	bytes  bytes   bytes    secs.    10^6bits/sec
> >>
> >> 	 87380  16384  16384    10.00    5399.18
> >>
> >>
> >> Thanks,
> >> Sasha
> > 
> > This is with both patches 1 + 2?
> > Sorry could you please also test what happens if you apply
> > - just patch 1
> > - just patch 2
> > 
> > Thanks!
> 
> Sure thing!
> 
> I've also re-run it on an IBM server-type host instead of my laptop. Here are the
> results:
> 
> Vanilla kernel:
> 
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> () port 0 AF_INET
> enable_enobufs failed: getprotobyname
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
> 
>  87380  16384  16384    10.00    7922.72
> 
> Patch 1, with threshold=16:
> 
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> () port 0 AF_INET
> enable_enobufs failed: getprotobyname
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
> 
>  87380  16384  16384    10.00    8415.07
> 
> Patch 2:
> 
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> () port 0 AF_INET
> enable_enobufs failed: getprotobyname
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
> 
>  87380  16384  16384    10.00    8931.05
> 
> 
> Note that these are simple tests with netperf listening on one end and a simple
> 'netperf -H [host]' within the guest. If there are other tests which may be
> interesting please let me know.
> 
> 
> Thanks,
> Sasha


And which parameter did you use for patch 2?


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 15:38           ` Michael S. Tsirkin
@ 2012-08-29 16:50             ` Sasha Levin
  2012-09-06  1:02               ` Rusty Russell
  0 siblings, 1 reply; 29+ messages in thread
From: Sasha Levin @ 2012-08-29 16:50 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/29/2012 05:38 PM, Michael S. Tsirkin wrote:
> On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
>> On 08/29/2012 01:07 PM, Michael S. Tsirkin wrote:
>>> On Tue, Aug 28, 2012 at 03:35:00PM +0200, Sasha Levin wrote:
>>>> On 08/28/2012 03:20 PM, Michael S. Tsirkin wrote:
>>>>> On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
>>>>>> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
>>>>>> use indirect descriptors and allocate them using a simple
>>>>>> kmalloc().
>>>>>>
>>>>>> This patch adds a cache which will allow indirect buffers under
>>>>>> a configurable size to be allocated from that cache instead.
>>>>>>
>>>>>> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
>>>>>
>>>>> I imagine this helps performance? Any numbers?
>>>>
>>>> I ran benchmarks on the original RFC; I've re-tested it now and got similar
>>>> numbers to the original ones (virtio-net using vhost-net, thresh=16):
>>>>
>>>> Before:
>>>> 	Recv   Send    Send
>>>> 	Socket Socket  Message  Elapsed
>>>> 	Size   Size    Size     Time     Throughput
>>>> 	bytes  bytes   bytes    secs.    10^6bits/sec
>>>>
>>>> 	 87380  16384  16384    10.00    4512.12
>>>>
>>>> After:
>>>> 	Recv   Send    Send
>>>> 	Socket Socket  Message  Elapsed
>>>> 	Size   Size    Size     Time     Throughput
>>>> 	bytes  bytes   bytes    secs.    10^6bits/sec
>>>>
>>>> 	 87380  16384  16384    10.00    5399.18
>>>>
>>>>
>>>> Thanks,
>>>> Sasha
>>>
>>> This is with both patches 1 + 2?
>>> Sorry could you please also test what happens if you apply
>>> - just patch 1
>>> - just patch 2
>>>
>>> Thanks!
>>
>> Sure thing!
>>
>> I've also re-run it on an IBM server-type host instead of my laptop. Here are the
>> results:
>>
>> Vanilla kernel:
>>
>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>> () port 0 AF_INET
>> enable_enobufs failed: getprotobyname
>> Recv   Send    Send
>> Socket Socket  Message  Elapsed
>> Size   Size    Size     Time     Throughput
>> bytes  bytes   bytes    secs.    10^6bits/sec
>>
>>  87380  16384  16384    10.00    7922.72
>>
>> Patch 1, with threshold=16:
>>
>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>> () port 0 AF_INET
>> enable_enobufs failed: getprotobyname
>> Recv   Send    Send
>> Socket Socket  Message  Elapsed
>> Size   Size    Size     Time     Throughput
>> bytes  bytes   bytes    secs.    10^6bits/sec
>>
>>  87380  16384  16384    10.00    8415.07
>>
>> Patch 2:
>>
>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>> () port 0 AF_INET
>> enable_enobufs failed: getprotobyname
>> Recv   Send    Send
>> Socket Socket  Message  Elapsed
>> Size   Size    Size     Time     Throughput
>> bytes  bytes   bytes    secs.    10^6bits/sec
>>
>>  87380  16384  16384    10.00    8931.05
>>
>>
>> Note that these are simple tests with netperf listening on one end and a simple
>> 'netperf -H [host]' within the guest. If there are other tests which may be
>> interesting please let me know.
>>
>>
>> Thanks,
>> Sasha
> 
> 
> And which parameter did you use for patch 2?
> 

Same as in the first one, 16. The only difference in patch 2 is that we allocate
from a kmem_cache, so there's no point in changing the threshold vs patch 1.
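
For the curious, what the fixed-size cache boils down to is roughly this
(a standalone sketch, not the posted code - names are mine):

/* One cache whose object size covers a full indirect table at the
 * threshold: struct vring_desc is 16 bytes (u64 addr, u32 len, u16
 * flags, u16 next), so thresh = 16 gives 256-byte objects and any
 * table of 16 or fewer descriptors fits in one cached object. */
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/virtio_ring.h>

static struct kmem_cache *indirect_cache;

static int create_indirect_cache(unsigned int thresh)
{
	indirect_cache = kmem_cache_create("vring_indirect",
					   thresh * sizeof(struct vring_desc),
					   0, 0, NULL);
	return indirect_cache ? 0 : -ENOMEM;
}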


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 15:38   ` Michael S. Tsirkin
@ 2012-08-29 17:14     ` Sasha Levin
  2012-08-29 18:12       ` Michael S. Tsirkin
  0 siblings, 1 reply; 29+ messages in thread
From: Sasha Levin @ 2012-08-29 17:14 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/29/2012 05:38 PM, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
>> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
>> use indirect descriptors and allocate them using a simple
>> kmalloc().
>>
>> This patch adds a cache which will allow indirect buffers under
>> a configurable size to be allocated from that cache instead.
>>
>> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> 
> The API is ugly - how does driver know what to set?
> Is this a typical request size? Then let's call it that.

We discussed it during the RFC phase; the idea is that we don't know what
would be a good number to use as the threshold - which is why I'd like to keep
it disabled by default until it gets more serious testing.

The driver doesn't know what to set; the plan was to make it a dynamic
algorithm which would adjust the threshold based on current load. Since I
can't really do the testing which would provide the correct values for that,
the decision was to do it this way as a first stage and refine it later.

> Also this is really per VQ, right?

Right, we keep it per-device at this stage to keep it simple.

> What is a good default for net? I guess max sg?

I think that it depends on the workload. I'd say we should keep the default to 0
(disabled) unless we can have a good way to adjust it to the load.

>> ---
>>
>> Changes in v2:
>>
>>  - Free correctly indirect buffers.
>>
>>  drivers/block/virtio_blk.c          |  4 ++++
>>  drivers/char/hw_random/virtio-rng.c |  4 ++++
>>  drivers/char/virtio_console.c       |  4 ++++
>>  drivers/net/virtio_net.c            |  4 ++++
>>  drivers/virtio/virtio_balloon.c     |  4 ++++
>>  drivers/virtio/virtio_ring.c        | 28 ++++++++++++++++++++++++----
>>  include/linux/virtio.h              |  1 +
>>  net/9p/trans_virtio.c               |  5 +++++
>>  8 files changed, 50 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 13b8ae9..7f670af 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -25,6 +25,9 @@ struct workqueue_struct *virtblk_wq;
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  struct virtio_blk
>>  {
>>  	struct virtio_device *vdev;
>> @@ -739,6 +742,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
>>  	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
>>  	vblk->config_enable = true;
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>>  
>>  	err = init_vq(vblk);
>>  	if (err)
>> diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
>> index 02d8421..8475ece 100644
>> --- a/drivers/char/hw_random/virtio-rng.c
>> +++ b/drivers/char/hw_random/virtio-rng.c
>> @@ -28,6 +28,9 @@
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  static struct virtqueue *vq;
>>  static unsigned int data_avail;
>>  static DECLARE_COMPLETION(have_data);
>> @@ -97,6 +100,7 @@ static int probe_common(struct virtio_device *vdev)
>>  
>>  	/* We expect a single virtqueue. */
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>>  	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
>>  	if (IS_ERR(vq))
>>  		return PTR_ERR(vq);
>> diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
>> index fc14e7f..c6f24a7 100644
>> --- a/drivers/char/virtio_console.c
>> +++ b/drivers/char/virtio_console.c
>> @@ -42,6 +42,9 @@
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  /*
>>   * This is a global struct for storing common data for all the devices
>>   * this driver handles.
>> @@ -1891,6 +1894,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
>>  			      &portdev->config.max_nr_ports) == 0)
>>  		multiport = true;
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>>  
>>  	err = init_vqs(portdev);
>>  	if (err < 0) {
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 64a8321..c091efd 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -37,6 +37,9 @@ module_param(gso, bool, 0444);
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  /* FIXME: MTU in config. */
>>  #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>>  #define GOOD_COPY_LEN	128
>> @@ -1132,6 +1135,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
>>  		vi->mergeable_rx_bufs = true;
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>>  
>>  	err = init_vqs(vi);
>>  	if (err)
>> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
>> index fce7347..ccf87db 100644
>> --- a/drivers/virtio/virtio_balloon.c
>> +++ b/drivers/virtio/virtio_balloon.c
>> @@ -38,6 +38,9 @@
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  struct virtio_balloon
>>  {
>>  	struct virtio_device *vdev;
>> @@ -360,6 +363,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
>>  	vb->vdev = vdev;
>>  	vb->need_stats_update = 0;
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>>  
>>  	err = init_vqs(vb);
>>  	if (err)
>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>> index 99a64a7..e8b9c54 100644
>> --- a/drivers/virtio/virtio_ring.c
>> +++ b/drivers/virtio/virtio_ring.c
>> @@ -93,6 +93,10 @@ struct vring_virtqueue
>>  	 */
>>  	unsigned int indirect_thresh;
>>  
>> +	/* Buffers below this size will be allocated from cache */
>> +	unsigned int indirect_alloc_thresh;
>> +	struct kmem_cache *indirect_cache;
>> +
>>  	/* Host publishes avail event idx */
>>  	bool event;
>>  
>> @@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
>>  	unsigned head;
>>  	int i;
>>  
>> -	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
>> +	if ((out + in) <= vq->indirect_alloc_thresh)
>> +		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
>> +	else
>> +		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
>>  	if (!desc)
>>  		return -ENOMEM;
> 
> So this means that drivers need to know how large each
> descriptor is to avoid trying to allocate 32Mbyte chunks :)

Right, this interface isn't perfect and would hopefully be removed in favour of
a dynamic algorithm in the driver.

>>  
>> @@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>>  	i = head;
>>  
>>  	/* Free the indirect table */
>> -	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
>> -		kfree(phys_to_virt(vq->vring.desc[i].addr));
>> +	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
>> +		if (vq->vring.desc[i].len > vq->indirect_alloc_thresh)
> 
> This looks wrong. indirect_alloc_thresh is in entries but len is in
> bytes, isn't it?

Uh, yes. It's supposed to be '.len/sizeof(struct vring_desc)'.
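
Spelled out, the corrected branch looks like this (a sketch of the fix just
described, not a reposted patch):

	/* Free the indirect table: desc[i].len is in bytes, so convert it
	 * back to a descriptor count before comparing with the threshold. */
	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
		unsigned int nr = vq->vring.desc[i].len / sizeof(struct vring_desc);

		if (nr > vq->indirect_alloc_thresh)
			kfree(phys_to_virt(vq->vring.desc[i].addr));
		else
			kmem_cache_free(vq->indirect_cache,
					phys_to_virt(vq->vring.desc[i].addr));
	}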

>> +			kfree(phys_to_virt(vq->vring.desc[i].addr));
>> +		else
>> +			kmem_cache_free(vq->indirect_cache,
>> +					phys_to_virt(vq->vring.desc[i].addr));
>> +	}
>>  
>>  	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
>>  		i = vq->vring.desc[i].next;
>> @@ -654,14 +666,20 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
>>  	vq->last_used_idx = 0;
>>  	vq->num_added = 0;
>>  	vq->indirect_thresh = 0;
>> +	vq->indirect_alloc_thresh = 0;
>> +	vq->indirect_cache = NULL;
>>  	list_add_tail(&vq->vq.list, &vdev->vqs);
>>  #ifdef DEBUG
>>  	vq->in_use = false;
>>  	vq->last_add_time_valid = false;
>>  #endif
>>  
>> -	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
>> +	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
>>  		vq->indirect_thresh = vdev->indirect_thresh;
>> +		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
>> +		if (vq->indirect_alloc_thresh)
>> +			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh],
>> 0);
> 
> I am not a purist but this line looks way too long.
> Also - no need to check cache creation succeeded?
> On failure - disable caching?
> 
> Also - let's check that values are sane before passing them on?
> They come from user after all.

will fix these two.
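
For illustration, the sanity check could be as simple as the following
(hypothetical helper; the cap is an arbitrary assumption, not a value from
this thread):

/* Each cached object is thresh * sizeof(struct vring_desc) bytes, so an
 * unchecked module parameter could ask for absurdly large objects.
 * Clamp to the ring size and an arbitrary upper bound. */
#define INDIRECT_ALLOC_THRESH_MAX 64	/* assumption, for illustration */

static unsigned int sane_alloc_thresh(unsigned int thresh, unsigned int num)
{
	if (thresh > num)	/* a table never has more entries than the ring */
		thresh = num;
	if (thresh > INDIRECT_ALLOC_THRESH_MAX)
		thresh = INDIRECT_ALLOC_THRESH_MAX;
	return thresh;
}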

> Also - should not threshold be per VQ? E.g. for -net we do not
> need the cache for RX unless in legacy big packet mode.

We discussed it two months ago over IRC (at least that's what I have in my
notes) - the plan was to keep it simple and per-device until something more
advanced for dealing with the threshold shows up.


Thanks,
Sasha

> 
>> +	}
>>  
>>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>>  
>> @@ -685,6 +703,8 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>>  void vring_del_virtqueue(struct virtqueue *vq)
>>  {
>>  	list_del(&vq->list);
>> +	if (to_vvq(vq)->indirect_cache)
>> +		kmem_cache_destroy(to_vvq(vq)->indirect_cache);
>>  	kfree(to_vvq(vq));
>>  }
>>  EXPORT_SYMBOL_GPL(vring_del_virtqueue);
>> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
>> index 48bc457..3261c02 100644
>> --- a/include/linux/virtio.h
>> +++ b/include/linux/virtio.h
>> @@ -70,6 +70,7 @@ struct virtio_device {
>>  	unsigned long features[1];
>>  	void *priv;
>>  	unsigned int indirect_thresh;
>> +	unsigned int indirect_alloc_thresh;
>>  };
>>  
>>  #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
>> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
>> index fc93962..3044c86 100644
>> --- a/net/9p/trans_virtio.c
>> +++ b/net/9p/trans_virtio.c
>> @@ -55,6 +55,9 @@
>>  static unsigned int indirect_thresh;
>>  module_param(indirect_thresh, uint, S_IRUGO);
>>  
>> +static unsigned int indirect_alloc_thresh;
>> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
>> +
>>  /* a single mutex to manage channel initialization and attachment */
>>  static DEFINE_MUTEX(virtio_9p_lock);
>>  static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
>> @@ -505,6 +508,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
>>  
>>  	/* We expect one virtqueue, for requests. */
>>  	vdev->indirect_thresh = indirect_thresh;
>> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>> +
>>  	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
>>  	if (IS_ERR(chan->vq)) {
>>  		err = PTR_ERR(chan->vq);
>> -- 
>> 1.7.12


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 17:14     ` Sasha Levin
@ 2012-08-29 18:12       ` Michael S. Tsirkin
  2012-08-29 20:46         ` Sasha Levin
  0 siblings, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 18:12 UTC (permalink / raw)
  To: Sasha Levin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On Wed, Aug 29, 2012 at 07:14:01PM +0200, Sasha Levin wrote:
> On 08/29/2012 05:38 PM, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
> >> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
> >> use indirect descriptors and allocate them using a simple
> >> kmalloc().
> >>
> >> This patch adds a cache which will allow indirect buffers under
> >> a configurable size to be allocated from that cache instead.
> >>
> >> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> > 
> > The API is ugly - how does driver know what to set?
> > Is this a typical request size? Then let's call it that.
> 
> We discussed it during the RFC phase; the idea is that we don't know what
> would be a good number to use as the threshold - which is why I'd like to keep
> it disabled by default until it gets more serious testing.
> 
> The driver doesn't know what to set; the plan was to make it a dynamic
> algorithm which would adjust the threshold based on current load. Since I
> can't really do the testing which would provide the correct values for that,
> the decision was to do it this way as a first stage and refine it later.
> 
> > Also this is really per VQ, right?
> 
> Right, we keep it per-device at this stage to keep it simple.
> 
> > What is a good default for net? I guess max sg?
> 
> I think that it depends on the workload. I'd say we should keep the default to 0
> (disabled) unless we can have a good way to adjust it to the load.

For *all* drivers?

Then it is mostly useless. No one has the time to tweak module
parameters in real life.

For virtio-net, 16+1 is not too much and ensures we always
use the cache.

If that works better than 0 I would say run with 17.
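
As a sketch, that default is just a different initial value for the module
parameter in virtio_net.c (an illustration of the suggestion, not a posted
patch):

/* Hypothetical default: room for 16 sg entries + 1, per the above. */
static unsigned int indirect_alloc_thresh = 17;
module_param(indirect_alloc_thresh, uint, S_IRUGO);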


> >> ---
> >>
> >> Changes in v2:
> >>
> >>  - Free correctly indirect buffers.
> >>
> >>  drivers/block/virtio_blk.c          |  4 ++++
> >>  drivers/char/hw_random/virtio-rng.c |  4 ++++
> >>  drivers/char/virtio_console.c       |  4 ++++
> >>  drivers/net/virtio_net.c            |  4 ++++
> >>  drivers/virtio/virtio_balloon.c     |  4 ++++
> >>  drivers/virtio/virtio_ring.c        | 28 ++++++++++++++++++++++++----
> >>  include/linux/virtio.h              |  1 +
> >>  net/9p/trans_virtio.c               |  5 +++++
> >>  8 files changed, 50 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> >> index 13b8ae9..7f670af 100644
> >> --- a/drivers/block/virtio_blk.c
> >> +++ b/drivers/block/virtio_blk.c
> >> @@ -25,6 +25,9 @@ struct workqueue_struct *virtblk_wq;
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  struct virtio_blk
> >>  {
> >>  	struct virtio_device *vdev;
> >> @@ -739,6 +742,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
> >>  	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
> >>  	vblk->config_enable = true;
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >>  
> >>  	err = init_vq(vblk);
> >>  	if (err)
> >> diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
> >> index 02d8421..8475ece 100644
> >> --- a/drivers/char/hw_random/virtio-rng.c
> >> +++ b/drivers/char/hw_random/virtio-rng.c
> >> @@ -28,6 +28,9 @@
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  static struct virtqueue *vq;
> >>  static unsigned int data_avail;
> >>  static DECLARE_COMPLETION(have_data);
> >> @@ -97,6 +100,7 @@ static int probe_common(struct virtio_device *vdev)
> >>  
> >>  	/* We expect a single virtqueue. */
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >>  	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
> >>  	if (IS_ERR(vq))
> >>  		return PTR_ERR(vq);
> >> diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> >> index fc14e7f..c6f24a7 100644
> >> --- a/drivers/char/virtio_console.c
> >> +++ b/drivers/char/virtio_console.c
> >> @@ -42,6 +42,9 @@
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  /*
> >>   * This is a global struct for storing common data for all the devices
> >>   * this driver handles.
> >> @@ -1891,6 +1894,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
> >>  			      &portdev->config.max_nr_ports) == 0)
> >>  		multiport = true;
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >>  
> >>  	err = init_vqs(portdev);
> >>  	if (err < 0) {
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index 64a8321..c091efd 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -37,6 +37,9 @@ module_param(gso, bool, 0444);
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  /* FIXME: MTU in config. */
> >>  #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> >>  #define GOOD_COPY_LEN	128
> >> @@ -1132,6 +1135,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
> >>  		vi->mergeable_rx_bufs = true;
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >>  
> >>  	err = init_vqs(vi);
> >>  	if (err)
> >> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> >> index fce7347..ccf87db 100644
> >> --- a/drivers/virtio/virtio_balloon.c
> >> +++ b/drivers/virtio/virtio_balloon.c
> >> @@ -38,6 +38,9 @@
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  struct virtio_balloon
> >>  {
> >>  	struct virtio_device *vdev;
> >> @@ -360,6 +363,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >>  	vb->vdev = vdev;
> >>  	vb->need_stats_update = 0;
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >>  
> >>  	err = init_vqs(vb);
> >>  	if (err)
> >> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >> index 99a64a7..e8b9c54 100644
> >> --- a/drivers/virtio/virtio_ring.c
> >> +++ b/drivers/virtio/virtio_ring.c
> >> @@ -93,6 +93,10 @@ struct vring_virtqueue
> >>  	 */
> >>  	unsigned int indirect_thresh;
> >>  
> >> +	/* Buffers below this size will be allocated from cache */
> >> +	unsigned int indirect_alloc_thresh;
> >> +	struct kmem_cache *indirect_cache;
> >> +
> >>  	/* Host publishes avail event idx */
> >>  	bool event;
> >>  
> >> @@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
> >>  	unsigned head;
> >>  	int i;
> >>  
> >> -	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
> >> +	if ((out + in) <= vq->indirect_alloc_thresh)
> >> +		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
> >> +	else
> >> +		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
> >>  	if (!desc)
> >>  		return -ENOMEM;
> > 
> > So this means that drivers need to know how large each
> > descriptor is to avoid trying to allocate 32Mbyte chunks :)
> 
> Right, this interface isn't perfect and would hopefully be removed in favour of
> a dynamic algorithm in the driver.
> 
> >>  
> >> @@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
> >>  	i = head;
> >>  
> >>  	/* Free the indirect table */
> >> -	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
> >> -		kfree(phys_to_virt(vq->vring.desc[i].addr));
> >> +	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
> >> +		if (vq->vring.desc[i].len > vq->indirect_alloc_thresh)
> > 
> > This looks wrong. indirect_alloc_thresh is in entries but len is in
> > bytes, isn't it?
> 
> Uh, yes. It's supposed to be '.len/sizeof(struct vring_desc)'.
> 
> >> +			kfree(phys_to_virt(vq->vring.desc[i].addr));
> >> +		else
> >> +			kmem_cache_free(vq->indirect_cache,
> >> +					phys_to_virt(vq->vring.desc[i].addr));
> >> +	}
> >>  
> >>  	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
> >>  		i = vq->vring.desc[i].next;
> >> @@ -654,14 +666,20 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
> >>  	vq->last_used_idx = 0;
> >>  	vq->num_added = 0;
> >>  	vq->indirect_thresh = 0;
> >> +	vq->indirect_alloc_thresh = 0;
> >> +	vq->indirect_cache = NULL;
> >>  	list_add_tail(&vq->vq.list, &vdev->vqs);
> >>  #ifdef DEBUG
> >>  	vq->in_use = false;
> >>  	vq->last_add_time_valid = false;
> >>  #endif
> >>  
> >> -	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
> >> +	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
> >>  		vq->indirect_thresh = vdev->indirect_thresh;
> >> +		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
> >> +		if (vq->indirect_alloc_thresh)
> >> +			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh],
> >> 0);
> > 
> > I am not a purist but this line looks way too long.
> > Also - no need to check cache creation succeeded?
> > On failure - disable caching?
> > 
> > Also - let's check that values are sane before passing them on?
> > They come from user after all.
> 
> will fix these two.
> 
> > Also - should not threshold be per VQ? E.g. for -net we do not
> > need the cache for RX unless in legacy big packet mode.
> 
> We discussed it two months ago over IRC (at least that's what I have in my
> notes) - the plan was to keep it simple and per-device until something more
> advanced for dealing with the threshold shows up.
> 
> 
> Thanks,
> Sasha
> 
> > 
> >> +	}
> >>  
> >>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> >>  
> >> @@ -685,6 +703,8 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> >>  void vring_del_virtqueue(struct virtqueue *vq)
> >>  {
> >>  	list_del(&vq->list);
> >> +	if (to_vvq(vq)->indirect_cache)
> >> +		kmem_cache_destroy(to_vvq(vq)->indirect_cache);
> >>  	kfree(to_vvq(vq));
> >>  }
> >>  EXPORT_SYMBOL_GPL(vring_del_virtqueue);
> >> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> >> index 48bc457..3261c02 100644
> >> --- a/include/linux/virtio.h
> >> +++ b/include/linux/virtio.h
> >> @@ -70,6 +70,7 @@ struct virtio_device {
> >>  	unsigned long features[1];
> >>  	void *priv;
> >>  	unsigned int indirect_thresh;
> >> +	unsigned int indirect_alloc_thresh;
> >>  };
> >>  
> >>  #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
> >> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
> >> index fc93962..3044c86 100644
> >> --- a/net/9p/trans_virtio.c
> >> +++ b/net/9p/trans_virtio.c
> >> @@ -55,6 +55,9 @@
> >>  static unsigned int indirect_thresh;
> >>  module_param(indirect_thresh, uint, S_IRUGO);
> >>  
> >> +static unsigned int indirect_alloc_thresh;
> >> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> >> +
> >>  /* a single mutex to manage channel initialization and attachment */
> >>  static DEFINE_MUTEX(virtio_9p_lock);
> >>  static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
> >> @@ -505,6 +508,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
> >>  
> >>  	/* We expect one virtqueue, for requests. */
> >>  	vdev->indirect_thresh = indirect_thresh;
> >> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> >> +
> >>  	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
> >>  	if (IS_ERR(chan->vq)) {
> >>  		err = PTR_ERR(chan->vq);
> >> -- 
> >> 1.7.12

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 18:12       ` Michael S. Tsirkin
@ 2012-08-29 20:46         ` Sasha Levin
  2012-08-29 22:52           ` Michael S. Tsirkin
  0 siblings, 1 reply; 29+ messages in thread
From: Sasha Levin @ 2012-08-29 20:46 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/29/2012 08:12 PM, Michael S. Tsirkin wrote:
>>> > > What is a good default for net? I guess max sg?
>> > 
>> > I think that it depends on the workload. I'd say we should keep the default to 0
>> > (disabled) unless we can have a good way to adjust it to the load.
> For *all* drivers?
> 
> Then it is mostly useless. No one has the time to tweak module
> parameters in real life.
> 
> For virtio-net, 16+1 is not too much and ensures we always
> use the cache.
> 
> If that works better than 0 I would say run with 17.

I was being extra-cautious by leaving it disabled until specifically enabled,
because I assumed that this would be one of the first comments I'd get if it
were enabled by default :)

If you're comfortable with setting it to a sane default like 17, I'm perfectly
fine with that as well.


Thanks,
Sasha


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 20:46         ` Sasha Levin
@ 2012-08-29 22:52           ` Michael S. Tsirkin
  0 siblings, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-08-29 22:52 UTC (permalink / raw)
  To: Sasha Levin; +Cc: avi, linux-kernel, kvm, virtualization

On Wed, Aug 29, 2012 at 10:46:19PM +0200, Sasha Levin wrote:
> On 08/29/2012 08:12 PM, Michael S. Tsirkin wrote:
> >>> > > What is a good default for net? I guess max sg?
> >> > 
> >> > I think that it depends on the workload. I'd say we should keep the default to 0
> >> > (disabled) unless we can have a good way to adjust it to the load.
> > For *all* drivers?
> > 
> > Then it is mostly useless. No one has the time to tweak module
> > parameters in real life.
> > 
> > For virtio-net, 16+1 is not too much and ensures we always
> > use the cache.
> > 
> > If that works better than 0 I would say run with 17.
> 
> I was being extra-cautious by leaving it disabled until specifically enabled,
> because I assumed that this would be one of the first comments I'd get if it
> were enabled by default :)
> 
> If you're comfortable with setting it to a sane default like 17, I'm perfectly
> fine with that as well.
> 
> 
> Thanks,
> Sasha

If our testing shows it helps and does not trigger regressions, then
why not? Module params are mostly there for developers.
They are not all that helpful to users.

> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 15:14           ` Michael S. Tsirkin
@ 2012-08-30 10:34             ` Sasha Levin
  0 siblings, 0 replies; 29+ messages in thread
From: Sasha Levin @ 2012-08-30 10:34 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: rusty, virtualization, linux-kernel, avi, kvm

On 08/29/2012 05:14 PM, Michael S. Tsirkin wrote:
>> > Note that these are simple tests with netperf listening on one end and a simple
>> > 'netperf -H [host]' within the guest. If there are other tests which may be
>> > interesting please let me know.
> Checking that host CPU utilization did not jump would be nice.
> E.g. measure BW/host CPU.

Tested it now, no change in CPU between the original, patch 1 and patch 2.


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-28 13:35     ` Sasha Levin
  2012-08-29 11:07       ` Michael S. Tsirkin
@ 2012-09-06  0:02       ` Rusty Russell
  1 sibling, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2012-09-06  0:02 UTC (permalink / raw)
  To: Sasha Levin, Michael S. Tsirkin; +Cc: virtualization, linux-kernel, avi, kvm

Sasha Levin <levinsasha928@gmail.com> writes:

> On 08/28/2012 03:20 PM, Michael S. Tsirkin wrote:
>> On Tue, Aug 28, 2012 at 03:04:03PM +0200, Sasha Levin wrote:
>>> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
>>> use indirect descriptors and allocate them using a simple
>>> kmalloc().
>>>
>>> This patch adds a cache which will allow indirect buffers under
>>> a configurable size to be allocated from that cache instead.
>>>
>>> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
>> 
>> I imagine this helps performance? Any numbers?
>
> I ran benchmarks on the original RFC; I've re-tested it now and got similar
> numbers to the original ones (virtio-net using vhost-net, thresh=16):
>
> Before:
> 	Recv   Send    Send
> 	Socket Socket  Message  Elapsed
> 	Size   Size    Size     Time     Throughput
> 	bytes  bytes   bytes    secs.    10^6bits/sec
>
> 	 87380  16384  16384    10.00    4512.12
>
> After:
> 	Recv   Send    Send
> 	Socket Socket  Message  Elapsed
> 	Size   Size    Size     Time     Throughput
> 	bytes  bytes   bytes    secs.    10^6bits/sec
>
> 	 87380  16384  16384    10.00    5399.18

I have an older patch which adjusts the threshold dynamically; can you
compare?  User-adjustable thresholds are statistically never adjusted :(

virtio: use indirect buffers based on demand (heuristic)

virtio_ring uses a ring buffer of descriptors: indirect support allows
a single descriptor to refer to a table of descriptors.  This saves
space in the ring, but requires a kmalloc/kfree.

Rather than try to figure out the right threshold at which to use
indirect buffers, we drop the threshold dynamically when the ring is
under stress.

Note: to stress this, I reduced the ring size to 32 in lguest, and a
1G send reduced the threshold to 9.

Note2: I moved the BUG_ON()s above the indirect test, where they belong
(indirect falls thru on OOM, so the constraints still apply).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c |   61 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -89,6 +89,8 @@ struct vring_virtqueue
 
 	/* Host supports indirect buffers */
 	bool indirect;
+	/* Threshold before we go indirect. */
+	unsigned int indirect_threshold;
 
 	/* Host publishes avail event idx */
 	bool event;
@@ -174,6 +176,34 @@ static int vring_add_indirect(struct vri
 	return head;
 }
 
+static void adjust_threshold(struct vring_virtqueue *vq,
+			     unsigned int out, unsigned int in)
+{
+	/* There are really two species of virtqueue, and it matters here.
+	 * If there are no output parts, it's a "normally full" receive queue,
+	 * otherwise it's a "normally empty" send queue. */
+	if (out) {
+		/* Leave threshold unless we're full. */
+		if (out + in < vq->num_free)
+			return;
+	} else {
+		/* Leave threshold unless we're empty. */
+		if (vq->num_free != vq->vring.num)
+			return;
+	}
+
+	/* Never drop threshold below 1 */
+	vq->indirect_threshold /= 2;
+	vq->indirect_threshold |= 1;
+
+#if 0
+	printk("%s %s: indirect threshold %u (%u+%u vs %u)\n",
+	       dev_name(&vq->vq.vdev->dev),
+	       vq->vq.name, vq->indirect_threshold,
+	       out, in, vq->num_free);
+#endif
+}
+
 int virtqueue_get_queue_index(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -226,17 +256,32 @@ int virtqueue_add_buf(struct virtqueue *
 	}
 #endif
 
-	/* If the host supports indirect descriptor tables, and we have multiple
-	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && (out + in) > 1 && vq->num_free) {
-		head = vring_add_indirect(vq, sg, out, in, gfp);
-		if (likely(head >= 0))
-			goto add_head;
-	}
-
 	BUG_ON(out + in > vq->vring.num);
 	BUG_ON(out + in == 0);
 
+
+	/* If the host supports indirect descriptor tables, consider it. */
+	if (vq->indirect) {
+		bool try_indirect;
+
+		/* We tweak the threshold automatically. */
+		adjust_threshold(vq, out, in);
+
+		/* If we can't fit any at all, fall through. */
+		if (vq->num_free == 0)
+			try_indirect = false;
+		else if (out + in > vq->num_free)
+			try_indirect = true;
+		else
+			try_indirect = (out + in > vq->indirect_threshold);
+
+		if (try_indirect) {
+			head = vring_add_indirect(vq, sg, out, in, gfp);
+			if (likely(head >= 0))
+				goto add_head;
+		}
+	}
+
 	if (vq->num_free < out + in) {
 		pr_debug("Can't add buf len %i - avail = %i\n",
 			 out + in, vq->num_free);
@@ -666,6 +711,7 @@ struct virtqueue *vring_new_virtqueue(un
 #endif
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
+	vq->indirect_threshold = num;
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
 	/* No callback?  Tell other side not to bother us. */
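
To see the decay concretely: adjust_threshold() halves the threshold and
forces the low bit, so it never reaches zero, and a ring of 32 walks
32 -> 17 -> 9 -> 5 -> 3 -> 1 under sustained stress - the "reduced the
threshold to 9" note above is two adjustments. A userspace toy that prints
the sequence:

/* Standalone illustration of the decay in adjust_threshold(). */
#include <stdio.h>

int main(void)
{
	unsigned int thresh = 32;	/* ring size from the lguest note */

	while (thresh > 1) {
		thresh /= 2;
		thresh |= 1;
		printf("%u\n", thresh);	/* prints 17, 9, 5, 3, 1 */
	}
	return 0;
}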



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-08-29 16:50             ` Sasha Levin
@ 2012-09-06  1:02               ` Rusty Russell
  2012-09-06  5:02                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 29+ messages in thread
From: Rusty Russell @ 2012-09-06  1:02 UTC (permalink / raw)
  To: Sasha Levin, Michael S. Tsirkin; +Cc: virtualization, linux-kernel, avi, kvm

Sasha Levin <levinsasha928@gmail.com> writes:
>> On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
>>> I've also re-run it on an IBM server-type host instead of my laptop. Here are the
>>> results:
>>>
>>> Vanilla kernel:
>>>
>>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>>> () port 0 AF_INET
>>> enable_enobufs failed: getprotobyname
>>> Recv   Send    Send
>>> Socket Socket  Message  Elapsed
>>> Size   Size    Size     Time     Throughput
>>> bytes  bytes   bytes    secs.    10^6bits/sec
>>>
>>>  87380  16384  16384    10.00    7922.72
>>>
>>> Patch 1, with threshold=16:
>>>
>>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>>> () port 0 AF_INET
>>> enable_enobufs failed: getprotobyname
>>> Recv   Send    Send
>>> Socket Socket  Message  Elapsed
>>> Size   Size    Size     Time     Throughput
>>> bytes  bytes   bytes    secs.    10^6bits/sec
>>>
>>>  87380  16384  16384    10.00    8415.07
>>>
>>> Patch 2:
>>>
>>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
>>> () port 0 AF_INET
>>> enable_enobufs failed: getprotobyname
>>> Recv   Send    Send
>>> Socket Socket  Message  Elapsed
>>> Size   Size    Size     Time     Throughput
>>> bytes  bytes   bytes    secs.    10^6bits/sec
>>>
>>>  87380  16384  16384    10.00    8931.05
>>>
>>>
>>> Note that these are simple tests with netperf listening on one end and a simple
>>> 'netperf -H [host]' within the guest. If there are other tests which may be
>>> interesting please let me know.

It might be worth just unconditionally having a cache for the 2
descriptor case.  This is what I get with qemu tap, though for some
reason the device features don't have guest or host CSUM, so my setup is
probably screwed:

Queue histogram for virtio0:
Size distribution for input (max=128427):
  1: 128427  ################################################################
Size distribution for output (max=256485):
  2: 256485  ################################################################
Size distribution for control (max=10):
  3: 10      ################################################################
  4: 5       ################################

Here's a patch; what do you get? (Run ifconfig to trigger the dump; yeah,
it's a hack!)

Hack: histogram of buffer sizes for virtio devices.

Currently triggered by a stats query (eg ifconfig) on a net device.

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -727,6 +727,8 @@ static struct rtnl_link_stats64 *virtnet
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
 
+	virtio_dev_dump_histogram(vi->vdev);
+
 	return tot;
 }
 
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -108,6 +108,16 @@ void virtio_check_driver_offered_feature
 }
 EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);
 
+void virtio_dev_dump_histogram(const struct virtio_device *vdev)
+{
+	const struct virtqueue *vq;
+
+	printk("Queue histogram for %s:\n", dev_name(&vdev->dev));
+	list_for_each_entry(vq, &vdev->vqs, list)
+		virtqueue_dump_histogram(vq);
+}
+EXPORT_SYMBOL_GPL(virtio_dev_dump_histogram);
+
 static int virtio_dev_probe(struct device *_d)
 {
 	int err, i;
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -120,6 +120,8 @@ struct vring_virtqueue
 	ktime_t last_add_time;
 #endif
 
+	unsigned int *histo;
+
 	/* Tokens for callbacks. */
 	void *data[];
 };
@@ -259,6 +261,8 @@ int virtqueue_add_buf(struct virtqueue *
 	BUG_ON(out + in > vq->vring.num);
 	BUG_ON(out + in == 0);
 
+	vq->histo[out+in]++;
+
 	/* If the host supports indirect descriptor tables, consider it. */
 	if (vq->indirect) {
 		bool try_indirect;
@@ -726,6 +730,7 @@ struct virtqueue *vring_new_virtqueue(un
 	}
 	vq->data[i] = NULL;
 
+	vq->histo = kzalloc((num + 1) * sizeof(vq->histo[0]), GFP_KERNEL);
 	return &vq->vq;
 }
 EXPORT_SYMBOL_GPL(vring_new_virtqueue);
@@ -772,4 +777,33 @@ unsigned int virtqueue_get_vring_size(st
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
 
+void virtqueue_dump_histogram(const struct virtqueue *_vq)
+{
+	const struct vring_virtqueue *vq = to_vvq(_vq);
+	int i, j, start = 0, end = 0, max = 1;
+	char line[120];
+
+	for (i = 0; i <= vq->vring.num; i++) {
+		if (!vq->histo[i])
+			continue;
+
+		end = i;
+		if (!vq->histo[start])
+			start = i;
+
+		if (vq->histo[i] > max)
+			max = vq->histo[i];
+	}
+
+	printk("Size distribution for %s (max=%u):\n", _vq->name, max);
+	for (i = start; i <= end; i++) {
+		unsigned int off;
+		off = sprintf(line, "%3u: %-7u ", i, vq->histo[i]);
+		for (j = 0; j < vq->histo[i] * 64 / max; j++)
+			line[off++] = '#';
+		line[off] = '\0';
+		printk("%s\n", line);
+	}
+}
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -52,6 +52,9 @@ unsigned int virtqueue_get_vring_size(st
 
 int virtqueue_get_queue_index(struct virtqueue *vq);
 
+void virtio_dev_dump_histogram(const struct virtio_device *vdev);
+void virtqueue_dump_histogram(const struct virtqueue *vq);
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06  1:02               ` Rusty Russell
@ 2012-09-06  5:02                 ` Michael S. Tsirkin
  2012-09-06  7:57                   ` Rusty Russell
  2012-09-10 15:52                   ` Paolo Bonzini
  0 siblings, 2 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-09-06  5:02 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm

On Thu, Sep 06, 2012 at 10:32:48AM +0930, Rusty Russell wrote:
> Sasha Levin <levinsasha928@gmail.com> writes:
> >> On Wed, Aug 29, 2012 at 05:03:03PM +0200, Sasha Levin wrote:
> >>> I've also re-run it on an IBM server-type host instead of my laptop. Here are the
> >>> results:
> >>>
> >>> Vanilla kernel:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>>  87380  16384  16384    10.00    7922.72
> >>>
> >>> Patch 1, with threshold=16:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>>  87380  16384  16384    10.00    8415.07
> >>>
> >>> Patch 2:
> >>>
> >>> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.33.1
> >>> () port 0 AF_INET
> >>> enable_enobufs failed: getprotobyname
> >>> Recv   Send    Send
> >>> Socket Socket  Message  Elapsed
> >>> Size   Size    Size     Time     Throughput
> >>> bytes  bytes   bytes    secs.    10^6bits/sec
> >>>
> >>>  87380  16384  16384    10.00    8931.05
> >>>
> >>>
> >>> Note that these are simple tests with netperf listening on one end and a simple
> >>> 'netperf -H [host]' within the guest. If there are other tests which may be
> >>> interesting please let me know.
> 
> It might be worth just unconditionally having a cache for the 2
> descriptor case.  This is what I get with qemu tap, though for some
> reason the device features don't have guest or host CSUM, so my setup is
> probably screwed:

Yes without checksum net core always linearizes packets, so yes it is
screwed.
For -net, skb always allocates space for 17 frags + linear part so
it seems sane to do same in virtio core, and allocate, for -net,
up to max_frags + 1 from cache.
We can adjust it: no _SG -> 2 otherwise 18.

Not sure about other drivers, maybe really use 2 there for now.
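
A sketch of how a driver could encode that (hypothetical helper, using the
literal values above):

/* Hypothetical: size the cache from the device's SG capability,
 * per the "no _SG -> 2, otherwise 18" suggestion above. */
static unsigned int virtnet_cache_thresh(struct net_device *dev)
{
	/* 17 frags + the linear part when scatter-gather is usable */
	return (dev->features & NETIF_F_SG) ? 18 : 2;
}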

> Queue histogram for virtio0:
> Size distribution for input (max=128427):
>   1: 128427  ################################################################
> Size distribution for output (max=256485):
>   2: 256485  ################################################################
> Size distribution for control (max=10):
>   3: 10      ################################################################
>   4: 5       ################################
> 
> Here's a patch; what do you get? (Run ifconfig to trigger the dump; yeah,
> it's a hack!)
> 
> Hack: histogram of buffer sizes for virtio devices.
> 
> Currently triggered by a stats query (eg ifconfig) on a net device.
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -727,6 +727,8 @@ static struct rtnl_link_stats64 *virtnet
>  	tot->rx_length_errors = dev->stats.rx_length_errors;
>  	tot->rx_frame_errors = dev->stats.rx_frame_errors;
>  
> +	virtio_dev_dump_histogram(vi->vdev);
> +
>  	return tot;
>  }
>  
> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> --- a/drivers/virtio/virtio.c
> +++ b/drivers/virtio/virtio.c
> @@ -108,6 +108,16 @@ void virtio_check_driver_offered_feature
>  }
>  EXPORT_SYMBOL_GPL(virtio_check_driver_offered_feature);
>  
> +void virtio_dev_dump_histogram(const struct virtio_device *vdev)
> +{
> +	const struct virtqueue *vq;
> +
> +	printk("Queue histogram for %s:\n", dev_name(&vdev->dev));
> +	list_for_each_entry(vq, &vdev->vqs, list)
> +		virtqueue_dump_histogram(vq);
> +}
> +EXPORT_SYMBOL_GPL(virtio_dev_dump_histogram);
> +
>  static int virtio_dev_probe(struct device *_d)
>  {
>  	int err, i;
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -120,6 +120,8 @@ struct vring_virtqueue
>  	ktime_t last_add_time;
>  #endif
>  
> +	unsigned int *histo;
> +
>  	/* Tokens for callbacks. */
>  	void *data[];
>  };
> @@ -259,6 +261,8 @@ int virtqueue_add_buf(struct virtqueue *
>  	BUG_ON(out + in > vq->vring.num);
>  	BUG_ON(out + in == 0);
>  
> +	vq->histo[out+in]++;
> +
>  	/* If the host supports indirect descriptor tables, consider it. */
>  	if (vq->indirect) {
>  		bool try_indirect;
> @@ -726,6 +730,7 @@ struct virtqueue *vring_new_virtqueue(un
>  	}
>  	vq->data[i] = NULL;
>  
> +	vq->histo = kzalloc((num + 1) * sizeof(vq->histo[0]), GFP_KERNEL);
>  	return &vq->vq;
>  }
>  EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> @@ -772,4 +777,33 @@ unsigned int virtqueue_get_vring_size(st
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>  
> +void virtqueue_dump_histogram(const struct virtqueue *_vq)
> +{
> +	const struct vring_virtqueue *vq = to_vvq(_vq);
> +	int i, j, start = 0, end = 0, max = 1;
> +	char line[120];
> +
> +	for (i = 0; i <= vq->vring.num; i++) {
> +		if (!vq->histo[i])
> +			continue;
> +
> +		end = i;
> +		if (!vq->histo[start])
> +			start = i;
> +
> +		if (vq->histo[i] > max)
> +			max = vq->histo[i];
> +	}
> +
> +	printk("Size distribution for %s (max=%u):\n", _vq->name, max);
> +	for (i = start; i <= end; i++) {
> +		unsigned int off;
> +		off = sprintf(line, "%3u: %-7u ", i, vq->histo[i]);
> +		for (j = 0; j < vq->histo[i] * 64 / max; j++)
> +			line[off++] = '#';
> +		line[off] = '\0';
> +		printk("%s\n", line);
> +	}
> +}
> +
>  MODULE_LICENSE("GPL");
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -52,6 +52,9 @@ unsigned int virtqueue_get_vring_size(st
>  
>  int virtqueue_get_queue_index(struct virtqueue *vq);
>  
> +void virtio_dev_dump_histogram(const struct virtio_device *vdev);
> +void virtqueue_dump_histogram(const struct virtqueue *vq);
> +
>  /**
>   * virtio_device - representation of a device using virtio
>   * @index: unique position on the virtio bus

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06  5:02                 ` Michael S. Tsirkin
@ 2012-09-06  7:57                   ` Rusty Russell
  2012-09-06  8:45                     ` Michael S. Tsirkin
  2012-09-10 15:52                   ` Paolo Bonzini
  1 sibling, 1 reply; 29+ messages in thread
From: Rusty Russell @ 2012-09-06  7:57 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm

"Michael S. Tsirkin" <mst@redhat.com> writes:
> Yes without checksum net core always linearizes packets, so yes it is
> screwed.
> For -net, skb always allocates space for 17 frags + linear part so
> it seems sane to do same in virtio core, and allocate, for -net,
> up to max_frags + 1 from cache.
> We can adjust it: no _SG -> 2 otherwise 18.

But I thought it used individual buffers these days?

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06  7:57                   ` Rusty Russell
@ 2012-09-06  8:45                     ` Michael S. Tsirkin
  2012-09-06 23:49                       ` Rusty Russell
  0 siblings, 1 reply; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-09-06  8:45 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm

On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > Yes without checksum net core always linearizes packets, so yes it is
> > screwed.
> > For -net, skb always allocates space for 17 frags + linear part so
> > it seems sane to do same in virtio core, and allocate, for -net,
> > up to max_frags + 1 from cache.
> > We can adjust it: no _SG -> 2 otherwise 18.
> 
> But I thought it used individual buffers these days?

Yes for receive, no for transmit. That's probably why
we should have the threshold per vq, not per device, BTW.

-- 
MST

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06  8:45                     ` Michael S. Tsirkin
@ 2012-09-06 23:49                       ` Rusty Russell
  2012-09-07  0:06                         ` Michael S. Tsirkin
                                           ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Rusty Russell @ 2012-09-06 23:49 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm

"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> > Yes without checksum net core always linearizes packets, so yes it is
>> > screwed.
>> > For -net, skb always allocates space for 17 frags + linear part so
>> > it seems sane to do same in virtio core, and allocate, for -net,
>> > up to max_frags + 1 from cache.
>> > We can adjust it: no _SG -> 2 otherwise 18.
>> 
>> But I thought it used individual buffers these days?
>
> Yes for receive, no for transmit. That's probably why
> we should have the threshold per vq, not per device, BTW.

Can someone actually run with my histogram patch and see what the real
numbers are?

I'm not convinced that the ideal 17-buffer case actually happens as much
as we think.  And if it's not happening with this netperf test, we're
testing the wrong thing.

Thanks,
Rusty.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06 23:49                       ` Rusty Russell
@ 2012-09-07  0:06                         ` Michael S. Tsirkin
  2012-09-10 16:01                         ` Thomas Lendacky
       [not found]                         ` <14037987.hiN6MSGY6z@tomlt1.ibmoffice.com>
  2 siblings, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-09-07  0:06 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Sasha Levin, virtualization, linux-kernel, avi, kvm

On Fri, Sep 07, 2012 at 09:19:04AM +0930, Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > Yes without checksum net core always linearizes packets, so yes it is
> >> > screwed.
> >> > For -net, skb always allocates space for 17 frags + linear part so
> >> > it seems sane to do same in virtio core, and allocate, for -net,
> >> > up to max_frags + 1 from cache.
> >> > We can adjust it: no _SG -> 2 otherwise 18.
> >> 
> >> But I thought it used individual buffers these days?
> >
> > Yes for receive, no for transmit. That's probably why
> > we should have the threshold per vq, not per device, BTW.
> 
> Can someone actually run with my histogram patch and see what the real
> numbers are?
> 
> I'm not convinced that the ideal 17-buffer case actually happens as much
> as we think.  And if it's not happening with this netperf test, we're
> testing the wrong thing.
> 
> Thanks,
> Rusty.

hope to play with it next week

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06  5:02                 ` Michael S. Tsirkin
  2012-09-06  7:57                   ` Rusty Russell
@ 2012-09-10 15:52                   ` Paolo Bonzini
  1 sibling, 0 replies; 29+ messages in thread
From: Paolo Bonzini @ 2012-09-10 15:52 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Rusty Russell, linux-kernel, avi, Sasha Levin, kvm, virtualization

On 06/09/2012 07:02, Michael S. Tsirkin wrote:
>> > It might be worth just unconditionally having a cache for the 2
>> > descriptor case.  This is what I get with qemu tap, though for some
>> > reason the device features don't have guest or host CSUM, so my setup is
>> > probably screwed:
> Yes without checksum net core always linearizes packets, so yes it is
> screwed.
> For -net, skb always allocates space for 17 frags + linear part so
> it seems sane to do same in virtio core, and allocate, for -net,
> up to max_frags + 1 from cache.
> We can adjust it: no _SG -> 2 otherwise 18.
> 
> Not sure about other drivers, maybe really use 2 there for now.

2 should also be good for virtio-blk and virtio-scsi 4KB random rw
workloads.

Paolo


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-06 23:49                       ` Rusty Russell
  2012-09-07  0:06                         ` Michael S. Tsirkin
@ 2012-09-10 16:01                         ` Thomas Lendacky
       [not found]                         ` <14037987.hiN6MSGY6z@tomlt1.ibmoffice.com>
  2 siblings, 0 replies; 29+ messages in thread
From: Thomas Lendacky @ 2012-09-10 16:01 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Michael S. Tsirkin, Sasha Levin, virtualization, linux-kernel, avi, kvm

On Friday, September 07, 2012 09:19:04 AM Rusty Russell wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> > Yes without checksum net core always linearizes packets, so yes it is
> >> > screwed.
> >> > For -net, skb always allocates space for 17 frags + linear part so
> >> > it seems sane to do same in virtio core, and allocate, for -net,
> >> > up to max_frags + 1 from cache.
> >> > We can adjust it: no _SG -> 2 otherwise 18.
> >> 
> >> But I thought it used individual buffers these days?
> > 
> > Yes for receive, no for transmit. That's probably why
> > we should have the threshold per vq, not per device, BTW.
> 
> Can someone actually run with my histogram patch and see what the real
> numbers are?

Somehow some HTML got in my first reply, resending...

I ran some TCP_RR and TCP_STREAM sessions, both host-to-guest and
guest-to-host, with a form of the histogram patch applied against a
RHEL6.3 kernel. The histogram values were reset after each test.

Here are the results:

60 session TCP_RR from host-to-guest with 256 byte request and 256 byte
response for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=7818456):
 1: 7818456 ################################################################
Size distribution for output (max=7816698):
 2: 149     
 3: 7816698 ################################################################
 4: 2       
 5: 1       
Size distribution for control (max=1):
 0: 0


4 session TCP_STREAM from host-to-guest with 4K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=16050941):
 1: 16050941 ################################################################
Size distribution for output (max=1877796):
 2: 1877796 ################################################################
 3: 5       
Size distribution for control (max=1):
 0: 0

4 session TCP_STREAM from host-to-guest with 16K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=16831151):
 1: 16831151 ################################################################
Size distribution for output (max=1923965):
 2: 1923965 ################################################################
 3: 5       
Size distribution for control (max=1):
 0: 0

4 session TCP_STREAM from guest-to-host with 4K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=1316069):
 1: 1316069 ################################################################
Size distribution for output (max=879213):
 2: 24      
 3: 24097   #
 4: 23176   #
 5: 3412    
 6: 4446    
 7: 4663    
 8: 4195    
 9: 3772    
10: 3388    
11: 3666    
12: 2885    
13: 2759    
14: 2997    
15: 3060    
16: 2651    
17: 2235    
18: 92721   ######  
19: 879213  ################################################################
Size distribution for control (max=1):
 0: 0

4 session TCP_STREAM from guest-to-host with 16K message size for 60 seconds:

Queue histogram for virtio1:
Size distribution for input (max=1428590):
 1: 1428590 ################################################################
Size distribution for output (max=957774):
 2: 20
 3: 54955   ###
 4: 34281   ##
 5: 2967
 6: 3394
 7: 9400
 8: 3061
 9: 3397
10: 3258
11: 3275
12: 3147
13: 2876
14: 2747
15: 2832
16: 2013
17: 1670
18: 100369  ######
19: 957774  ################################################################
Size distribution for control (max=1):
 0: 0

Thanks,
Tom

> 
> I'm not convinced that the ideal 17-buffer case actually happens as much
> as we think.  And if it's not happening with this netperf test, we're
> testing the wrong thing.
> 
> Thanks,
> Rusty.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
       [not found]                         ` <14037987.hiN6MSGY6z@tomlt1.ibmoffice.com>
@ 2012-09-10 16:08                           ` Michael S. Tsirkin
  2012-09-12  6:13                           ` Rusty Russell
  1 sibling, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-09-10 16:08 UTC (permalink / raw)
  To: Thomas Lendacky
  Cc: Rusty Russell, Sasha Levin, virtualization, linux-kernel, avi, kvm

On Mon, Sep 10, 2012 at 10:47:15AM -0500, Thomas Lendacky wrote:
> On Friday, September 07, 2012 09:19:04 AM Rusty Russell wrote:
> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> > > On Thu, Sep 06, 2012 at 05:27:23PM +0930, Rusty Russell wrote:
> > >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> > >> > Yes without checksum net core always linearizes packets, so yes it is
> > >> > screwed.
> > >> > For -net, skb always allocates space for 17 frags + linear part so
> > >> > it seems sane to do same in virtio core, and allocate, for -net,
> > >> > up to max_frags + 1 from cache.
> > >> > We can adjust it: no _SG -> 2 otherwise 18.
> > >>
> > >> But I thought it used individual buffers these days?
> > >
> > > Yes for receive, no for transmit. That's probably why
> > > we should have the threshold per vq, not per device, BTW.
> >
> > Can someone actually run with my histogram patch and see what the real
> > numbers are?
>
> I ran some TCP_RR and TCP_STREAM sessions, both host-to-guest and
> guest-to-host, with a form of the histogram patch applied against a
> RHEL6.3 kernel. The histogram values were reset after each test.
>
> Here are the results:
>
> 60 session TCP_RR from host-to-guest with 256 byte request and 256 byte
> response for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=7818456):
>  1: 7818456 ################################################################
> Size distribution for output (max=7816698):
>  2: 149
>  3: 7816698 ################################################################
Here, a threshold would help.
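
To make that concrete, a minimal sketch of such a per-vq check (the
indirect_thresh and num_free field names are assumptions for
illustration, not the posted patch):

/* Sketch only: use an indirect descriptor when the request is large,
 * or when the ring is too full to take it as a direct chain. */
static bool vring_should_indirect(struct vring_virtqueue *vq,
				  unsigned int out, unsigned int in)
{
	return vq->indirect && out + in > 1 &&
	       (out + in > vq->indirect_thresh ||
		vq->num_free < out + in);
}

With indirect_thresh set above 3, the dominant 3-descriptor entries in
this TCP_RR run would all go direct.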

>  4: 2
>  5: 1
> Size distribution for control (max=1):
>  0: 0
>
> 4 session TCP_STREAM from host-to-guest with 4K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=16050941):
>  1: 16050941 ################################################################
> Size distribution for output (max=1877796):
>  2: 1877796 ################################################################
>  3: 5
> Size distribution for control (max=1):
>  0: 0
>
> 4 session TCP_STREAM from host-to-guest with 16K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=16831151):
>  1: 16831151 ################################################################
> Size distribution for output (max=1923965):
>  2: 1923965 ################################################################
>  3: 5
> Size distribution for control (max=1):
>  0: 0

Hmm, for virtio-net output we always use 2 s/g entries; this is because
of a qemu bug. Maybe it's time we fixed this and added a feature bit?
That would fix the above without threshold hacks.
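
As a sketch of what the driver side could then look like (the feature
bit name here is purely hypothetical, nothing qemu or the spec defines
at this point):

/* Hypothetical VIRTIO_NET_F_ANY_HEADER: if the device can take the
 * virtio_net_hdr inline with the data, push it into the skb headroom
 * and submit one s/g entry instead of two (sg assumed initialized). */
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_ANY_HEADER)) {
	memcpy(skb_push(skb, sizeof(*hdr)), hdr, sizeof(*hdr));
	sg_init_one(sg, skb->data, skb->len);
	num_sg = 1;
} else {
	sg_set_buf(sg, hdr, sizeof(*hdr));
	num_sg = skb_to_sgvec(skb, sg + 1, 0, skb->len) + 1;
}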


> 4 session TCP_STREAM from guest-to-host with 4K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=1316069):
>  1: 1316069 ################################################################
> Size distribution for output (max=879213):
>  2: 24
>  3: 24097 #
>  4: 23176 #
>  5: 3412
>  6: 4446
>  7: 4663
>  8: 4195
>  9: 3772
> 10: 3388
> 11: 3666
> 12: 2885
> 13: 2759
> 14: 2997
> 15: 3060
> 16: 2651
> 17: 2235
> 18: 92721 ######
> 19: 879213 ################################################################
> Size distribution for control (max=1):
>  0: 0
>
> 4 session TCP_STREAM from guest-to-host with 16K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=1428590):
>  1: 1428590 ################################################################
> Size distribution for output (max=957774):
>  2: 20
>  3: 54955 ###
>  4: 34281 ##
>  5: 2967
>  6: 3394
>  7: 9400
>  8: 3061
>  9: 3397
> 10: 3258
> 11: 3275
> 12: 3147
> 13: 2876
> 14: 2747
> 15: 2832
> 16: 2013
> 17: 1670
> 18: 100369 ######
> 19: 957774 ################################################################
> Size distribution for control (max=1):
>  0: 0
>
> Thanks,
> Tom

In these tests we would have to set the threshold pretty high.
I wonder whether the following makes any difference; the idea is to
A. get less false cache-line sharing by allocating full cache lines
B. get better locality by using the same slab cache for multiple sizes
(for example, with 16-byte descriptors and 64-byte cache lines, the 2-,
3- and 4-descriptor requests above all round up to the same kmalloc-64
bucket).

So we get some of the wins of the threshold without bothering
with a cache.

Will try to test but not until later this week.

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5aa43c3..c184712 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -132,7 +132,8 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	unsigned head;
 	int i;
 
-	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+	desc = kmalloc(L1_CACHE_ALIGN((out + in) * sizeof(struct vring_desc)),
+		       gfp);
 	if (!desc)
 		return -ENOMEM;
 


-- 
MST

^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
       [not found]                         ` <14037987.hiN6MSGY6z@tomlt1.ibmoffice.com>
  2012-09-10 16:08                           ` Michael S. Tsirkin
@ 2012-09-12  6:13                           ` Rusty Russell
  2012-09-12 10:44                             ` Sasha Levin
  1 sibling, 1 reply; 29+ messages in thread
From: Rusty Russell @ 2012-09-12  6:13 UTC (permalink / raw)
  To: Thomas Lendacky
  Cc: Michael S. Tsirkin, Sasha Levin, virtualization, linux-kernel, avi, kvm

Thomas Lendacky <tahm@linux.vnet.ibm.com> writes:
> I ran some TCP_RR and TCP_STREAM sessions, both host-to-guest and
> guest-to-host, with a form of the histogram patch applied against a
> RHEL6.3 kernel. The histogram values were reset after each test.

Hey, thanks!  This is exactly what I wanted to see...

> 60 session TCP_RR from host-to-guest with 256 byte request and 256 byte
> response for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=7818456):
>  1: 7818456 ################################################################

These are always 1, so we never use indirect for them anyway; no cache required.

> Size distribution for output (max=7816698):
>  2: 149     
>  3: 7816698 ################################################################
>  4: 2       
>  5: 1       
> Size distribution for control (max=1):
>  0: 0

OK, tiny TCP data, but latency sensitive.

> Queue histogram for virtio1:
> Size distribution for input (max=16050941):
>  1: 16050941 ################################################################
> Size distribution for output (max=1877796):
>  2: 1877796 ################################################################
>  3: 5       
> Size distribution for control (max=1):
>  0: 0

Acks.  Not that many, not that latency sensitive.

> 4 session TCP_STREAM from guest-to-host with 4K message size for 60 seconds:
>
> Queue histogram for virtio1:
> Size distribution for input (max=1316069):
>  1: 1316069 ################################################################
> Size distribution for output (max=879213):
>  2: 24      
>  3: 24097   #
>  4: 23176   #
>  5: 3412    
>  6: 4446    
>  7: 4663    
>  8: 4195    
>  9: 3772    
> 10: 3388    
> 11: 3666    
> 12: 2885    
> 13: 2759    
> 14: 2997    
> 15: 3060    
> 16: 2651    
> 17: 2235    
> 18: 92721   ######  
> 19: 879213  ################################################################

Hey, that +1 is there in MAX_SKB_FRAGS for a reason!  Who knew?

This looks like we could really use a:

        int vq_set_indirect_cache(struct virtqueue *vq, unsigned num);

Which networking would set on the xmit queue(s) if we have GSO.
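
For illustration, virtio_net's probe path could then do something like
this (vq_set_indirect_cache() is only the proposal above, and vi->svq
as the xmit queue is an assumption):

/* Size the xmit queue's indirect cache for the common full-GSO
 * case: 17 frags + linear part + virtio header = MAX_SKB_FRAGS + 2. */
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GSO))
	vq_set_indirect_cache(vi->svq, MAX_SKB_FRAGS + 2);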

The real question is now whether we'd want a separate indirect cache for
the 3 case (so num above should be a bitmap?), or reuse the same one, or
not use it at all?

Benchmarking will tell...

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-12  6:13                           ` Rusty Russell
@ 2012-09-12 10:44                             ` Sasha Levin
  2012-10-23 15:14                               ` Michael S. Tsirkin
  0 siblings, 1 reply; 29+ messages in thread
From: Sasha Levin @ 2012-09-12 10:44 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Thomas Lendacky, Michael S. Tsirkin, virtualization,
	linux-kernel, avi, kvm

On 09/12/2012 08:13 AM, Rusty Russell wrote:
> The real question is now whether we'd want a separate indirect cache for
> the 3 case (so num above should be a bitmap?), or reuse the same one, or
> not use it at all?
> 
> Benchmarking will tell...

Since there are no specific decisions about actual values, I'll just modify the
code to use a per-vq cache instead of a per-device one.
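
A minimal sketch of that direction (indirect_cache and
indirect_cache_size are assumed per-vq fields, not final code):

static struct vring_desc *alloc_indirect(struct vring_virtqueue *vq,
					 unsigned int total, gfp_t gfp)
{
	/* Fast path: the per-vq cache covers this queue's common size. */
	if (vq->indirect_cache && total <= vq->indirect_cache_size)
		return kmem_cache_alloc(vq->indirect_cache, gfp);
	return kmalloc(total * sizeof(struct vring_desc), gfp);
}

Freeing needs the matching check to pick kmem_cache_free() over
kfree(), which is where most of the churn will be.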


Thanks,
Sasha

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible
  2012-09-12 10:44                             ` Sasha Levin
@ 2012-10-23 15:14                               ` Michael S. Tsirkin
  0 siblings, 0 replies; 29+ messages in thread
From: Michael S. Tsirkin @ 2012-10-23 15:14 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Rusty Russell, Thomas Lendacky, virtualization, linux-kernel, avi, kvm

On Wed, Sep 12, 2012 at 12:44:47PM +0200, Sasha Levin wrote:
> On 09/12/2012 08:13 AM, Rusty Russell wrote:
> > The real question is now whether we'd want a separate indirect cache for
> > the 3 case (so num above should be a bitmap?), or reuse the same one, or
> > not use it at all?
> > 
> > Benchmarking will tell...
> 
> Since there are no specific decisions about actual values, I'll just modify the
> code to use cache per-vq instead of per-device.
> 
> 
> Thanks,
> Sasha

One wonders whether we can still use the slab caches
and improve locality by aligning the allocation size.
Something like the below: it passed basic testing, but
I haven't measured performance yet.

    virtio: align size for indirect buffers
    
    Improve locality for indirect buffer allocations
    and avoid false cache sharing by aligning
    allocations to cache line size.
    
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 2fc85f2..93e6c3a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -119,7 +119,8 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	unsigned head;
 	int i;
 
-	desc = kmalloc((out + in) * sizeof(struct vring_desc), GFP_ATOMIC);
+	desc = kmalloc(L1_CACHE_ALIGN((out + in) * sizeof(struct vring_desc)),
+		       GFP_ATOMIC);
 	if (!desc)
 		return vq->vring.num;
 

^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2012-10-23 15:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-08-28 13:04 [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Sasha Levin
2012-08-28 13:04 ` [PATCH v2 2/2] virtio-ring: Allocate indirect buffers from cache when possible Sasha Levin
2012-08-28 13:20   ` Michael S. Tsirkin
2012-08-28 13:35     ` Sasha Levin
2012-08-29 11:07       ` Michael S. Tsirkin
2012-08-29 15:03         ` Sasha Levin
2012-08-29 15:14           ` Michael S. Tsirkin
2012-08-30 10:34             ` Sasha Levin
2012-08-29 15:38           ` Michael S. Tsirkin
2012-08-29 16:50             ` Sasha Levin
2012-09-06  1:02               ` Rusty Russell
2012-09-06  5:02                 ` Michael S. Tsirkin
2012-09-06  7:57                   ` Rusty Russell
2012-09-06  8:45                     ` Michael S. Tsirkin
2012-09-06 23:49                       ` Rusty Russell
2012-09-07  0:06                         ` Michael S. Tsirkin
2012-09-10 16:01                         ` Thomas Lendacky
     [not found]                         ` <14037987.hiN6MSGY6z@tomlt1.ibmoffice.com>
2012-09-10 16:08                           ` Michael S. Tsirkin
2012-09-12  6:13                           ` Rusty Russell
2012-09-12 10:44                             ` Sasha Levin
2012-10-23 15:14                               ` Michael S. Tsirkin
2012-09-10 15:52                   ` Paolo Bonzini
2012-09-06  0:02       ` Rusty Russell
2012-08-29 15:38   ` Michael S. Tsirkin
2012-08-29 17:14     ` Sasha Levin
2012-08-29 18:12       ` Michael S. Tsirkin
2012-08-29 20:46         ` Sasha Levin
2012-08-29 22:52           ` Michael S. Tsirkin
2012-08-28 13:20 ` [PATCH v2 1/2] virtio-ring: Use threshold for switching to indirect descriptors Michael S. Tsirkin
