* [PATCH] virtio_ring: Shadow available ring flags & index
@ 2015-11-11  0:21 Venkatesh Srinivas
  2015-11-11 12:34 ` Michael S. Tsirkin
  0 siblings, 1 reply; 16+ messages in thread
From: Venkatesh Srinivas @ 2015-11-11  0:21 UTC (permalink / raw)
  To: virtualization
  Cc: pbonzini, dmatlack, kvm, luto, mst, huawei.xie, rusty, vsrinivas,
	Venkatesh Srinivas

Improves cacheline transfer flow of available ring header.

Virtqueues are implemented as a pair of rings, one producer->consumer
avail ring and one consumer->producer used ring; preceding the
avail ring in memory are two contiguous u16 fields -- avail->flags
and avail->idx. A producer posts work by writing to avail->idx and
a consumer reads avail->idx.
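
For reference, the avail header layout (as defined in
include/uapi/linux/virtio_ring.h); flags, idx, and the start of
ring[] share a cacheline:

struct vring_avail {
	__virtio16 flags;
	__virtio16 idx;
	__virtio16 ring[];
};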

The flags and idx fields only need to be written by a producer CPU
and only read by a consumer CPU; when the producer and consumer are
running on different CPUs and the virtio_ring code is structured so
that the producer only writes and the consumer only reads, the avail
header cacheline can migrate core -> core entirely in the 'M'
(Modified) state. This flow optimizes core -> core bandwidth on
certain CPUs.

(see: "Software Optimization Guide for AMD Family 15h Processors",
Section 11.6; similar language appears in the 10h guide and should
apply to CPUs w/ exclusive caches, using LLC as a transfer cache)

Unfortunately, the existing virtio_ring code issues reads of
avail->idx and read-modify-writes of avail->flags on the producer
side.

This change shadows the flags and index fields in producer memory;
the vring code now reads from the shadows and only ever writes to
avail->flags and avail->idx, allowing the cacheline to transfer
core -> core optimally.

In a concurrent version of vring_bench, the time required for
10,000,000 buffer checkout/returns was reduced by ~2% (average
across many runs) on an AMD Piledriver (15h) CPU:

(w/o shadowing):
 Performance counter stats for './vring_bench':
     5,451,082,016      L1-dcache-loads
     ...
       2.221477739 seconds time elapsed

(w/ shadowing):
 Performance counter stats for './vring_bench':
     5,405,701,361      L1-dcache-loads
     ...
       2.168405376 seconds time elapsed

The further away (in a NUMA sense) virtio producers and consumers are
from each other, the more we expect to benefit. Physical implementations
of virtio devices and implementations of virtio where the consumer polls
vring avail indexes (vhost) should also benefit.

Signed-off-by: Venkatesh Srinivas <venkateshs@google.com>
---
 drivers/virtio/virtio_ring.c | 46 ++++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 096b857..6262015 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -80,6 +80,12 @@ struct vring_virtqueue {
 	/* Last used index we've seen. */
 	u16 last_used_idx;
 
+	/* Last written value to avail->flags */
+	u16 avail_flags_shadow;
+
+	/* Last written value to avail->idx in guest byte order */
+	u16 avail_idx_shadow;
+
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
 
@@ -235,13 +241,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
-	avail = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) & (vq->vring.num - 1);
+	avail = vq->avail_idx_shadow & (vq->vring.num - 1);
 	vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
 
 	/* Descriptors and available array need to be set before we expose the
 	 * new available array entries. */
 	virtio_wmb(vq->weak_barriers);
-	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) + 1);
+	vq->avail_idx_shadow++;
+	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
 	vq->num_added++;
 
 	pr_debug("Added buffer head %i to %p\n", head, vq);
@@ -354,8 +361,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
 	 * event. */
 	virtio_mb(vq->weak_barriers);
 
-	old = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->num_added;
-	new = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx);
+	old = vq->avail_idx_shadow - vq->num_added;
+	new = vq->avail_idx_shadow;
 	vq->num_added = 0;
 
 #ifdef DEBUG
@@ -510,7 +517,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 	/* If we expect an interrupt for the next entry, tell host
 	 * by writing event index and flush out the write before
 	 * the read in the next get_buf call. */
-	if (!(vq->vring.avail->flags & cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT))) {
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
 		vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx);
 		virtio_mb(vq->weak_barriers);
 	}
@@ -537,7 +544,11 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
-	vq->vring.avail->flags |= cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT);
+	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
+
 }
 EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
 
@@ -565,7 +576,10 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
 	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
 	 * either clear the flags bit or point the event index at the next
 	 * entry. Always do both to keep code simple. */
-	vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
 	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
 	END_USE(vq);
 	return last_used_idx;
@@ -633,9 +647,12 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
 	 * either clear the flags bit or point the event index at the next
 	 * entry. Always do both to keep code simple. */
-	vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
+	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+	}
 	/* TODO: tune this threshold */
-	bufs = (u16)(virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->last_used_idx) * 3 / 4;
+	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
 	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs);
 	virtio_mb(vq->weak_barriers);
 	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
@@ -670,7 +687,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 		/* detach_buf clears data, so grab it now. */
 		buf = vq->data[i];
 		detach_buf(vq, i);
-		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - 1);
+		vq->avail_idx_shadow--;
+		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
 		END_USE(vq);
 		return buf;
 	}
@@ -735,6 +753,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
+	vq->avail_flags_shadow = 0;
+	vq->avail_idx_shadow = 0;
 	vq->num_added = 0;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
@@ -746,8 +766,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
 	/* No callback?  Tell other side not to bother us. */
-	if (!callback)
-		vq->vring.avail->flags |= cpu_to_virtio16(vdev, VRING_AVAIL_F_NO_INTERRUPT);
+	if (!callback) {
+		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+		vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+	}
 
 	/* Put everything in free lists. */
 	vq->free_head = 0;
-- 
2.6.0.rc2.230.g3dd15c0



* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-11  0:21 [PATCH] virtio_ring: Shadow available ring flags & index Venkatesh Srinivas
@ 2015-11-11 12:34 ` Michael S. Tsirkin
  2015-11-13 23:41   ` Venkatesh Srinivas
  0 siblings, 1 reply; 16+ messages in thread
From: Michael S. Tsirkin @ 2015-11-11 12:34 UTC (permalink / raw)
  To: Venkatesh Srinivas
  Cc: virtualization, pbonzini, dmatlack, kvm, luto, huawei.xie, rusty,
	vsrinivas

On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
> Improves cacheline transfer flow of available ring header.
> 
> Virtqueues are implemented as a pair of rings, one producer->consumer
> avail ring and one consumer->producer used ring; preceding the
> avail ring in memory are two contiguous u16 fields -- avail->flags
> and avail->idx. A producer posts work by writing to avail->idx and
> a consumer reads avail->idx.
> 
> The flags and idx fields only need to be written by a producer CPU
> and only read by a consumer CPU; when the producer and consumer are
> running on different CPUs and the virtio_ring code is structured so
> that the producer only writes and the consumer only reads, the avail
> header cacheline can migrate core -> core entirely in the 'M'
> (Modified) state. This flow optimizes core -> core bandwidth on
> certain CPUs.
> 
> (see: "Software Optimization Guide for AMD Family 15h Processors",
> Section 11.6; similar language appears in the 10h guide and should
> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
> 
> Unfortunately, the existing virtio_ring code issues reads of
> avail->idx and read-modify-writes of avail->flags on the producer
> side.
> 
> This change shadows the flags and index fields in producer memory;
> the vring code now reads from the shadows and only ever writes to
> avail->flags and avail->idx, allowing the cacheline to transfer
> core -> core optimally.

Sounds logical, I'll apply this after a bit of testing
of my own, thanks!

> In a concurrent version of vring_bench, the time required for
> 10,000,000 buffer checkout/returns was reduced by ~2% (average
> across many runs) on an AMD Piledriver (15h) CPU:
> 
> (w/o shadowing):
>  Performance counter stats for './vring_bench':
>      5,451,082,016      L1-dcache-loads
>      ...
>        2.221477739 seconds time elapsed
> 
> (w/ shadowing):
>  Performance counter stats for './vring_bench':
>      5,405,701,361      L1-dcache-loads
>      ...
>        2.168405376 seconds time elapsed

Could you supply the full command line you used
to test this?

> The further away (in a NUMA sense) virtio producers and consumers are
> from each other, the more we expect to benefit. Physical implementations
> of virtio devices and implementations of virtio where the consumer polls
> vring avail indexes (vhost) should also benefit.
> 
> Signed-off-by: Venkatesh Srinivas <venkateshs@google.com>

Here's a similar patch for the ring itself:
https://lkml.org/lkml/2015/9/10/111

Does it help you as well?


> ---
>  drivers/virtio/virtio_ring.c | 46 ++++++++++++++++++++++++++++++++------------
>  1 file changed, 34 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 096b857..6262015 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -80,6 +80,12 @@ struct vring_virtqueue {
>  	/* Last used index we've seen. */
>  	u16 last_used_idx;
>  
> +	/* Last written value to avail->flags */
> +	u16 avail_flags_shadow;
> +
> +	/* Last written value to avail->idx in guest byte order */
> +	u16 avail_idx_shadow;
> +
>  	/* How to notify other side. FIXME: commonalize hcalls! */
>  	bool (*notify)(struct virtqueue *vq);
>  
> @@ -235,13 +241,14 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>  
>  	/* Put entry in available array (but don't update avail->idx until they
>  	 * do sync). */
> -	avail = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) & (vq->vring.num - 1);
> +	avail = vq->avail_idx_shadow & (vq->vring.num - 1);
>  	vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
>  
>  	/* Descriptors and available array need to be set before we expose the
>  	 * new available array entries. */
>  	virtio_wmb(vq->weak_barriers);
> -	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) + 1);
> +	vq->avail_idx_shadow++;
> +	vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
>  	vq->num_added++;
>  
>  	pr_debug("Added buffer head %i to %p\n", head, vq);
> @@ -354,8 +361,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>  	 * event. */
>  	virtio_mb(vq->weak_barriers);
>  
> -	old = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->num_added;
> -	new = virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx);
> +	old = vq->avail_idx_shadow - vq->num_added;
> +	new = vq->avail_idx_shadow;
>  	vq->num_added = 0;
>  
>  #ifdef DEBUG
> @@ -510,7 +517,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>  	/* If we expect an interrupt for the next entry, tell host
>  	 * by writing event index and flush out the write before
>  	 * the read in the next get_buf call. */
> -	if (!(vq->vring.avail->flags & cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT))) {
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
>  		vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx);
>  		virtio_mb(vq->weak_barriers);
>  	}
> @@ -537,7 +544,11 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
>  {
>  	struct vring_virtqueue *vq = to_vvq(_vq);
>  
> -	vq->vring.avail->flags |= cpu_to_virtio16(_vq->vdev, VRING_AVAIL_F_NO_INTERRUPT);
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
> +
>  }
>  EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>  
> @@ -565,7 +576,10 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>  	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
>  	 * either clear the flags bit or point the event index at the next
>  	 * entry. Always do both to keep code simple. */
> -	vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
>  	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
>  	END_USE(vq);
>  	return last_used_idx;
> @@ -633,9 +647,12 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>  	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
>  	 * either clear the flags bit or point the event index at the next
>  	 * entry. Always do both to keep code simple. */
> -	vq->vring.avail->flags &= cpu_to_virtio16(_vq->vdev, ~VRING_AVAIL_F_NO_INTERRUPT);
> +	if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
> +		vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
> +		vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> +	}
>  	/* TODO: tune this threshold */
> -	bufs = (u16)(virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - vq->last_used_idx) * 3 / 4;
> +	bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
>  	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs);
>  	virtio_mb(vq->weak_barriers);
>  	if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
> @@ -670,7 +687,8 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>  		/* detach_buf clears data, so grab it now. */
>  		buf = vq->data[i];
>  		detach_buf(vq, i);
> -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, virtio16_to_cpu(_vq->vdev, vq->vring.avail->idx) - 1);
> +		vq->avail_idx_shadow--;
> +		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
>  		END_USE(vq);
>  		return buf;
>  	}
> @@ -735,6 +753,8 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>  	vq->weak_barriers = weak_barriers;
>  	vq->broken = false;
>  	vq->last_used_idx = 0;
> +	vq->avail_flags_shadow = 0;
> +	vq->avail_idx_shadow = 0;
>  	vq->num_added = 0;
>  	list_add_tail(&vq->vq.list, &vdev->vqs);
>  #ifdef DEBUG
> @@ -746,8 +766,10 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>  	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>  
>  	/* No callback?  Tell other side not to bother us. */
> -	if (!callback)
> -		vq->vring.avail->flags |= cpu_to_virtio16(vdev, VRING_AVAIL_F_NO_INTERRUPT);
> +	if (!callback) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> +	}
>  
>  	/* Put everything in free lists. */
>  	vq->free_head = 0;
> -- 
> 2.6.0.rc2.230.g3dd15c0


* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-11 12:34 ` Michael S. Tsirkin
@ 2015-11-13 23:41   ` Venkatesh Srinivas
  2015-11-17  3:46     ` Xie, Huawei
  0 siblings, 1 reply; 16+ messages in thread
From: Venkatesh Srinivas @ 2015-11-13 23:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: virtualization, Paolo Bonzini, David Matlack, KVM list, luto,
	huawei.xie, Rusty Russell, Venkatesh Srinivas

On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
> > Improves cacheline transfer flow of available ring header.
> >
> > Virtqueues are implemented as a pair of rings, one producer->consumer
> > avail ring and one consumer->producer used ring; preceding the
> > avail ring in memory are two contiguous u16 fields -- avail->flags
> > and avail->idx. A producer posts work by writing to avail->idx and
> > a consumer reads avail->idx.
> >
> > The flags and idx fields only need to be written by a producer CPU
> > and only read by a consumer CPU; when the producer and consumer are
> > running on different CPUs and the virtio_ring code is structured so
> > that the producer only writes and the consumer only reads, the avail
> > header cacheline can migrate core -> core entirely in the 'M'
> > (Modified) state. This flow optimizes core -> core bandwidth on
> > certain CPUs.
> >
> > (see: "Software Optimization Guide for AMD Family 15h Processors",
> > Section 11.6; similar language appears in the 10h guide and should
> > apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
> >
> > Unfortunately, the existing virtio_ring code issues reads of
> > avail->idx and read-modify-writes of avail->flags on the producer
> > side.
> >
> > This change shadows the flags and index fields in producer memory;
> > the vring code now reads from the shadows and only ever writes to
> > avail->flags and avail->idx, allowing the cacheline to transfer
> > core -> core optimally.
>
> Sounds logical, I'll apply this after a bit of testing
> of my own, thanks!

Thanks!

> > In a concurrent version of vring_bench, the time required for
> > 10,000,000 buffer checkout/returns was reduced by ~2% (average
> > across many runs) on an AMD Piledriver (15h) CPU:
> >
> > (w/o shadowing):
> >  Performance counter stats for './vring_bench':
> >      5,451,082,016      L1-dcache-loads
> >      ...
> >        2.221477739 seconds time elapsed
> >
> > (w/ shadowing):
> >  Performance counter stats for './vring_bench':
> >      5,405,701,361      L1-dcache-loads
> >      ...
> >        2.168405376 seconds time elapsed
>
> Could you supply the full command line you used
> to test this?

Yes --

perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores \
	./vring_bench

The standard version of vring_bench is single-threaded (posted on this list
but never submitted); my tests were with a version that has a worker thread
polling the VQ. How should I share it? Should I just attach it to an email
here?

> > The further away (in a NUMA sense) virtio producers and consumers are
> > from each other, the more we expect to benefit. Physical implementations
> > of virtio devices and implementations of virtio where the consumer polls
> > vring avail indexes (vhost) should also benefit.
> >
> > Signed-off-by: Venkatesh Srinivas <venkateshs@google.com>
>
> Here's a similar patch for the ring itself:
> https://lkml.org/lkml/2015/9/10/111
>
> Does it help you as well?

I tested your patch in our environment; our virtqueues do not support
Indirect entries and your patch does not manage to elide many writes, so I
do not see a performance difference. In an environment with Indirect, your
patch will likely be a win.

(My patch gets most of its win by eliminating reads on the producer; when
the producer reads avail fields at the same time the consumer is polling,
we see cacheline transfers that hurt performance. Your patch eliminates
writes, which is nice, but our tests w/ polling are not as sensitive to
writes from the producer.)
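
A minimal standalone sketch of the difference (illustrative C, not
the kernel code; barriers omitted):

struct hdr { unsigned short flags, idx; };	/* shared avail header */

/* Without shadowing: the producer read-modify-writes the shared
 * line, so its loads contend with the consumer's polling loads. */
void publish_noshadow(volatile struct hdr *h)
{
	h->idx = h->idx + 1;
}

/* With shadowing: the producer reads only its private copy and
 * issues store-only traffic to the shared line; the consumer is
 * the only reader. */
void publish_shadow(volatile struct hdr *h, unsigned short *shadow_idx)
{
	h->idx = ++(*shadow_idx);
}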

I have two quick comments on your patch --
1) I think you need to kfree vq->avail when deleting the virtqueue.

2) Should we avoid allocating a cache for virtqueues that are not
   performance critical? (ex: virtio-scsi eventq/controlq, virtio-net
   controlq)

Should I post comments in reply to the original patch email (given that it
is ~2 months old)?

Thanks!
-- vs;


* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-13 23:41   ` Venkatesh Srinivas
@ 2015-11-17  3:46     ` Xie, Huawei
  2015-11-18  4:08       ` Venkatesh Srinivas via Virtualization
  0 siblings, 1 reply; 16+ messages in thread
From: Xie, Huawei @ 2015-11-17  3:46 UTC (permalink / raw)
  To: Venkatesh Srinivas, Michael S. Tsirkin
  Cc: KVM list, virtualization, Venkatesh Srinivas, luto,
	David Matlack, Paolo Bonzini

On 11/14/2015 7:41 AM, Venkatesh Srinivas wrote:
> On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
>> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
>>> Improves cacheline transfer flow of available ring header.
>>>
>>> Virtqueues are implemented as a pair of rings, one producer->consumer
>>> avail ring and one consumer->producer used ring; preceding the
>>> avail ring in memory are two contiguous u16 fields -- avail->flags
>>> and avail->idx. A producer posts work by writing to avail->idx and
>>> a consumer reads avail->idx.
>>>
>>> The flags and idx fields only need to be written by a producer CPU
>>> and only read by a consumer CPU; when the producer and consumer are
>>> running on different CPUs and the virtio_ring code is structured so
>>> that the producer only writes and the consumer only reads, the avail
>>> header cacheline can migrate core -> core entirely in the 'M'
>>> (Modified) state. This flow optimizes core -> core bandwidth on
>>> certain CPUs.
>>>
>>> (see: "Software Optimization Guide for AMD Family 15h Processors",
>>> Section 11.6; similar language appears in the 10h guide and should
>>> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
>>>
>>> Unfortunately, the existing virtio_ring code issues reads of
>>> avail->idx and read-modify-writes of avail->flags on the producer
>>> side.
>>>
>>> This change shadows the flags and index fields in producer memory;
>>> the vring code now reads from the shadows and only ever writes to
>>> avail->flags and avail->idx, allowing the cacheline to transfer
>>> core -> core optimally.
>> Sounds logical, I'll apply this after a bit of testing
>> of my own, thanks!
> Thanks!
Venkatesh:
Is it that your patch only applies to CPUs w/ exclusive caches? Do you
have perf data on Intel CPUs?
For the perf metric you provide, why not L1-dcache-load-misses, which is
more meaningful?
>
>>> In a concurrent version of vring_bench, the time required for
>>> 10,000,000 buffer checkout/returns was reduced by ~2% (average
>>> across many runs) on an AMD Piledriver (15h) CPU:
>>>
>>> (w/o shadowing):
>>>  Performance counter stats for './vring_bench':
>>>      5,451,082,016      L1-dcache-loads
>>>      ...
>>>        2.221477739 seconds time elapsed
>>>
>>> (w/ shadowing):
>>>  Performance counter stats for './vring_bench':
>>>      5,405,701,361      L1-dcache-loads
>>>      ...
>>>        2.168405376 seconds time elapsed
>> Could you supply the full command line you used
>> to test this?
> Yes --
>
> perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores \
> 	./vring_bench
>
> The standard version of vring_bench is single-threaded (posted on this list
> but never submitted); my tests were with a version that has a worker thread
> polling the VQ. How should I share it? Should I just attach it to an email
> here?
>
>>> The further away (in a NUMA sense) virtio producers and consumers are
>>> from each other, the more we expect to benefit. Physical implementations
>>> of virtio devices and implementations of virtio where the consumer polls
>>> vring avail indexes (vhost) should also benefit.
>>>
>>> Signed-off-by: Venkatesh Srinivas <venkateshs@google.com>
>> Here's a similar patch for the ring itself:
>> https://lkml.org/lkml/2015/9/10/111
>>
>> Does it help you as well?
> I tested your patch in our environment; our virtqueues do not support
> Indirect entries and your patch does not manage to elide many writes, so I
> do not see a performance difference. In an environment with Indirect, your
> patch will likely be a win.
>
> (My patch gets most of its win by eliminating reads on the producer; when
> the producer reads avail fields at the same time the consumer is polling,
> we see cacheline transfers that hurt performance. Your patch eliminates
> writes, which is nice, but our tests w/ polling are not as sensitive to
> writes from the producer.)
>
> I have two quick comments on your patch --
> 1) I think you need to kfree vq->avail when deleting the virtqueue.
>
> 2) Should we avoid allocating a cache for virtqueues that are not
>    performance critical? (ex: virtio-scsi eventq/controlq, virtio-net
>    controlq)
>
> Should I post comments in reply to the original patch email (given that it
> is ~2 months old)?
>
> Thanks!
> -- vs;
>


* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-17  3:46     ` Xie, Huawei
@ 2015-11-18  4:08       ` Venkatesh Srinivas via Virtualization
  2015-11-18  4:28         ` Venkatesh Srinivas
  0 siblings, 1 reply; 16+ messages in thread
From: Venkatesh Srinivas via Virtualization @ 2015-11-18  4:08 UTC (permalink / raw)
  To: Xie, Huawei
  Cc: KVM list, Michael S. Tsirkin, virtualization, Venkatesh Srinivas,
	luto, David Matlack, Paolo Bonzini

On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei <huawei.xie@intel.com> wrote:

> On 11/14/2015 7:41 AM, Venkatesh Srinivas wrote:
> > On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
> >> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
> >>> Improves cacheline transfer flow of available ring header.
> >>>
> >>> Virtqueues are implemented as a pair of rings, one producer->consumer
> >>> avail ring and one consumer->producer used ring; preceding the
> >>> avail ring in memory are two contiguous u16 fields -- avail->flags
> >>> and avail->idx. A producer posts work by writing to avail->idx and
> >>> a consumer reads avail->idx.
> >>>
> >>> The flags and idx fields only need to be written by a producer CPU
> >>> and only read by a consumer CPU; when the producer and consumer are
> >>> running on different CPUs and the virtio_ring code is structured so
> >>> that the producer only writes and the consumer only reads, the avail
> >>> header cacheline can migrate core -> core entirely in the 'M'
> >>> (Modified) state. This flow optimizes core -> core bandwidth on
> >>> certain CPUs.
> >>>
> >>> (see: "Software Optimization Guide for AMD Family 15h Processors",
> >>> Section 11.6; similar language appears in the 10h guide and should
> >>> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
> >>>
> >>> Unfortunately, the existing virtio_ring code issues reads of
> >>> avail->idx and read-modify-writes of avail->flags on the producer
> >>> side.
> >>>
> >>> This change shadows the flags and index fields in producer memory;
> >>> the vring code now reads from the shadows and only ever writes to
> >>> avail->flags and avail->idx, allowing the cacheline to transfer
> >>> core -> core optimally.
> >> Sounds logical, I'll apply this after a bit of testing
> >> of my own, thanks!
> > Thanks!
>

> Venkatesh:
> Is it that your patch only applies to CPUs w/ exclusive caches?

No --- it applies when the inter-cache coherence flow is optimized by
'M' -> 'M' transfers and when producer reads might interfere w/
consumer prefetchw/reads. The AMD Optimization guides have specific
language on this subject, but other platforms may benefit.
(see Intel #'s below)

> Do you have perf data on Intel CPUs?

Good idea -- I ran some tests on a couple of Intel platforms:

(these are perf data from sample runs; for each I ran many runs, the
 numbers were pretty stable except for Haswell-EP cross-socket)
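
"Bound" below means pinned with thread affinity. As a minimal sketch
of that kind of pinning (assuming a pthreads-based benchmark; this is
not the actual vring_bench source, and error handling is omitted):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Pin the calling thread to a single CPU before the benchmark loop;
 * producer and consumer use HT siblings, same-socket cores, or
 * cross-socket cores depending on the test. */
static int pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}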

One-socket Intel Xeon W3690 ("Westmere"), 3.46 GHz; core turbo disabled
=======================================================================
(note -- w/ core turbo disabled, performance is _very_ stable; variance of
 < 0.5% run-to-run; figure of merit is "seconds elapsed" here)

* Producer / consumer bound to Hyperthread pairs:

 Performance counter stats for './vring_bench_noshadow 1000000000':

 343,425,166,916 L1-dcache-loads
      21,393,148 L1-dcache-load-misses     #    0.01% of all L1-dcache hits
  61,709,640,363 L1-dcache-stores
       5,745,690 L1-dcache-store-misses
  10,186,932,553 L1-dcache-prefetches
           1,491 L1-dcache-prefetch-misses
   121.335699344 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 1000000000':

 334,766,413,861 L1-dcache-loads
      15,787,778 L1-dcache-load-misses     #    0.00% of all L1-dcache hits
  62,735,792,799 L1-dcache-stores
       3,252,113 L1-dcache-store-misses
   9,018,273,596 L1-dcache-prefetches
             819 L1-dcache-prefetch-misses
   121.206339656 seconds time elapsed

Effectively Performance-neutral.

* Producer / consumer bound to separate cores, same socket:

 Performance counter stats for './vring_bench_noshadow 1000000000':

   399,943,384,509 L1-dcache-loads
     8,868,334,693 L1-dcache-load-misses     #    2.22% of all L1-dcache hits
    62,721,376,685 L1-dcache-stores
     2,786,806,982 L1-dcache-store-misses
    10,915,046,967 L1-dcache-prefetches
           328,508 L1-dcache-prefetch-misses
     146.585969976 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 1000000000':

   425,123,067,750 L1-dcache-loads 
     6,689,318,709 L1-dcache-load-misses     #    1.57% of all L1-dcache hits
    62,747,525,005 L1-dcache-stores 
     2,496,274,505 L1-dcache-store-misses
     8,627,873,397 L1-dcache-prefetches
           146,729 L1-dcache-prefetch-misses
     142.657327765 seconds time elapsed

2.6% reduction in runtime; note that L1-dcache-load-misses reduced dramatically,
2 Billion(!) L1d misses saved.
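(8,868,334,693 - 6,689,318,709 = 2,179,015,984 fewer
L1-dcache-load-misses over the run.)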

Two-socket Intel Sandy Bridge(-EP) Xeon, 2.6 GHz; core turbo disabled
=====================================================================

* Producer / consumer bound to Hyperthread pairs:

 Performance counter stats for './vring_bench_noshadow 100000000':

    37,129,070,402 L1-dcache-loads
         6,416,246 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
     6,207,794,675 L1-dcache-stores
         2,800,094 L1-dcache-store-misses
      17.029790809 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 100000000':

    36,799,559,391 L1-dcache-loads
        10,241,080 L1-dcache-load-misses     #    0.03% of all L1-dcache hits
     6,312,252,458 L1-dcache-stores
         2,742,239 L1-dcache-store-misses
      16.941001709 seconds time elapsed

Effectively Performance-neutral.

* Producer / consumer bound to separate cores, same socket:

 Performance counter stats for './vring_bench_noshadow 100000000':

    27,684,883,046 L1-dcache-loads
       809,933,091 L1-dcache-load-misses     #    2.93% of all L1-dcache hits
     6,219,598,352 L1-dcache-stores
         1,758,503 L1-dcache-store-misses
      15.020511218 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 100000000':

    28,092,111,012 L1-dcache-loads                                             
       716,687,011 L1-dcache-load-misses     #    2.55% of all L1-dcache hits  
     6,290,821,211 L1-dcache-stores                                            
         1,565,583 L1-dcache-store-misses                                      
      15.208420297 seconds time elapsed

Effectively Performance-neutral.

* Producer / consumer bound to separate cores, cross socket:
(Sandy Bridge-EP appears to have less cross-socket variance than Haswell-EP)

 Performance counter stats for './vring_bench_noshadow 100000000':

    35,857,245,449 L1-dcache-loads
       821,746,755 L1-dcache-load-misses     #    2.29% of all L1-dcache hits
     6,252,551,550 L1-dcache-stores
         4,665,405 L1-dcache-store-misses
      46.340035651 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 100000000':

    39,044,022,857 L1-dcache-loads
       711,731,527 L1-dcache-load-misses     #    1.82% of all L1-dcache hits
     6,349,051,557 L1-dcache-stores
         4,292,362 L1-dcache-store-misses
      42.593259436 seconds time elapsed

Runtimes for the cross-socket test have somewhat higher variance, but the
pattern in counts of L1-dcache-loads and L1-dcache-load-misses for nonshadow
vs. shadow code is very stable.

noshadow (w/o this patch) reliably clocks in at ~46 seconds, shadow ranges
from ~48 to ~42 (-2.8% to +8.0%).

Two-socket Intel Haswell(-EP) Xeon, 2.3 GHz; core turbo disabled
================================================================

* Producer / consumer bound to Hyperthread pairs:

 Performance counter stats for './vring_bench_noshadow 10000000000':

   474,856,463,271 L1-dcache-loads
        74,223,784 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
    87,274,898,671 L1-dcache-stores
        31,869,448 L1-dcache-store-misses
     243.290969318 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 10000000000':

   466,891,993,302 L1-dcache-loads
        80,859,208 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
    88,760,627,355 L1-dcache-stores
        35,727,720 L1-dcache-store-misses
     242.146970822 seconds time elapsed

Effectively Performance-neutral.

* Producer / consumer bound to separate cores, same socket:

 Performance counter stats for './vring_bench_noshadow 10000000000':

   357,657,891,797 L1-dcache-loads
     8,760,549,978 L1-dcache-load-misses     #    2.45% of all L1-dcache hits
    87,357,651,103 L1-dcache-stores
        10,166,431 L1-dcache-store-misses
     229.733047436 seconds time elapsed

 Performance counter stats for './vring_bench_shadow 10000000000':

   382,508,881,516 L1-dcache-loads
     8,348,013,630 L1-dcache-load-misses     #    2.18% of all L1-dcache hits
    88,756,639,931 L1-dcache-stores
         9,842,999 L1-dcache-store-misses
     230.850697668 seconds time elapsed

Effectively Performance-neutral.

* Producer / consumer bound to separate cores, different sockets:

Unfortunately I don't have useful numbers for this case -- even with
core turbo disabled, runtime variance is very high (10 - 30% run-to-run).

> For the perf metric you provide, why not L1-dcache-load-misses, which is
> more meaningful?

L1-dcache-load-misses is a better metric, you're right; for the original
AMD Piledriver run I posted:

 Performance counter stats for './vring_bench_noshadow':
     5,451,082,016      L1-dcache-loads
        31,690,398      L1-dcache-load-misses
        60,288,052      L1-dcache-stores
        60,517,840      LLC-loads
             9,726      LLC-load-misses
       2.221477739      seconds time elapsed
 
 Performance counter stats for './vring_bench_shadow':
     5,405,701,361      L1-dcache-loads
        31,157,235      L1-dcache-load-misses
        59,172,380      L1-dcache-stores
        59,398,269      LLC-loads
            10,944      LLC-load-misses
       2.168405376      seconds time elapsed

There is a 1.6% reduction in L1-dcache-load-misses, which lines up with
about a 2% reduction in runtime.

Summary:
* No workload on Westmere 1S, Sandy Bridge 2S, and Haswell 2S got worse;
* Westmere 1S cross-core improved by ~2.5% reliably;
* Sandy Bridge 2S cross-core cross-socket may have improved. (cross-socket
  run variance makes it hard to tell)
* AMD Piledriver tests improved by ~2%;
* Other virtio implementations (over PCIe for example) should benefit;

HTH,
-- vs;


* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-18  4:08       ` Venkatesh Srinivas via Virtualization
@ 2015-11-18  4:28         ` Venkatesh Srinivas
  2015-11-19 16:15           ` Xie, Huawei
  0 siblings, 1 reply; 16+ messages in thread
From: Venkatesh Srinivas @ 2015-11-18  4:28 UTC (permalink / raw)
  To: Xie, Huawei
  Cc: Michael S. Tsirkin, virtualization, Paolo Bonzini, David Matlack,
	KVM list, luto, Rusty Russell, Venkatesh Srinivas

On Tue, Nov 17, 2015 at 08:08:18PM -0800, Venkatesh Srinivas wrote:
> On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei <huawei.xie@intel.com> wrote:
> 
> > On 11/14/2015 7:41 AM, Venkatesh Srinivas wrote:
> > > On Wed, Nov 11, 2015 at 02:34:33PM +0200, Michael S. Tsirkin wrote:
> > >> On Tue, Nov 10, 2015 at 04:21:07PM -0800, Venkatesh Srinivas wrote:
> > >>> Improves cacheline transfer flow of available ring header.
> > >>>
> > >>> Virtqueues are implemented as a pair of rings, one producer->consumer
> > >>> avail ring and one consumer->producer used ring; preceding the
> > >>> avail ring in memory are two contiguous u16 fields -- avail->flags
> > >>> and avail->idx. A producer posts work by writing to avail->idx and
> > >>> a consumer reads avail->idx.
> > >>>
> > >>> The flags and idx fields only need to be written by a producer CPU
> > >>> and only read by a consumer CPU; when the producer and consumer are
> > >>> running on different CPUs and the virtio_ring code is structured so
> > >>> that the producer only writes and the consumer only reads, the avail
> > >>> header cacheline can migrate core -> core entirely in the 'M'
> > >>> (Modified) state. This flow optimizes core -> core bandwidth on
> > >>> certain CPUs.
> > >>>
> > >>> (see: "Software Optimization Guide for AMD Family 15h Processors",
> > >>> Section 11.6; similar language appears in the 10h guide and should
> > >>> apply to CPUs w/ exclusive caches, using LLC as a transfer cache)
> > >>>
> > >>> Unfortunately, the existing virtio_ring code issues reads of
> > >>> avail->idx and read-modify-writes of avail->flags on the producer
> > >>> side.
> > >>>
> > >>> This change shadows the flags and index fields in producer memory;
> > >>> the vring code now reads from the shadows and only ever writes to
> > >>> avail->flags and avail->idx, allowing the cacheline to transfer
> > >>> core -> core optimally.
> > >> Sounds logical, I'll apply this after a bit of testing
> > >> of my own, thanks!
> > > Thanks!
> >
> 
> > Venkatesh:
> > Is it that your patch only applies to CPUs w/ exclusive caches?
> 
> No --- it applies when the inter-cache coherence flow is optimized by
> 'M' -> 'M' transfers and when producer reads might interfere w/
> consumer prefetchw/reads. The AMD Optimization guides have specific
> language on this subject, but other platforms may benefit.
> (see Intel #'s below)
> 
> > Do you have perf data on Intel CPUs?
> 
> Good idea -- I ran some tests on a couple of Intel platforms:
> 
> (these are perf data from sample runs; for each I ran many runs, the
>  numbers were pretty stable except for Haswell-EP cross-socket)
> 
> One-socket Intel Xeon W3690 ("Westmere"), 3.46 GHz; core turbo disabled
> =======================================================================
> (note -- w/ core turbo disabled, performance is _very_ stable; variance of
>  < 0.5% run-to-run; figure of merit is "seconds elapsed" here)
> 
> * Producer / consumer bound to Hyperthread pairs:
> 
>  Performance counter stats for './vring_bench_noshadow 1000000000':
> 
>  343,425,166,916 L1-dcache-loads
>       21,393,148 L1-dcache-load-misses     #    0.01% of all L1-dcache hits
>   61,709,640,363 L1-dcache-stores
>        5,745,690 L1-dcache-store-misses
>   10,186,932,553 L1-dcache-prefetches
>            1,491 L1-dcache-prefetch-misses
>    121.335699344 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 1000000000':
> 
>  334,766,413,861 L1-dcache-loads
>       15,787,778 L1-dcache-load-misses     #    0.00% of all L1-dcache hits
>   62,735,792,799 L1-dcache-stores
>        3,252,113 L1-dcache-store-misses
>    9,018,273,596 L1-dcache-prefetches
>              819 L1-dcache-prefetch-misses
>    121.206339656 seconds time elapsed
> 
> Effectively Performance-neutral.
> 
> * Producer / consumer bound to separate cores, same socket:
> 
>  Performance counter stats for './vring_bench_noshadow 1000000000':
> 
>    399,943,384,509 L1-dcache-loads
>      8,868,334,693 L1-dcache-load-misses     #    2.22% of all L1-dcache hits
>     62,721,376,685 L1-dcache-stores
>      2,786,806,982 L1-dcache-store-misses
>     10,915,046,967 L1-dcache-prefetches
>            328,508 L1-dcache-prefetch-misses
>      146.585969976 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 1000000000':
> 
>    425,123,067,750 L1-dcache-loads 
>      6,689,318,709 L1-dcache-load-misses     #    1.57% of all L1-dcache hits
>     62,747,525,005 L1-dcache-stores 
>      2,496,274,505 L1-dcache-store-misses
>      8,627,873,397 L1-dcache-prefetches
>            146,729 L1-dcache-prefetch-misses
>      142.657327765 seconds time elapsed
> 
> 2.6% reduction in runtime; note that L1-dcache-load-misses reduced
> dramatically, 2 Billion(!) L1d misses saved.
> 
> Two-socket Intel Sandy Bridge(-EP) Xeon, 2.6 GHz; core turbo disabled
> =====================================================================
> 
> * Producer / consumer bound to Hyperthread pairs:
> 
>  Performance counter stats for './vring_bench_noshadow 100000000':
> 
>     37,129,070,402 L1-dcache-loads
>          6,416,246 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
>      6,207,794,675 L1-dcache-stores
>          2,800,094 L1-dcache-store-misses
>       17.029790809 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 100000000':
> 
>     36,799,559,391 L1-dcache-loads
>         10,241,080 L1-dcache-load-misses     #    0.03% of all L1-dcache hits
>      6,312,252,458 L1-dcache-stores
>          2,742,239 L1-dcache-store-misses
>       16.941001709 seconds time elapsed
> 
> Effectively performance-neutral.
> 
> * Producer / consumer bound to separate cores, same socket:
> 
>  Performance counter stats for './vring_bench_noshadow 100000000':
> 
>     27,684,883,046 L1-dcache-loads
>        809,933,091 L1-dcache-load-misses     #    2.93% of all L1-dcache hits
>      6,219,598,352 L1-dcache-stores
>          1,758,503 L1-dcache-store-misses
>       15.020511218 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 100000000':
> 
>     28,092,111,012 L1-dcache-loads                     
>        716,687,011 L1-dcache-load-misses     #    2.55% of all L1-dcache hits 
>      6,290,821,211 L1-dcache-stores 
>          1,565,583 L1-dcache-store-misses                                    
>       15.208420297 seconds time elapsed
> 
> Effectively performance-neutral.
> 
> * Producer / consumer bound to separate cores, cross socket:
> (Sandy Bridge-EP appears to have less cross-socket variance than Haswell-EP)
> 
>  Performance counter stats for './vring_bench_noshadow 100000000':
> 
>     35,857,245,449 L1-dcache-loads
>        821,746,755 L1-dcache-load-misses     #    2.29% of all L1-dcache hits
>      6,252,551,550 L1-dcache-stores
>          4,665,405 L1-dcache-store-misses
>       46.340035651 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 100000000':
> 
>     39,044,022,857 L1-dcache-loads
>        711,731,527 L1-dcache-load-misses     #    1.82% of all L1-dcache hits
>      6,349,051,557 L1-dcache-stores
>          4,292,362 L1-dcache-store-misses
>       42.593259436 seconds time elapsed
> 
> Runtimes for the cross-socket test have somewhat higher variance, but the
> pattern in counts of L1-dcache-loads and L1-dcache-load-misses for noshadow
> vs. shadow code is very stable.
> 
> noshadow (w/o this patch) reliably clocks in at ~46 seconds, while shadow
> ranges from ~48 to ~42 seconds (-2.8% to +8.0%).
> 
> Two-socket Intel Haswell(-EP) Xeon, 2.3 GHz; core turbo disabled
> ================================================================
> 
> * Producer / consumer bound to Hyperthread pairs:
> 
>  Performance counter stats for './vring_bench_noshadow 10000000000':
> 
>    474,856,463,271 L1-dcache-loads
>         74,223,784 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
>     87,274,898,671 L1-dcache-stores
>         31,869,448 L1-dcache-store-misses
>      243.290969318 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 10000000000':
> 
>    466,891,993,302 L1-dcache-loads
>         80,859,208 L1-dcache-load-misses     #    0.02% of all L1-dcache hits
>     88,760,627,355 L1-dcache-stores
>         35,727,720 L1-dcache-store-misses
>      242.146970822 seconds time elapsed
> 
> Effectively performance-neutral.
> 
> * Producer / consumer bound to separate cores, same socket:
> 
>  Performance counter stats for './vring_bench_noshadow 10000000000':
> 
>    357,657,891,797 L1-dcache-loads
>      8,760,549,978 L1-dcache-load-misses     #    2.45% of all L1-dcache hits
>     87,357,651,103 L1-dcache-stores
>         10,166,431 L1-dcache-store-misses
>      229.733047436 seconds time elapsed
> 
>  Performance counter stats for './vring_bench_shadow 10000000000':
> 
>    382,508,881,516 L1-dcache-loads
>      8,348,013,630 L1-dcache-load-misses     #    2.18% of all L1-dcache hits
>     88,756,639,931 L1-dcache-stores
>          9,842,999 L1-dcache-store-misses
>      230.850697668 seconds time elapsed
> 
> Effectively performance-neutral.
> 
> * Producer / consumer bound to separate cores, different sockets:
> 
> Unfortunately I don't have useful numbers for this case -- even with
> core turbo disabled, runtime variance is very high (10 - 30% run-to-run).
> 
> > For the perf metric you provide, why not use L1-dcache-load-misses, which is
> > more meaningful?
> 
> L1-dcache-load-misses is a better metric, you're right; for the original
> AMD Piledriver run I posted:
> 
>  Performance counter stats for './vring_bench_noshadow':
>      5,451,082,016      L1-dcache-loads
>         31,690,398      L1-dcache-load-misses
>         60,288,052      L1-dcache-stores
>         60,517,840      LLC-loads
>              9,726      LLC-load-misses
>        2.221477739      seconds time elapsed
>  
>  Performance counter stats for './vring_bench_shadow':
>      5,405,701,361      L1-dcache-loads
>         31,157,235      L1-dcache-load-misses
>         59,172,380      L1-dcache-stores
>         59,398,269      LLC-loads
>             10,944      LLC-load-misses
>        2.168405376      seconds time elapsed
> 
> There is a 1.6% reduction in L1-dcache-load-misses, which lines up with
> about a 2% reduction in runtime.
> 
> Summary:
> * No workload on Westmere 1S, Sandy Bridge 2S, or Haswell 2S got worse;
> * Westmere 1S cross-core improved by ~2.5% reliably;
> * Sandy Bridge 2S cross-core, cross-socket may have improved (cross-socket
>   run variance makes it hard to tell);
> * AMD Piledriver tests improved by ~2%;
> * Other virtio implementations (over PCIe, for example) should benefit.
> HTH,
> -- vs;

I'm sorry -- I appear to have added an unintentional HTML part to my
reply. This would prevent the message from appearing on the kvm@ mailing list,
at a minimum.

Re-posting with the HTML part scrubbed.

Sorry,
-- vs;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-18  4:28         ` Venkatesh Srinivas
@ 2015-11-19 16:15           ` Xie, Huawei
  2015-11-20 18:30             ` Venkatesh Srinivas via Virtualization
  2015-11-20 18:30             ` Venkatesh Srinivas
  2015-11-19 16:15           ` Xie, Huawei
  1 sibling, 2 replies; 16+ messages in thread
From: Xie, Huawei @ 2015-11-19 16:15 UTC (permalink / raw)
  To: Venkatesh Srinivas
  Cc: Michael S. Tsirkin, virtualization, Paolo Bonzini, David Matlack,
	KVM list, luto, Rusty Russell, Venkatesh Srinivas

On 11/18/2015 12:28 PM, Venkatesh Srinivas wrote:
> On Tue, Nov 17, 2015 at 08:08:18PM -0800, Venkatesh Srinivas wrote:
>> On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei <huawei.xie@intel.com> wrote:
>>
>>> Venkatesh:
>>> Is it that your patch only applies to CPUs w/ exclusive caches?
>> No --- it applies when the inter-cache coherence flow is optimized by
>> 'M' -> 'M' transfers and when producer reads might interfere w/
>> consumer prefetchw/reads. The AMD Optimization guides have specific
>> language on this subject, but other platforms may benefit.
>> (see Intel #'s below)
For the core-to-core case (not an HT pair): after the consumer reads that M
cache line for avail_idx, is that line still in the producer core's L1 data
cache, with its state changing from M -> O?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-19 16:15           ` Xie, Huawei
  2015-11-20 18:30             ` Venkatesh Srinivas via Virtualization
@ 2015-11-20 18:30             ` Venkatesh Srinivas
  2015-11-23 16:46               ` Xie, Huawei
  2015-11-23 16:46               ` Xie, Huawei
  1 sibling, 2 replies; 16+ messages in thread
From: Venkatesh Srinivas @ 2015-11-20 18:30 UTC (permalink / raw)
  To: Xie, Huawei
  Cc: Michael S. Tsirkin, virtualization, Paolo Bonzini, David Matlack,
	KVM list, luto, Rusty Russell, Venkatesh Srinivas

On Thu, Nov 19, 2015 at 04:15:48PM +0000, Xie, Huawei wrote:
> On 11/18/2015 12:28 PM, Venkatesh Srinivas wrote:
> > On Tue, Nov 17, 2015 at 08:08:18PM -0800, Venkatesh Srinivas wrote:
> >> On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei <huawei.xie@intel.com> wrote:
> >>
> >>> Venkatesh:
> >>> Is it that your patch only applies to CPUs w/ exclusive caches?
> >> No --- it applies when the inter-cache coherence flow is optimized by
> >> 'M' -> 'M' transfers and when producer reads might interfere w/
> >> consumer prefetchw/reads. The AMD Optimization guides have specific
> >> language on this subject, but other platforms may benefit.
> >> (see Intel #'s below)
> For the core-to-core case (not an HT pair): after the consumer reads that M
> cache line for avail_idx, is that line still in the producer core's L1 data
> cache, with its state changing from M -> O?

Textbook MOESI would not allow that state combination -- when the consumer
gets the line in 'M' state, the producer cannot hold it in 'O' state.

On the AMD Piledriver, per the Optimization guide, I use PREFETCHW/load to
get the line in 'M' state on the consumer (invalidating it in the producer's
cache):

"* Use PREFETCHW on the consumer side, even if the consumer will not modify
   the data"

That, plus the "Optimizing Inter-Core Data Transfer" section, implies that
PREFETCHW + MOV will cause the consumer to load the line into 'M' state.
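
As a userspace sketch of that consumer-side sequence (assuming a
compiler/target where a write-hint prefetch lowers to PREFETCHW, e.g.
gcc's __builtin_prefetch with rw=1 when building for Piledriver):

    #include <stdint.h>

    /* Pull the avail-header line in for-write ('M' state) before reading
     * it, so the consumer takes the line outright instead of leaving it
     * dirty-shared with the producer. */
    static inline uint16_t consumer_read_idx(const volatile uint16_t *avail_idx)
    {
        /* rw=1 requests a write prefetch; on AMD this is PREFETCHW. */
        __builtin_prefetch((const void *)avail_idx, 1, 3);
        return *avail_idx;
    }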

PREFETCHW was not available on Intel CPUs pre-Broadwell; from the public
documentation alone, I don't think we can tell what transition the producer's
cacheline undergoes on those cores. For that matter, the latest documentation
I can find (for Nehalem) indicates there was no 'O' state -- Nehalem
implemented MESIF, not MOESI.

HTH,
-- vs;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] virtio_ring: Shadow available ring flags & index
  2015-11-20 18:30             ` Venkatesh Srinivas
  2015-11-23 16:46               ` Xie, Huawei
@ 2015-11-23 16:46               ` Xie, Huawei
  1 sibling, 0 replies; 16+ messages in thread
From: Xie, Huawei @ 2015-11-23 16:46 UTC (permalink / raw)
  To: Venkatesh Srinivas
  Cc: Michael S. Tsirkin, virtualization, Paolo Bonzini, David Matlack,
	KVM list, luto, Rusty Russell, Venkatesh Srinivas, Lu, Patrick,
	Wang, Zhihong, Abhinkar, Bindiya S

On 11/21/2015 2:30 AM, Venkatesh Srinivas wrote:
> On Thu, Nov 19, 2015 at 04:15:48PM +0000, Xie, Huawei wrote:
>> On 11/18/2015 12:28 PM, Venkatesh Srinivas wrote:
>>> On Tue, Nov 17, 2015 at 08:08:18PM -0800, Venkatesh Srinivas wrote:
>>>> On Mon, Nov 16, 2015 at 7:46 PM, Xie, Huawei <huawei.xie@intel.com> wrote:
>>>>
>>>>> Venkatesh:
>>>>> Is it that your patch only applies to CPUs w/ exclusive caches?
>>>> No --- it applies when the inter-cache coherence flow is optimized by
>>>> 'M' -> 'M' transfers and when producer reads might interfere w/
>>>> consumer prefetchw/reads. The AMD Optimization guides have specific
>>>> language on this subject, but other platforms may benefit.
>>>> (see Intel #'s below)
>> For the core-to-core case (not an HT pair): after the consumer reads that M
>> cache line for avail_idx, is that line still in the producer core's L1 data
>> cache, with its state changing from M -> O?
> Textbook MOESI would not allow that state combination -- when the consumer
> gets the line in 'M' state, the producer cannot hold it in 'O' state.
Hi Venkatesh:
On the consumer core, you are using (prefetchw + load) to get the cache line
anyway, even though the consumer doesn't mean to write, right? That makes
sense for your cache line transfer.
If using a load only, the cache line on the producer core should change
from M -> O, meaning dirty sharing, and the consumer gets the line in
S state.

I might be missing something important in your case. Could you give a more
detailed description?
For the non-shadow case:
1) The producer updates flags or idx; the cache line goes to M state.
2) When the consumer reads the idx or flags, the cache line is set to S state
on the consumer core, while the cache line on the producer goes to O state.
What is the problem with reading the avail idx/flags while its cache line is
in either M or O state on the producer core? And what is the benefit with vs.
without prefetchw?

>
> On the AMD Piledriver, per the Optimization guide, I use PREFETCHW/Load to
> get the line in 'M' state on the consumer (invalidating it in the Producer's
> cache):
>
> "* Use PREFETCHW on the consumer side, even if the consumer will not modify
>    the data"
>
> That, plus the "Optimizing Inter-Core Data Transfer" section imply that
> PREFETCHW + MOV will cause the consumer to load the line into 'M' state.
>
> PREFETCHW was not available on Intel CPUs pre-Broadwell; from the public
> documentation alone, I don't think we can tell what transition the producer's
> cacheline undergoes on these cores. For that matter, the latest documentation
> I can find (for Nehalem), indicated there was no 'O' state -- Nehalem
> implemented MESIF, not MOESI.
By O, I mean AMD MOESI, and I thought you were using only a load to bring
the cache line onto the consumer core. If you are using prefetchw + load,
that makes sense for the state transfer.
>
> HTH,
> -- vs;
>


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2015-11-23 16:46 UTC | newest]

Thread overview: 9 messages
-- links below jump to the message on this page --
2015-11-11  0:21 [PATCH] virtio_ring: Shadow available ring flags & index Venkatesh Srinivas
2015-11-11 12:34 ` Michael S. Tsirkin
2015-11-13 23:41   ` Venkatesh Srinivas
2015-11-17  3:46     ` Xie, Huawei
2015-11-18  4:08       ` Venkatesh Srinivas
2015-11-18  4:28         ` Venkatesh Srinivas
2015-11-19 16:15           ` Xie, Huawei
2015-11-20 18:30             ` Venkatesh Srinivas
2015-11-23 16:46               ` Xie, Huawei
