* [PATCH v3] vhost: Add indirect descriptors support to the TX path
@ 2016-09-23  8:28 Maxime Coquelin
  2016-09-23 15:49 ` Michael S. Tsirkin
                   ` (2 more replies)
  0 siblings, 3 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-09-23  8:28 UTC (permalink / raw)
  To: yuanhan.liu, huawei.xie, dev; +Cc: vkaplans, mst, stephen

Indirect descriptors are usually supported by virtio-net devices,
allowing a larger number of requests to be dispatched.

When the virtio device sends a packet using indirect descriptors,
only one slot is used in the ring, even for large packets.
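
For reference, a minimal sketch (illustrative only, not part of this
patch) of the guest-side layout described above; the helper name and
its parameters are hypothetical:

#include <stdint.h>
#include <linux/virtio_ring.h>	/* struct vring_desc, VRING_DESC_F_* */

/*
 * Hypothetical guest-side helper, for illustration only: the virtio-net
 * header and the packet data are described in a separate two-entry
 * descriptor table, and the ring itself only carries one descriptor
 * pointing at that table.  All addresses are guest-physical.
 */
static void
fill_indirect_slot(struct vring_desc table[2], struct vring_desc *ring_slot,
		   uint64_t table_gpa, uint64_t hdr_gpa, uint32_t hdr_len,
		   uint64_t data_gpa, uint32_t data_len)
{
	table[0] = (struct vring_desc) {
		.addr  = hdr_gpa,
		.len   = hdr_len,
		.flags = VRING_DESC_F_NEXT,
		.next  = 1,
	};
	table[1] = (struct vring_desc) {
		.addr = data_gpa,
		.len  = data_len,
	};

	/* One ring slot, whatever the number of segments in the table. */
	*ring_slot = (struct vring_desc) {
		.addr  = table_gpa,
		.len   = 2 * sizeof(struct vring_desc),
		.flags = VRING_DESC_F_INDIRECT,
	};
}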

The main effect is to improve the 0% packet loss benchmark.
A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
(fwd io for host, macswap for VM) on DUT shows a +50% gain for
zero loss.

On the downside, a micro-benchmark using testpmd txonly in the VM and
rxonly on the host shows a loss between 1 and 4%. But depending on
the needs, the feature can be disabled at VM boot time by passing
the indirect_desc=off argument to the vhost-user device in Qemu.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
Changes since v2:
=================
 - Revert back to not checking feature flag to be aligned with
kernel implementation
 - Ensure we don't have nested indirect descriptors
 - Ensure the indirect desc address is valid, to protect against
malicious guests

Changes since RFC:
=================
 - Enrich commit message with figures
 - Rebased on top of dpdk-next-virtio's master
 - Add feature check to ensure we don't receive an indirect desc
if not supported by the virtio driver

 lib/librte_vhost/vhost.c      |  3 ++-
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..30bb0ce 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -65,7 +65,8 @@
 				(1ULL << VIRTIO_NET_F_CSUM)    | \
 				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
 				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
+				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
 
 uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..2e0a587 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
 }
 
 static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx,
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
 		  struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
@@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	/* A counter to avoid desc dead loop chain */
 	uint32_t nr_desc = 1;
 
-	desc = &vq->desc[desc_idx];
-	if (unlikely(desc->len < dev->vhost_hlen))
+	desc = &descs[desc_idx];
+	if (unlikely((desc->len < dev->vhost_hlen)) ||
+			(desc->flags & VRING_DESC_F_INDIRECT))
 		return -1;
 
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 */
 	if (likely((desc->len == dev->vhost_hlen) &&
 		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-		desc = &vq->desc[desc->next];
+		desc = &descs[desc->next];
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+			return -1;
 
 		desc_addr = gpa_to_vva(dev, desc->addr);
 		if (unlikely(!desc_addr))
@@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
 				break;
 
-			if (unlikely(desc->next >= vq->size ||
-				     ++nr_desc > vq->size))
+			if (unlikely(desc->next >= max_desc ||
+				     ++nr_desc > max_desc))
+				return -1;
+			desc = &descs[desc->next];
+			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 				return -1;
-			desc = &vq->desc[desc->next];
 
 			desc_addr = gpa_to_vva(dev, desc->addr);
 			if (unlikely(!desc_addr))
@@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	/* Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 	for (i = 0; i < count; i++) {
+		struct vring_desc *desc;
+		uint16_t sz, idx;
 		int err;
 
 		if (likely(i + 1 < count))
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
+		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+			desc = (struct vring_desc *)gpa_to_vva(dev,
+					vq->desc[desc_indexes[i]].addr);
+			if (unlikely(!desc))
+				break;
+
+			rte_prefetch0(desc);
+			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+			idx = 0;
+		} else {
+			desc = vq->desc;
+			sz = vq->size;
+			idx = desc_indexes[i];
+		}
+
 		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
-		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-					mbuf_pool);
+		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23  8:28 [PATCH v3] vhost: Add indirect descriptors support to the TX path Maxime Coquelin
@ 2016-09-23 15:49 ` Michael S. Tsirkin
  2016-09-23 18:02   ` Maxime Coquelin
  2016-09-27  4:15 ` Yuanhan Liu
  2016-09-27  8:42 ` [PATCH v4] " Maxime Coquelin
  2 siblings, 1 reply; 52+ messages in thread
From: Michael S. Tsirkin @ 2016-09-23 15:49 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: yuanhan.liu, huawei.xie, dev, vkaplans, stephen

On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> Indirect descriptors are usually supported by virtio-net devices,
> allowing to dispatch a larger number of requests.
> 
> When the virtio device sends a packet using indirect descriptors,
> only one slot is used in the ring, even for large packets.
> 
> The main effect is to improve the 0% packet loss benchmark.
> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> zero loss.
> 
> On the downside, micro-benchmark using testpmd txonly in VM and
> rxonly on host shows a loss between 1 and 4%.i But depending on
> the needs, feature can be disabled at VM boot time by passing
> indirect_desc=off argument to vhost-user device in Qemu.

Even better, change guest pmd to only use indirect
descriptors when this makes sense (e.g. sufficiently
large packets).

I would be very interested to know when it makes
sense.

The feature is there; it's up to the guest whether to
use it.


> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
> Changes since v2:
> =================
>  - Revert back to not checking feature flag to be aligned with
> kernel implementation
>  - Ensure we don't have nested indirect descriptors
>  - Ensure the indirect desc address is valid, to protect against
> malicious guests
> 
> Changes since RFC:
> =================
>  - Enrich commit message with figures
>  - Rebased on top of dpdk-next-virtio's master
>  - Add feature check to ensure we don't receive an indirect desc
> if not supported by the virtio driver
> 
>  lib/librte_vhost/vhost.c      |  3 ++-
>  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
>  2 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 46095c3..30bb0ce 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -65,7 +65,8 @@
>  				(1ULL << VIRTIO_NET_F_CSUM)    | \
>  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
>  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
> +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
>  
>  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
>  
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8a151af..2e0a587 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
>  }
>  
>  static inline int __attribute__((always_inline))
> -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx,
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
>  		  struct rte_mempool *mbuf_pool)
>  {
>  	struct vring_desc *desc;
> @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	/* A counter to avoid desc dead loop chain */
>  	uint32_t nr_desc = 1;
>  
> -	desc = &vq->desc[desc_idx];
> -	if (unlikely(desc->len < dev->vhost_hlen))
> +	desc = &descs[desc_idx];
> +	if (unlikely((desc->len < dev->vhost_hlen)) ||
> +			(desc->flags & VRING_DESC_F_INDIRECT))
>  		return -1;
>  
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	 */
>  	if (likely((desc->len == dev->vhost_hlen) &&
>  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> -		desc = &vq->desc[desc->next];
> +		desc = &descs[desc->next];
> +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> +			return -1;
>  
>  		desc_addr = gpa_to_vva(dev, desc->addr);
>  		if (unlikely(!desc_addr))


Just to make sure, does this still allow a chain of
direct descriptors ending with an indirect one?
This is legal as per spec.

> @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
>  				break;
>  
> -			if (unlikely(desc->next >= vq->size ||
> -				     ++nr_desc > vq->size))
> +			if (unlikely(desc->next >= max_desc ||
> +				     ++nr_desc > max_desc))
> +				return -1;
> +			desc = &descs[desc->next];
> +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>  				return -1;
> -			desc = &vq->desc[desc->next];
>  
>  			desc_addr = gpa_to_vva(dev, desc->addr);
>  			if (unlikely(!desc_addr))
> @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  	/* Prefetch descriptor index. */
>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>  	for (i = 0; i < count; i++) {
> +		struct vring_desc *desc;
> +		uint16_t sz, idx;
>  		int err;
>  
>  		if (likely(i + 1 < count))
>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>  
> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> +			desc = (struct vring_desc *)gpa_to_vva(dev,
> +					vq->desc[desc_indexes[i]].addr);
> +			if (unlikely(!desc))
> +				break;
> +
> +			rte_prefetch0(desc);
> +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> +			idx = 0;
> +		} else {
> +			desc = vq->desc;
> +			sz = vq->size;
> +			idx = desc_indexes[i];
> +		}
> +
>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
> -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> -					mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> -- 
> 2.7.4

Something that I'm missing here: it's legal for guest
to add indirect descriptors for RX.
I don't see the handling of RX here though.
I think it's required for spec compliance.

-- 
MST

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 15:49 ` Michael S. Tsirkin
@ 2016-09-23 18:02   ` Maxime Coquelin
  2016-09-23 18:06     ` Michael S. Tsirkin
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-09-23 18:02 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: yuanhan.liu, huawei.xie, dev, vkaplans, stephen



On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>> Indirect descriptors are usually supported by virtio-net devices,
>> allowing to dispatch a larger number of requests.
>>
>> When the virtio device sends a packet using indirect descriptors,
>> only one slot is used in the ring, even for large packets.
>>
>> The main effect is to improve the 0% packet loss benchmark.
>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>> zero loss.
>>
>> On the downside, micro-benchmark using testpmd txonly in VM and
>> rxonly on host shows a loss between 1 and 4%.i But depending on
>> the needs, feature can be disabled at VM boot time by passing
>> indirect_desc=off argument to vhost-user device in Qemu.
>
> Even better, change guest pmd to only use indirect
> descriptors when this makes sense (e.g. sufficiently
> large packets).
With the micro-benchmark, the degradation is quite constant whatever
the packet size.

For PVP, I could not test with packets larger than 64 bytes, as I don't
have a 40G interface, and line rate with 10G is reached rapidly.

> I would be very interested to know when does it make
> sense.
>
> The feature is there, it's up to guest whether to
> use it.
Do you mean the PMD should dynamically detect whether to use indirect
descriptors, or that there should be an option at device init time to
enable or disable the feature?

>
>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>> Changes since v2:
>> =================
>>  - Revert back to not checking feature flag to be aligned with
>> kernel implementation
>>  - Ensure we don't have nested indirect descriptors
>>  - Ensure the indirect desc address is valid, to protect against
>> malicious guests
>>
>> Changes since RFC:
>> =================
>>  - Enrich commit message with figures
>>  - Rebased on top of dpdk-next-virtio's master
>>  - Add feature check to ensure we don't receive an indirect desc
>> if not supported by the virtio driver
>>
>>  lib/librte_vhost/vhost.c      |  3 ++-
>>  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
>>  2 files changed, 33 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
>> index 46095c3..30bb0ce 100644
>> --- a/lib/librte_vhost/vhost.c
>> +++ b/lib/librte_vhost/vhost.c
>> @@ -65,7 +65,8 @@
>>  				(1ULL << VIRTIO_NET_F_CSUM)    | \
>>  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
>>  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
>> -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
>> +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
>> +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
>>
>>  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
>>
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index 8a151af..2e0a587 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
>>  }
>>
>>  static inline int __attribute__((always_inline))
>> -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> -		  struct rte_mbuf *m, uint16_t desc_idx,
>> +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
>> +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
>>  		  struct rte_mempool *mbuf_pool)
>>  {
>>  	struct vring_desc *desc;
>> @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>  	/* A counter to avoid desc dead loop chain */
>>  	uint32_t nr_desc = 1;
>>
>> -	desc = &vq->desc[desc_idx];
>> -	if (unlikely(desc->len < dev->vhost_hlen))
>> +	desc = &descs[desc_idx];
>> +	if (unlikely((desc->len < dev->vhost_hlen)) ||
>> +			(desc->flags & VRING_DESC_F_INDIRECT))
>>  		return -1;
>>
>>  	desc_addr = gpa_to_vva(dev, desc->addr);
>> @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>  	 */
>>  	if (likely((desc->len == dev->vhost_hlen) &&
>>  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
>> -		desc = &vq->desc[desc->next];
>> +		desc = &descs[desc->next];
>> +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>> +			return -1;
>>
>>  		desc_addr = gpa_to_vva(dev, desc->addr);
>>  		if (unlikely(!desc_addr))
>
>
> Just to make sure, does this still allow a chain of
> direct descriptors ending with an indirect one?
> This is legal as per spec.
>
>> @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
>>  				break;
>>
>> -			if (unlikely(desc->next >= vq->size ||
>> -				     ++nr_desc > vq->size))
>> +			if (unlikely(desc->next >= max_desc ||
>> +				     ++nr_desc > max_desc))
>> +				return -1;
>> +			desc = &descs[desc->next];
>> +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>>  				return -1;
>> -			desc = &vq->desc[desc->next];
>>
>>  			desc_addr = gpa_to_vva(dev, desc->addr);
>>  			if (unlikely(!desc_addr))
>> @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>>  	/* Prefetch descriptor index. */
>>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>>  	for (i = 0; i < count; i++) {
>> +		struct vring_desc *desc;
>> +		uint16_t sz, idx;
>>  		int err;
>>
>>  		if (likely(i + 1 < count))
>>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>>
>> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
>> +			desc = (struct vring_desc *)gpa_to_vva(dev,
>> +					vq->desc[desc_indexes[i]].addr);
>> +			if (unlikely(!desc))
>> +				break;
>> +
>> +			rte_prefetch0(desc);
>> +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
>> +			idx = 0;
>> +		} else {
>> +			desc = vq->desc;
>> +			sz = vq->size;
>> +			idx = desc_indexes[i];
>> +		}
>> +
>>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>>  		if (unlikely(pkts[i] == NULL)) {
>>  			RTE_LOG(ERR, VHOST_DATA,
>>  				"Failed to allocate memory for mbuf.\n");
>>  			break;
>>  		}
>> -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
>> -					mbuf_pool);
>> +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
>>  		if (unlikely(err)) {
>>  			rte_pktmbuf_free(pkts[i]);
>>  			break;
>> --
>> 2.7.4
>
> Something that I'm missing here: it's legal for guest
> to add indirect descriptors for RX.
> I don't see the handling of RX here though.
> I think it's required for spec compliance.
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 18:02   ` Maxime Coquelin
@ 2016-09-23 18:06     ` Michael S. Tsirkin
  2016-09-23 18:16       ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Michael S. Tsirkin @ 2016-09-23 18:06 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: yuanhan.liu, huawei.xie, dev, vkaplans, stephen

On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
> 
> 
> On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> > > Indirect descriptors are usually supported by virtio-net devices,
> > > allowing to dispatch a larger number of requests.
> > > 
> > > When the virtio device sends a packet using indirect descriptors,
> > > only one slot is used in the ring, even for large packets.
> > > 
> > > The main effect is to improve the 0% packet loss benchmark.
> > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > zero loss.
> > > 
> > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > the needs, feature can be disabled at VM boot time by passing
> > > indirect_desc=off argument to vhost-user device in Qemu.
> > 
> > Even better, change guest pmd to only use indirect
> > descriptors when this makes sense (e.g. sufficiently
> > large packets).
> With the micro-benchmark, the degradation is quite constant whatever
> the packet size.
> 
> For PVP, I could not test with larger packets than 64 bytes, as I don't
> have a 40G interface,

Don't 64 byte packets fit in a single slot anyway?
Why would there be an effect with that?

> and line rate with 10G is reached rapidly.

Right, but focus on packet loss. You can have that at any rate.

> 
> > I would be very interested to know when does it make
> > sense.
> > 
> > The feature is there, it's up to guest whether to
> > use it.
> Do you mean the PMD should detect dynamically whether using indirect,
> or having an option at device init time to enable or not the feature?

guest PMD should not use indirect where it slows things down.

> > 
> > 
> > > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > ---
> > > Changes since v2:
> > > =================
> > >  - Revert back to not checking feature flag to be aligned with
> > > kernel implementation
> > >  - Ensure we don't have nested indirect descriptors
> > >  - Ensure the indirect desc address is valid, to protect against
> > > malicious guests
> > > 
> > > Changes since RFC:
> > > =================
> > >  - Enrich commit message with figures
> > >  - Rebased on top of dpdk-next-virtio's master
> > >  - Add feature check to ensure we don't receive an indirect desc
> > > if not supported by the virtio driver
> > > 
> > >  lib/librte_vhost/vhost.c      |  3 ++-
> > >  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
> > >  2 files changed, 33 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > > index 46095c3..30bb0ce 100644
> > > --- a/lib/librte_vhost/vhost.c
> > > +++ b/lib/librte_vhost/vhost.c
> > > @@ -65,7 +65,8 @@
> > >  				(1ULL << VIRTIO_NET_F_CSUM)    | \
> > >  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
> > >  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> > > -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
> > > +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> > > +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
> > > 
> > >  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
> > > 
> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > > index 8a151af..2e0a587 100644
> > > --- a/lib/librte_vhost/virtio_net.c
> > > +++ b/lib/librte_vhost/virtio_net.c
> > > @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> > >  }
> > > 
> > >  static inline int __attribute__((always_inline))
> > > -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > -		  struct rte_mbuf *m, uint16_t desc_idx,
> > > +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> > > +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
> > >  		  struct rte_mempool *mbuf_pool)
> > >  {
> > >  	struct vring_desc *desc;
> > > @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >  	/* A counter to avoid desc dead loop chain */
> > >  	uint32_t nr_desc = 1;
> > > 
> > > -	desc = &vq->desc[desc_idx];
> > > -	if (unlikely(desc->len < dev->vhost_hlen))
> > > +	desc = &descs[desc_idx];
> > > +	if (unlikely((desc->len < dev->vhost_hlen)) ||
> > > +			(desc->flags & VRING_DESC_F_INDIRECT))
> > >  		return -1;
> > > 
> > >  	desc_addr = gpa_to_vva(dev, desc->addr);
> > > @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >  	 */
> > >  	if (likely((desc->len == dev->vhost_hlen) &&
> > >  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> > > -		desc = &vq->desc[desc->next];
> > > +		desc = &descs[desc->next];
> > > +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > > +			return -1;
> > > 
> > >  		desc_addr = gpa_to_vva(dev, desc->addr);
> > >  		if (unlikely(!desc_addr))
> > 
> > 
> > Just to make sure, does this still allow a chain of
> > direct descriptors ending with an indirect one?
> > This is legal as per spec.
> > 
> > > @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
> > >  				break;
> > > 
> > > -			if (unlikely(desc->next >= vq->size ||
> > > -				     ++nr_desc > vq->size))
> > > +			if (unlikely(desc->next >= max_desc ||
> > > +				     ++nr_desc > max_desc))
> > > +				return -1;
> > > +			desc = &descs[desc->next];
> > > +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > >  				return -1;
> > > -			desc = &vq->desc[desc->next];
> > > 
> > >  			desc_addr = gpa_to_vva(dev, desc->addr);
> > >  			if (unlikely(!desc_addr))
> > > @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> > >  	/* Prefetch descriptor index. */
> > >  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > >  	for (i = 0; i < count; i++) {
> > > +		struct vring_desc *desc;
> > > +		uint16_t sz, idx;
> > >  		int err;
> > > 
> > >  		if (likely(i + 1 < count))
> > >  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> > > 
> > > +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> > > +			desc = (struct vring_desc *)gpa_to_vva(dev,
> > > +					vq->desc[desc_indexes[i]].addr);
> > > +			if (unlikely(!desc))
> > > +				break;
> > > +
> > > +			rte_prefetch0(desc);
> > > +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> > > +			idx = 0;
> > > +		} else {
> > > +			desc = vq->desc;
> > > +			sz = vq->size;
> > > +			idx = desc_indexes[i];
> > > +		}
> > > +
> > >  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> > >  		if (unlikely(pkts[i] == NULL)) {
> > >  			RTE_LOG(ERR, VHOST_DATA,
> > >  				"Failed to allocate memory for mbuf.\n");
> > >  			break;
> > >  		}
> > > -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> > > -					mbuf_pool);
> > > +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
> > >  		if (unlikely(err)) {
> > >  			rte_pktmbuf_free(pkts[i]);
> > >  			break;
> > > --
> > > 2.7.4
> > 
> > Something that I'm missing here: it's legal for guest
> > to add indirect descriptors for RX.
> > I don't see the handling of RX here though.
> > I think it's required for spec compliance.
> > 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 18:06     ` Michael S. Tsirkin
@ 2016-09-23 18:16       ` Maxime Coquelin
  2016-09-23 18:22         ` Michael S. Tsirkin
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-09-23 18:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: yuanhan.liu, huawei.xie, dev, vkaplans, stephen



On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:
> On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
>>
>>
>> On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
>>> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>>>> Indirect descriptors are usually supported by virtio-net devices,
>>>> allowing to dispatch a larger number of requests.
>>>>
>>>> When the virtio device sends a packet using indirect descriptors,
>>>> only one slot is used in the ring, even for large packets.
>>>>
>>>> The main effect is to improve the 0% packet loss benchmark.
>>>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>>>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>>>> zero loss.
>>>>
>>>> On the downside, micro-benchmark using testpmd txonly in VM and
>>>> rxonly on host shows a loss between 1 and 4%.i But depending on
>>>> the needs, feature can be disabled at VM boot time by passing
>>>> indirect_desc=off argument to vhost-user device in Qemu.
>>>
>>> Even better, change guest pmd to only use indirect
>>> descriptors when this makes sense (e.g. sufficiently
>>> large packets).
>> With the micro-benchmark, the degradation is quite constant whatever
>> the packet size.
>>
>> For PVP, I could not test with larger packets than 64 bytes, as I don't
>> have a 40G interface,
>
> Don't 64 byte packets fit in a single slot anyway?
No, indirect is used. I didn't check in detail, but I think this is
because there is no headroom reserved in the mbuf.

This is the condition to meet to fit in a single slot:
/* optimize ring usage */
if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
	rte_mbuf_refcnt_read(txm) == 1 &&
	RTE_MBUF_DIRECT(txm) &&
	txm->nb_segs == 1 &&
	rte_pktmbuf_headroom(txm) >= hdr_size &&
	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
     can_push = 1;
else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
     use_indirect = 1;

I will check more in details next week.

> Why would there be an effect with that?
>
>> and line rate with 10G is reached rapidly.
>
> Right but focus on packet loss. you can have that at any rate.
>
>>
>>> I would be very interested to know when does it make
>>> sense.
>>>
>>> The feature is there, it's up to guest whether to
>>> use it.
>> Do you mean the PMD should detect dynamically whether using indirect,
>> or having an option at device init time to enable or not the feature?
>
> guest PMD should not use indirect where it slows things down.
>
>>>
>>>
>>>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> ---
>>>> Changes since v2:
>>>> =================
>>>>  - Revert back to not checking feature flag to be aligned with
>>>> kernel implementation
>>>>  - Ensure we don't have nested indirect descriptors
>>>>  - Ensure the indirect desc address is valid, to protect against
>>>> malicious guests
>>>>
>>>> Changes since RFC:
>>>> =================
>>>>  - Enrich commit message with figures
>>>>  - Rebased on top of dpdk-next-virtio's master
>>>>  - Add feature check to ensure we don't receive an indirect desc
>>>> if not supported by the virtio driver
>>>>
>>>>  lib/librte_vhost/vhost.c      |  3 ++-
>>>>  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
>>>>  2 files changed, 33 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
>>>> index 46095c3..30bb0ce 100644
>>>> --- a/lib/librte_vhost/vhost.c
>>>> +++ b/lib/librte_vhost/vhost.c
>>>> @@ -65,7 +65,8 @@
>>>>  				(1ULL << VIRTIO_NET_F_CSUM)    | \
>>>>  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
>>>>  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
>>>> -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
>>>> +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
>>>> +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
>>>>
>>>>  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
>>>>
>>>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>>>> index 8a151af..2e0a587 100644
>>>> --- a/lib/librte_vhost/virtio_net.c
>>>> +++ b/lib/librte_vhost/virtio_net.c
>>>> @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
>>>>  }
>>>>
>>>>  static inline int __attribute__((always_inline))
>>>> -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>> -		  struct rte_mbuf *m, uint16_t desc_idx,
>>>> +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
>>>> +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
>>>>  		  struct rte_mempool *mbuf_pool)
>>>>  {
>>>>  	struct vring_desc *desc;
>>>> @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>  	/* A counter to avoid desc dead loop chain */
>>>>  	uint32_t nr_desc = 1;
>>>>
>>>> -	desc = &vq->desc[desc_idx];
>>>> -	if (unlikely(desc->len < dev->vhost_hlen))
>>>> +	desc = &descs[desc_idx];
>>>> +	if (unlikely((desc->len < dev->vhost_hlen)) ||
>>>> +			(desc->flags & VRING_DESC_F_INDIRECT))
>>>>  		return -1;
>>>>
>>>>  	desc_addr = gpa_to_vva(dev, desc->addr);
>>>> @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>  	 */
>>>>  	if (likely((desc->len == dev->vhost_hlen) &&
>>>>  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
>>>> -		desc = &vq->desc[desc->next];
>>>> +		desc = &descs[desc->next];
>>>> +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>>>> +			return -1;
>>>>
>>>>  		desc_addr = gpa_to_vva(dev, desc->addr);
>>>>  		if (unlikely(!desc_addr))
>>>
>>>
>>> Just to make sure, does this still allow a chain of
>>> direct descriptors ending with an indirect one?
>>> This is legal as per spec.
>>>
>>>> @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
>>>>  				break;
>>>>
>>>> -			if (unlikely(desc->next >= vq->size ||
>>>> -				     ++nr_desc > vq->size))
>>>> +			if (unlikely(desc->next >= max_desc ||
>>>> +				     ++nr_desc > max_desc))
>>>> +				return -1;
>>>> +			desc = &descs[desc->next];
>>>> +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>>>>  				return -1;
>>>> -			desc = &vq->desc[desc->next];
>>>>
>>>>  			desc_addr = gpa_to_vva(dev, desc->addr);
>>>>  			if (unlikely(!desc_addr))
>>>> @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>>>>  	/* Prefetch descriptor index. */
>>>>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>>>>  	for (i = 0; i < count; i++) {
>>>> +		struct vring_desc *desc;
>>>> +		uint16_t sz, idx;
>>>>  		int err;
>>>>
>>>>  		if (likely(i + 1 < count))
>>>>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>>>>
>>>> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
>>>> +			desc = (struct vring_desc *)gpa_to_vva(dev,
>>>> +					vq->desc[desc_indexes[i]].addr);
>>>> +			if (unlikely(!desc))
>>>> +				break;
>>>> +
>>>> +			rte_prefetch0(desc);
>>>> +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
>>>> +			idx = 0;
>>>> +		} else {
>>>> +			desc = vq->desc;
>>>> +			sz = vq->size;
>>>> +			idx = desc_indexes[i];
>>>> +		}
>>>> +
>>>>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>>>>  		if (unlikely(pkts[i] == NULL)) {
>>>>  			RTE_LOG(ERR, VHOST_DATA,
>>>>  				"Failed to allocate memory for mbuf.\n");
>>>>  			break;
>>>>  		}
>>>> -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
>>>> -					mbuf_pool);
>>>> +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
>>>>  		if (unlikely(err)) {
>>>>  			rte_pktmbuf_free(pkts[i]);
>>>>  			break;
>>>> --
>>>> 2.7.4
>>>
>>> Something that I'm missing here: it's legal for guest
>>> to add indirect descriptors for RX.
>>> I don't see the handling of RX here though.
>>> I think it's required for spec compliance.
>>>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 18:16       ` Maxime Coquelin
@ 2016-09-23 18:22         ` Michael S. Tsirkin
  2016-09-23 20:24           ` Stephen Hemminger
  0 siblings, 1 reply; 52+ messages in thread
From: Michael S. Tsirkin @ 2016-09-23 18:22 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: yuanhan.liu, huawei.xie, dev, vkaplans, stephen

On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> 
> 
> On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:
> > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
> > > 
> > > 
> > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > allowing to dispatch a larger number of requests.
> > > > > 
> > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > only one slot is used in the ring, even for large packets.
> > > > > 
> > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > zero loss.
> > > > > 
> > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > indirect_desc=off argument to vhost-user device in Qemu.
> > > > 
> > > > Even better, change guest pmd to only use indirect
> > > > descriptors when this makes sense (e.g. sufficiently
> > > > large packets).
> > > With the micro-benchmark, the degradation is quite constant whatever
> > > the packet size.
> > > 
> > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > have a 40G interface,
> > 
> > Don't 64 byte packets fit in a single slot anyway?
> No, indirect is used. I didn't checked in details, but I think this is
> because there is no headroom reserved in the mbuf.
> 
> This is the condition to meet to fit in a single slot:
> /* optimize ring usage */
> if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> 	rte_mbuf_refcnt_read(txm) == 1 &&
> 	RTE_MBUF_DIRECT(txm) &&
> 	txm->nb_segs == 1 &&
> 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
>     can_push = 1;
> else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
>     use_indirect = 1;
> 
> I will check more in details next week.

Two thoughts then
1. so can some headroom be reserved?
2. how about using indirect with 3 s/g entries,
   but direct with 2 and down?
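
I.e. something like this on top of the condition you quoted (just a
sketch; the >= 3 threshold is only the example above, and the names
come from the quoted virtio PMD snippet, this is not an actual patch):

	else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
		txm->nb_segs >= 3 &&
		txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
		use_indirect = 1;
	/* with 2 segments or fewer, fall through to direct descriptors */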


> > Why would there be an effect with that?
> > 
> > > and line rate with 10G is reached rapidly.
> > 
> > Right but focus on packet loss. you can have that at any rate.
> > 
> > > 
> > > > I would be very interested to know when does it make
> > > > sense.
> > > > 
> > > > The feature is there, it's up to guest whether to
> > > > use it.
> > > Do you mean the PMD should detect dynamically whether using indirect,
> > > or having an option at device init time to enable or not the feature?
> > 
> > guest PMD should not use indirect where it slows things down.
> > 
> > > > 
> > > > 
> > > > > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > > ---
> > > > > Changes since v2:
> > > > > =================
> > > > >  - Revert back to not checking feature flag to be aligned with
> > > > > kernel implementation
> > > > >  - Ensure we don't have nested indirect descriptors
> > > > >  - Ensure the indirect desc address is valid, to protect against
> > > > > malicious guests
> > > > > 
> > > > > Changes since RFC:
> > > > > =================
> > > > >  - Enrich commit message with figures
> > > > >  - Rebased on top of dpdk-next-virtio's master
> > > > >  - Add feature check to ensure we don't receive an indirect desc
> > > > > if not supported by the virtio driver
> > > > > 
> > > > >  lib/librte_vhost/vhost.c      |  3 ++-
> > > > >  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
> > > > >  2 files changed, 33 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > > > > index 46095c3..30bb0ce 100644
> > > > > --- a/lib/librte_vhost/vhost.c
> > > > > +++ b/lib/librte_vhost/vhost.c
> > > > > @@ -65,7 +65,8 @@
> > > > >  				(1ULL << VIRTIO_NET_F_CSUM)    | \
> > > > >  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
> > > > >  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> > > > > -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
> > > > > +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> > > > > +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
> > > > > 
> > > > >  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
> > > > > 
> > > > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > > > > index 8a151af..2e0a587 100644
> > > > > --- a/lib/librte_vhost/virtio_net.c
> > > > > +++ b/lib/librte_vhost/virtio_net.c
> > > > > @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> > > > >  }
> > > > > 
> > > > >  static inline int __attribute__((always_inline))
> > > > > -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > > > -		  struct rte_mbuf *m, uint16_t desc_idx,
> > > > > +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> > > > > +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
> > > > >  		  struct rte_mempool *mbuf_pool)
> > > > >  {
> > > > >  	struct vring_desc *desc;
> > > > > @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > > >  	/* A counter to avoid desc dead loop chain */
> > > > >  	uint32_t nr_desc = 1;
> > > > > 
> > > > > -	desc = &vq->desc[desc_idx];
> > > > > -	if (unlikely(desc->len < dev->vhost_hlen))
> > > > > +	desc = &descs[desc_idx];
> > > > > +	if (unlikely((desc->len < dev->vhost_hlen)) ||
> > > > > +			(desc->flags & VRING_DESC_F_INDIRECT))
> > > > >  		return -1;
> > > > > 
> > > > >  	desc_addr = gpa_to_vva(dev, desc->addr);
> > > > > @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > > >  	 */
> > > > >  	if (likely((desc->len == dev->vhost_hlen) &&
> > > > >  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> > > > > -		desc = &vq->desc[desc->next];
> > > > > +		desc = &descs[desc->next];
> > > > > +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > > > > +			return -1;
> > > > > 
> > > > >  		desc_addr = gpa_to_vva(dev, desc->addr);
> > > > >  		if (unlikely(!desc_addr))
> > > > 
> > > > 
> > > > Just to make sure, does this still allow a chain of
> > > > direct descriptors ending with an indirect one?
> > > > This is legal as per spec.
> > > > 
> > > > > @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > > > >  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
> > > > >  				break;
> > > > > 
> > > > > -			if (unlikely(desc->next >= vq->size ||
> > > > > -				     ++nr_desc > vq->size))
> > > > > +			if (unlikely(desc->next >= max_desc ||
> > > > > +				     ++nr_desc > max_desc))
> > > > > +				return -1;
> > > > > +			desc = &descs[desc->next];
> > > > > +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> > > > >  				return -1;
> > > > > -			desc = &vq->desc[desc->next];
> > > > > 
> > > > >  			desc_addr = gpa_to_vva(dev, desc->addr);
> > > > >  			if (unlikely(!desc_addr))
> > > > > @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> > > > >  	/* Prefetch descriptor index. */
> > > > >  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > > > >  	for (i = 0; i < count; i++) {
> > > > > +		struct vring_desc *desc;
> > > > > +		uint16_t sz, idx;
> > > > >  		int err;
> > > > > 
> > > > >  		if (likely(i + 1 < count))
> > > > >  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> > > > > 
> > > > > +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> > > > > +			desc = (struct vring_desc *)gpa_to_vva(dev,
> > > > > +					vq->desc[desc_indexes[i]].addr);
> > > > > +			if (unlikely(!desc))
> > > > > +				break;
> > > > > +
> > > > > +			rte_prefetch0(desc);
> > > > > +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> > > > > +			idx = 0;
> > > > > +		} else {
> > > > > +			desc = vq->desc;
> > > > > +			sz = vq->size;
> > > > > +			idx = desc_indexes[i];
> > > > > +		}
> > > > > +
> > > > >  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> > > > >  		if (unlikely(pkts[i] == NULL)) {
> > > > >  			RTE_LOG(ERR, VHOST_DATA,
> > > > >  				"Failed to allocate memory for mbuf.\n");
> > > > >  			break;
> > > > >  		}
> > > > > -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> > > > > -					mbuf_pool);
> > > > > +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
> > > > >  		if (unlikely(err)) {
> > > > >  			rte_pktmbuf_free(pkts[i]);
> > > > >  			break;
> > > > > --
> > > > > 2.7.4
> > > > 
> > > > Something that I'm missing here: it's legal for guest
> > > > to add indirect descriptors for RX.
> > > > I don't see the handling of RX here though.
> > > > I think it's required for spec compliance.
> > > > 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 18:22         ` Michael S. Tsirkin
@ 2016-09-23 20:24           ` Stephen Hemminger
  2016-09-26  3:03             ` Yuanhan Liu
  0 siblings, 1 reply; 52+ messages in thread
From: Stephen Hemminger @ 2016-09-23 20:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Maxime Coquelin, yuanhan.liu, huawei.xie, dev, vkaplans

On Fri, 23 Sep 2016 21:22:23 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > 
> > 
> > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > 
> > > > 
> > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > allowing to dispatch a larger number of requests.
> > > > > > 
> > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > only one slot is used in the ring, even for large packets.
> > > > > > 
> > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > zero loss.
> > > > > > 
> > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > 
> > > > > Even better, change guest pmd to only use indirect
> > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > large packets).  
> > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > the packet size.
> > > > 
> > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > have a 40G interface,  
> > > 
> > > Don't 64 byte packets fit in a single slot anyway?  
> > No, indirect is used. I didn't checked in details, but I think this is
> > because there is no headroom reserved in the mbuf.
> > 
> > This is the condition to meet to fit in a single slot:
> > /* optimize ring usage */
> > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > 	RTE_MBUF_DIRECT(txm) &&
> > 	txm->nb_segs == 1 &&
> > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> >     can_push = 1;
> > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> >     use_indirect = 1;
> > 
> > I will check more in details next week.  
> 
> Two thoughts then
> 1. so can some headroom be reserved?
> 2. how about using indirect with 3 s/g entries,
>    but direct with 2 and down?

The default mbuf allocator does keep headroom available. Sounds like a
test bug.
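
For what it's worth, a quick check along these lines should show it
(sketch only; pool name and sizes are placeholders, and rte_eal_init()
is assumed to have been called already):

#include <stdio.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>

/*
 * mbufs from a pool created with the default buffer size reserve
 * RTE_PKTMBUF_HEADROOM bytes (128 by default) of headroom, which is
 * larger than the virtio-net header.
 */
static void
check_default_headroom(void)
{
	struct rte_mempool *mp;
	struct rte_mbuf *m;

	mp = rte_pktmbuf_pool_create("hdr_test", 8192, 256, 0,
				     RTE_MBUF_DEFAULT_BUF_SIZE,
				     rte_socket_id());
	m = mp ? rte_pktmbuf_alloc(mp) : NULL;
	if (m != NULL)
		printf("headroom: %u bytes\n",
		       (unsigned)rte_pktmbuf_headroom(m));
}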

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23 20:24           ` Stephen Hemminger
@ 2016-09-26  3:03             ` Yuanhan Liu
  2016-09-26 12:25               ` Michael S. Tsirkin
  0 siblings, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-09-26  3:03 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Michael S. Tsirkin, Maxime Coquelin, huawei.xie, dev, vkaplans

On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> On Fri, 23 Sep 2016 21:22:23 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > 
> > > 
> > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > 
> > > > > 
> > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > 
> > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > 
> > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > zero loss.
> > > > > > > 
> > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > 
> > > > > > Even better, change guest pmd to only use indirect
> > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > large packets).  
> > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > the packet size.
> > > > > 
> > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > have a 40G interface,  
> > > > 
> > > > Don't 64 byte packets fit in a single slot anyway?  
> > > No, indirect is used. I didn't checked in details, but I think this is
> > > because there is no headroom reserved in the mbuf.
> > > 
> > > This is the condition to meet to fit in a single slot:
> > > /* optimize ring usage */
> > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > 	RTE_MBUF_DIRECT(txm) &&
> > > 	txm->nb_segs == 1 &&
> > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > >     can_push = 1;
> > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > >     use_indirect = 1;
> > > 
> > > I will check more in details next week.  
> > 
> > Two thoughts then
> > 1. so can some headroom be reserved?
> > 2. how about using indirect with 3 s/g entries,
> >    but direct with 2 and down?
> 
> The default mbuf allocator does keep headroom available. Sounds like a
> test bug.

That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen claimed
in v2's comment.

Since DPDK vhost has actually supported VIRTIO_F_ANY_LAYOUT for a while,
I'd like to add it to the features list (VHOST_SUPPORTED_FEATURES).

Will drop a patch shortly.
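
I.e. extending the VHOST_SUPPORTED_FEATURES macro touched by the hunk
above, roughly (sketch only, the actual patch may differ):

				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
				(1ULL << VIRTIO_F_ANY_LAYOUT))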

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-26  3:03             ` Yuanhan Liu
@ 2016-09-26 12:25               ` Michael S. Tsirkin
  2016-09-26 13:04                 ` Yuanhan Liu
  0 siblings, 1 reply; 52+ messages in thread
From: Michael S. Tsirkin @ 2016-09-26 12:25 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Stephen Hemminger, Maxime Coquelin, huawei.xie, dev, vkaplans

On Mon, Sep 26, 2016 at 11:03:54AM +0800, Yuanhan Liu wrote:
> On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> > On Fri, 23 Sep 2016 21:22:23 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > > 
> > > > 
> > > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > > 
> > > > > > 
> > > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > > 
> > > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > > 
> > > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > > zero loss.
> > > > > > > > 
> > > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > > 
> > > > > > > Even better, change guest pmd to only use indirect
> > > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > > large packets).  
> > > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > > the packet size.
> > > > > > 
> > > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > > have a 40G interface,  
> > > > > 
> > > > > Don't 64 byte packets fit in a single slot anyway?  
> > > > No, indirect is used. I didn't checked in details, but I think this is
> > > > because there is no headroom reserved in the mbuf.
> > > > 
> > > > This is the condition to meet to fit in a single slot:
> > > > /* optimize ring usage */
> > > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > > 	RTE_MBUF_DIRECT(txm) &&
> > > > 	txm->nb_segs == 1 &&
> > > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > > >     can_push = 1;
> > > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > > >     use_indirect = 1;
> > > > 
> > > > I will check more in details next week.  
> > > 
> > > Two thoughts then
> > > 1. so can some headroom be reserved?
> > > 2. how about using indirect with 3 s/g entries,
> > >    but direct with 2 and down?
> > 
> > The default mbuf allocator does keep headroom available. Sounds like a
> > test bug.
> 
> That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen claimed
> in v2's comment.
> 
> Since DPDK vhost actually supports VIRTIO_F_ANY_LAYOUT for a while, I'd
> like to add it in the features list (VHOST_SUPPORTED_FEATURES).
> 
> Will drop a patch shortly.
> 
> 	--yliu

If VERSION_1 is set then this implies ANY_LAYOUT without it being set.

-- 
MST

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-26 12:25               ` Michael S. Tsirkin
@ 2016-09-26 13:04                 ` Yuanhan Liu
  0 siblings, 0 replies; 52+ messages in thread
From: Yuanhan Liu @ 2016-09-26 13:04 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Stephen Hemminger, Maxime Coquelin, huawei.xie, dev, vkaplans

On Mon, Sep 26, 2016 at 03:25:35PM +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 26, 2016 at 11:03:54AM +0800, Yuanhan Liu wrote:
> > On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> > > On Fri, 23 Sep 2016 21:22:23 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > > > 
> > > > > 
> > > > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > > > 
> > > > > > > 
> > > > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > > > 
> > > > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > > > 
> > > > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > > > zero loss.
> > > > > > > > > 
> > > > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > > > 
> > > > > > > > Even better, change guest pmd to only use indirect
> > > > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > > > large packets).  
> > > > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > > > the packet size.
> > > > > > > 
> > > > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > > > have a 40G interface,  
> > > > > > 
> > > > > > Don't 64 byte packets fit in a single slot anyway?  
> > > > > No, indirect is used. I didn't checked in details, but I think this is
> > > > > because there is no headroom reserved in the mbuf.
> > > > > 
> > > > > This is the condition to meet to fit in a single slot:
> > > > > /* optimize ring usage */
> > > > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > > > 	RTE_MBUF_DIRECT(txm) &&
> > > > > 	txm->nb_segs == 1 &&
> > > > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > > > >     can_push = 1;
> > > > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > > > >     use_indirect = 1;
> > > > > 
> > > > > I will check more in details next week.  
> > > > 
> > > > Two thoughts then
> > > > 1. so can some headroom be reserved?
> > > > 2. how about using indirect with 3 s/g entries,
> > > >    but direct with 2 and down?
> > > 
> > > The default mbuf allocator does keep headroom available. Sounds like a
> > > test bug.
> > 
> > That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen claimed
> > in v2's comment.
> > 
> > Since DPDK vhost actually supports VIRTIO_F_ANY_LAYOUT for a while, I'd
> > like to add it in the features list (VHOST_SUPPORTED_FEATURES).
> > 
> > Will drop a patch shortly.
> > 
> > 	--yliu
> 
> If VERSION_1 is set then this implies ANY_LAYOUT without it being set.

Yes, I saw this note from you in another email. I kept it as it is,
for two reasons (maybe I should have stated them earlier):

- we have to return all features we support to the guest.

  We don't know whether the guest is a modern or legacy device. That
  means we should claim we support both: VERSION_1 and ANY_LAYOUT.

  Assume the guest is a legacy device and we only set VERSION_1 (the
  current case); ANY_LAYOUT would never be negotiated.

- I'm following the Linux kernel's approach: it also sets both features.

Maybe we could unset ANY_LAYOUT when VERSION_1 is _negotiated_?
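
A rough sketch of that idea (just an illustration; I'm assuming the
negotiated features are available as a 64-bit mask and that the
VIRTIO_F_ANY_LAYOUT / VIRTIO_F_VERSION_1 bit definitions from the virtio
spec are in scope; the exact hook point is hypothetical):

	/* VERSION_1 already implies ANY_LAYOUT semantics, so the legacy
	 * bit could be dropped once the modern interface is negotiated. */
	if (features & (1ULL << VIRTIO_F_VERSION_1))
		features &= ~(1ULL << VIRTIO_F_ANY_LAYOUT);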

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-23  8:28 [PATCH v3] vhost: Add indirect descriptors support to the TX path Maxime Coquelin
  2016-09-23 15:49 ` Michael S. Tsirkin
@ 2016-09-27  4:15 ` Yuanhan Liu
  2016-09-27  7:25   ` Maxime Coquelin
  2016-09-27  8:42 ` [PATCH v4] " Maxime Coquelin
  2 siblings, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-09-27  4:15 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: huawei.xie, dev, vkaplans, mst, stephen

On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> +			desc = (struct vring_desc *)gpa_to_vva(dev,

As mentioned before, this would break the 32-bit OS build. It should be

	(struct vring_desc *)(uintptr_t)gpa_to_vva(...);
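
To illustrate why the intermediate cast is needed (a sketch, assuming
gpa_to_vva() keeps returning the address as a uint64_t):

	uint64_t vva = gpa_to_vva(dev, vq->desc[desc_indexes[i]].addr);
	/* On a 32-bit build, casting a 64-bit integer straight to a pointer
	 * triggers a "cast to pointer from integer of different size"
	 * warning (an error with -Werror); going through uintptr_t makes
	 * the narrowing explicit. */
	desc = (struct vring_desc *)(uintptr_t)vva;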

I meant to fix this while applying, but then I realized you haven't
updated the release notes (sorry for the late notice).

So would you mind sending a new version, with the fix and the release
note update? FYI, the release notes are at "doc/guides/rel_notes/"

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v3] vhost: Add indirect descriptors support to the TX path
  2016-09-27  4:15 ` Yuanhan Liu
@ 2016-09-27  7:25   ` Maxime Coquelin
  0 siblings, 0 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-09-27  7:25 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: huawei.xie, dev, vkaplans, mst, stephen



On 09/27/2016 06:15 AM, Yuanhan Liu wrote:
> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
>> +			desc = (struct vring_desc *)gpa_to_vva(dev,
>
> As mentioned before, this would break 32 bit OS build. It should be
>
> 	(struct vring_desc *)(uintptr_t)gpa_to_vva(...);
>
> I meant to fix this while apply, later I just realized you haven't
> updated the release note (sorry for the late notice).
>
> So would you mind send a new version, with the fix and release note
> update? FYI, the release note is at "doc/guides/rel_notes/"
Not a problem, doing it now.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-09-23  8:28 [PATCH v3] vhost: Add indirect descriptors support to the TX path Maxime Coquelin
  2016-09-23 15:49 ` Michael S. Tsirkin
  2016-09-27  4:15 ` Yuanhan Liu
@ 2016-09-27  8:42 ` Maxime Coquelin
  2016-09-27 12:18   ` Yuanhan Liu
  2016-10-14  7:24   ` Wang, Zhihong
  2 siblings, 2 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-09-27  8:42 UTC (permalink / raw)
  To: yuanhan.liu, huawei.xie, dev; +Cc: vkaplans, mst, stephen, Maxime Coquelin

Indirect descriptors are usually supported by virtio-net devices,
allowing to dispatch a larger number of requests.

When the virtio device sends a packet using indirect descriptors,
only one slot is used in the ring, even for large packets.

The main effect is to improve the 0% packet loss benchmark.
A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
(fwd io for host, macswap for VM) on DUT shows a +50% gain for
zero loss.

On the downside, micro-benchmark using testpmd txonly in VM and
rxonly on host shows a loss between 1 and 4%. But depending on
the needs, feature can be disabled at VM boot time by passing
indirect_desc=off argument to vhost-user device in Qemu.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
Changes since v3:
=================
 - Fix compilation warning on 32-bit systems (Yuanhan)
 - Add Release Note paragraph about the feature

Changes since v2:
=================
 - Revert back to not checking feature flag to be aligned with
kernel implementation
 - Ensure we don't have nested indirect descriptors
 - Ensure the indirect desc address is valid, to protect against
malicious guests

Changes since RFC:
==================
 - Enrich commit message with figures
 - Rebased on top of dpdk-next-virtio's master
 - Add feature check to ensure we don't receive an indirect desc
if not supported by the virtio driver

 doc/guides/rel_notes/release_16_11.rst | 11 +++++++++
 lib/librte_vhost/vhost.c               |  3 ++-
 lib/librte_vhost/virtio_net.c          | 41 +++++++++++++++++++++++++---------
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
index 66916af..aae4908 100644
--- a/doc/guides/rel_notes/release_16_11.rst
+++ b/doc/guides/rel_notes/release_16_11.rst
@@ -36,6 +36,17 @@ New Features
 
      This section is a comment. Make sure to start the actual text at the margin.
 
+   * **Added vhost-user indirect descriptors support.**
+     If the indirect descriptor feature is negotiated, each packet sent by the
+     guest will take exactly one slot in the enqueue virtqueue. Without the
+     feature, in the current version, even 64-byte packets take two slots with
+     the Virtio PMD on the guest side.
+
+     The main impact is better performance for 0% packet loss use cases, as it
+     behaves as if the virtqueue size were enlarged, so more packets can be
+     buffered in case of system perturbations. On the downside, a small
+     performance degradation is measured when running micro-benchmarks.
+
 
 Resolved Issues
 ---------------
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..30bb0ce 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -65,7 +65,8 @@
 				(1ULL << VIRTIO_NET_F_CSUM)    | \
 				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
 				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
+				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
 
 uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..a59c39b 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
 }
 
 static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx,
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
 		  struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
@@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	/* A counter to avoid desc dead loop chain */
 	uint32_t nr_desc = 1;
 
-	desc = &vq->desc[desc_idx];
-	if (unlikely(desc->len < dev->vhost_hlen))
+	desc = &descs[desc_idx];
+	if (unlikely((desc->len < dev->vhost_hlen)) ||
+			(desc->flags & VRING_DESC_F_INDIRECT))
 		return -1;
 
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 */
 	if (likely((desc->len == dev->vhost_hlen) &&
 		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-		desc = &vq->desc[desc->next];
+		desc = &descs[desc->next];
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+			return -1;
 
 		desc_addr = gpa_to_vva(dev, desc->addr);
 		if (unlikely(!desc_addr))
@@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
 				break;
 
-			if (unlikely(desc->next >= vq->size ||
-				     ++nr_desc > vq->size))
+			if (unlikely(desc->next >= max_desc ||
+				     ++nr_desc > max_desc))
+				return -1;
+			desc = &descs[desc->next];
+			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 				return -1;
-			desc = &vq->desc[desc->next];
 
 			desc_addr = gpa_to_vva(dev, desc->addr);
 			if (unlikely(!desc_addr))
@@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	/* Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 	for (i = 0; i < count; i++) {
+		struct vring_desc *desc;
+		uint16_t sz, idx;
 		int err;
 
 		if (likely(i + 1 < count))
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
+		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+			desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
+					vq->desc[desc_indexes[i]].addr);
+			if (unlikely(!desc))
+				break;
+
+			rte_prefetch0(desc);
+			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+			idx = 0;
+		} else {
+			desc = vq->desc;
+			sz = vq->size;
+			idx = desc_indexes[i];
+		}
+
 		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
-		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-					mbuf_pool);
+		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-09-27  8:42 ` [PATCH v4] " Maxime Coquelin
@ 2016-09-27 12:18   ` Yuanhan Liu
  2016-10-14  7:24   ` Wang, Zhihong
  1 sibling, 0 replies; 52+ messages in thread
From: Yuanhan Liu @ 2016-09-27 12:18 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: huawei.xie, dev, vkaplans, mst, stephen

On Tue, Sep 27, 2016 at 10:42:49AM +0200, Maxime Coquelin wrote:
> Indirect descriptors are usually supported by virtio-net devices,
> allowing to dispatch a larger number of requests.
> 
> When the virtio device sends a packet using indirect descriptors,
> only one slot is used in the ring, even for large packets.
> 
> The main effect is to improve the 0% packet loss benchmark.
> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> zero loss.
> 
> On the downside, micro-benchmark using testpmd txonly in VM and
> rxonly on host shows a loss between 1 and 4%.i But depending on
> the needs, feature can be disabled at VM boot time by passing
> indirect_desc=off argument to vhost-user device in Qemu.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Applied to dpdk-next-virtio.

Thanks.

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-09-27  8:42 ` [PATCH v4] " Maxime Coquelin
  2016-09-27 12:18   ` Yuanhan Liu
@ 2016-10-14  7:24   ` Wang, Zhihong
  2016-10-14  7:34     ` Wang, Zhihong
  2016-10-14 15:50     ` Maxime Coquelin
  1 sibling, 2 replies; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-14  7:24 UTC (permalink / raw)
  To: Maxime Coquelin, yuanhan.liu, Xie, Huawei, dev; +Cc: vkaplans, mst, stephen



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> Sent: Tuesday, September 27, 2016 4:43 PM
> To: yuanhan.liu@linux.intel.com; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org
> Cc: vkaplans@redhat.com; mst@redhat.com;
> stephen@networkplumber.org; Maxime Coquelin
> <maxime.coquelin@redhat.com>
> Subject: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to
> the TX path
> 
> Indirect descriptors are usually supported by virtio-net devices,
> allowing to dispatch a larger number of requests.
> 
> When the virtio device sends a packet using indirect descriptors,
> only one slot is used in the ring, even for large packets.
> 
> The main effect is to improve the 0% packet loss benchmark.
> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> zero loss.
> 
> On the downside, micro-benchmark using testpmd txonly in VM and
> rxonly on host shows a loss between 1 and 4%.i But depending on
> the needs, feature can be disabled at VM boot time by passing
> indirect_desc=off argument to vhost-user device in Qemu.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>


Hi Maxime,

Seems this patch can't work with a Windows virtio guest in my test.
Have you done similar tests before?

The way I test:

 1. Make sure https://patchwork.codeaurora.org/patch/84339/ is applied

 2. Start testpmd with iofwd between 2 vhost ports

 3. Start 2 Windows guests connected to the 2 vhost ports

 4. Disable firewall and assign IP to each guest using ipconfig

 5. Use ping to test connectivity
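
FWIW, the host side is just plain testpmd with two vhost vdevs and io
forwarding, something like this (the core/memory options and socket
paths are only illustrative):

	testpmd -c 0x7 -n 4 --socket-mem 1024,0 \
		--vdev 'eth_vhost0,iface=/tmp/sock0' \
		--vdev 'eth_vhost1,iface=/tmp/sock1' \
		-- -i --nb-cores=2
	testpmd> set fwd io
	testpmd> start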

When I disable this patch by setting:

    0ULL << VIRTIO_RING_F_INDIRECT_DESC,

the connection is fine, but when I restore:

    1ULL << VIRTIO_RING_F_INDIRECT_DESC,

the connection is broken.


Thanks
Zhihong

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-14  7:24   ` Wang, Zhihong
@ 2016-10-14  7:34     ` Wang, Zhihong
  2016-10-14 15:50     ` Maxime Coquelin
  1 sibling, 0 replies; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-14  7:34 UTC (permalink / raw)
  To: Wang, Zhihong, Maxime Coquelin, yuanhan.liu, Xie, Huawei, dev
  Cc: vkaplans, mst, stephen



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Wang, Zhihong
> Sent: Friday, October 14, 2016 3:25 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>;
> yuanhan.liu@linux.intel.com; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org
> Cc: vkaplans@redhat.com; mst@redhat.com;
> stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> > Sent: Tuesday, September 27, 2016 4:43 PM
> > To: yuanhan.liu@linux.intel.com; Xie, Huawei <huawei.xie@intel.com>;
> > dev@dpdk.org
> > Cc: vkaplans@redhat.com; mst@redhat.com;
> > stephen@networkplumber.org; Maxime Coquelin
> > <maxime.coquelin@redhat.com>
> > Subject: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to
> > the TX path
> >
> > Indirect descriptors are usually supported by virtio-net devices,
> > allowing to dispatch a larger number of requests.
> >
> > When the virtio device sends a packet using indirect descriptors,
> > only one slot is used in the ring, even for large packets.
> >
> > The main effect is to improve the 0% packet loss benchmark.
> > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > zero loss.
> >
> > On the downside, micro-benchmark using testpmd txonly in VM and
> > rxonly on host shows a loss between 1 and 4%.i But depending on
> > the needs, feature can be disabled at VM boot time by passing
> > indirect_desc=off argument to vhost-user device in Qemu.
> >
> > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> 
> Hi Maxime,
> 
> Seems this patch can't with Windows virtio guest in my test.
> Have you done similar tests before?
> 
> The way I test:
> 
>  1. Make sure https://patchwork.codeaurora.org/patch/84339/ is applied
> 
>  2. Start testpmd with iofwd between 2 vhost ports
> 
>  3. Start 2 Windows guests connected to the 2 vhost ports

The mrg_rxbuf feature is on.

> 
>  4. Disable firewall and assign IP to each guest using ipconfig
> 
>  5. Use ping to test connectivity
> 
> When I disable this patch by setting:
> 
>     0ULL << VIRTIO_RING_F_INDIRECT_DESC,
> 
> the connection is fine, but when I restore:
> 
>     1ULL << VIRTIO_RING_F_INDIRECT_DESC,
> 
> the connection is broken.
> 
> 
> Thanks
> Zhihong
> 

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-14  7:24   ` Wang, Zhihong
  2016-10-14  7:34     ` Wang, Zhihong
@ 2016-10-14 15:50     ` Maxime Coquelin
  2016-10-17 11:23       ` Maxime Coquelin
  1 sibling, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-14 15:50 UTC (permalink / raw)
  To: Wang, Zhihong, yuanhan.liu, Xie, Huawei, dev; +Cc: vkaplans, mst, stephen



On 10/14/2016 09:24 AM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
>> Sent: Tuesday, September 27, 2016 4:43 PM
>> To: yuanhan.liu@linux.intel.com; Xie, Huawei <huawei.xie@intel.com>;
>> dev@dpdk.org
>> Cc: vkaplans@redhat.com; mst@redhat.com;
>> stephen@networkplumber.org; Maxime Coquelin
>> <maxime.coquelin@redhat.com>
>> Subject: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to
>> the TX path
>>
>> Indirect descriptors are usually supported by virtio-net devices,
>> allowing to dispatch a larger number of requests.
>>
>> When the virtio device sends a packet using indirect descriptors,
>> only one slot is used in the ring, even for large packets.
>>
>> The main effect is to improve the 0% packet loss benchmark.
>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>> zero loss.
>>
>> On the downside, micro-benchmark using testpmd txonly in VM and
>> rxonly on host shows a loss between 1 and 4%.i But depending on
>> the needs, feature can be disabled at VM boot time by passing
>> indirect_desc=off argument to vhost-user device in Qemu.
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>
>
> Hi Maxime,
>
> Seems this patch can't with Windows virtio guest in my test.
> Have you done similar tests before?
>
> The way I test:
>
>  1. Make sure https://patchwork.codeaurora.org/patch/84339/ is applied
>
>  2. Start testpmd with iofwd between 2 vhost ports
>
>  3. Start 2 Windows guests connected to the 2 vhost ports
>
>  4. Disable firewall and assign IP to each guest using ipconfig
>
>  5. Use ping to test connectivity
>
> When I disable this patch by setting:
>
>     0ULL << VIRTIO_RING_F_INDIRECT_DESC,
>
> the connection is fine, but when I restore:
>
>     1ULL << VIRTIO_RING_F_INDIRECT_DESC,
>
> the connection is broken.

Just noticed I didn't reply to all this morning.
I sent a debug patch to Zhihong, which shows that indirect desc chaining
looks OK.

On my side, I just set up 2 Windows 2016 VMs, and confirmed the issue.
I'll continue the investigation early next week.

Has anyone already tested a Windows guest with vhost-net, which also has
indirect descs support?


Regards,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-14 15:50     ` Maxime Coquelin
@ 2016-10-17 11:23       ` Maxime Coquelin
  2016-10-17 13:21         ` Yuanhan Liu
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-17 11:23 UTC (permalink / raw)
  To: Wang, Zhihong, yuanhan.liu, Xie, Huawei, dev; +Cc: vkaplans, mst, stephen



On 10/14/2016 05:50 PM, Maxime Coquelin wrote:
>
>
> On 10/14/2016 09:24 AM, Wang, Zhihong wrote:
>>
>>
>>> -----Original Message-----
>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
>>> Sent: Tuesday, September 27, 2016 4:43 PM
>>> To: yuanhan.liu@linux.intel.com; Xie, Huawei <huawei.xie@intel.com>;
>>> dev@dpdk.org
>>> Cc: vkaplans@redhat.com; mst@redhat.com;
>>> stephen@networkplumber.org; Maxime Coquelin
>>> <maxime.coquelin@redhat.com>
>>> Subject: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>> support to
>>> the TX path
>>>
>>> Indirect descriptors are usually supported by virtio-net devices,
>>> allowing to dispatch a larger number of requests.
>>>
>>> When the virtio device sends a packet using indirect descriptors,
>>> only one slot is used in the ring, even for large packets.
>>>
>>> The main effect is to improve the 0% packet loss benchmark.
>>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>>> zero loss.
>>>
>>> On the downside, micro-benchmark using testpmd txonly in VM and
>>> rxonly on host shows a loss between 1 and 4%.i But depending on
>>> the needs, feature can be disabled at VM boot time by passing
>>> indirect_desc=off argument to vhost-user device in Qemu.
>>>
>>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>
>>
>> Hi Maxime,
>>
>> Seems this patch can't with Windows virtio guest in my test.
>> Have you done similar tests before?
>>
>> The way I test:
>>
>>  1. Make sure https://patchwork.codeaurora.org/patch/84339/ is applied
>>
>>  2. Start testpmd with iofwd between 2 vhost ports
>>
>>  3. Start 2 Windows guests connected to the 2 vhost ports
>>
>>  4. Disable firewall and assign IP to each guest using ipconfig
>>
>>  5. Use ping to test connectivity
>>
>> When I disable this patch by setting:
>>
>>     0ULL << VIRTIO_RING_F_INDIRECT_DESC,
>>
>> the connection is fine, but when I restore:
>>
>>     1ULL << VIRTIO_RING_F_INDIRECT_DESC,
>>
>> the connection is broken.
>
> Just noticed I didn't reply to all this morning.
> I sent a debug patch to Zhihong, which shows that indirect desc chaining
> looks OK.
>
> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> I'll continue the investigation early next week.

The root cause is identified.
When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
virtio-net kernel driver) use indirect only for Tx.

I'll implement indirect support for the Rx path in the vhost lib, but the
change will be too big for an -rc release.
I propose in the meantime to disable the INDIRECT_DESC feature in the vhost
lib; we can still enable it locally for testing.
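
For reference, the interim change would be the same knob used above for
testing: mask the bit out of VHOST_SUPPORTED_FEATURES in
lib/librte_vhost/vhost.c, i.e. (sketch, other bits elided):

	#define VHOST_SUPPORTED_FEATURES (/* ... other feature bits ... */ \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(0ULL << VIRTIO_RING_F_INDIRECT_DESC))
	/* flip the 0ULL back to 1ULL to re-enable it locally for testing */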

Yuanhan, is it ok for you?

> Has anyone already tested Windows guest with vhost-net, which also has
> indirect descs support?

I tested and confirm it works with vhost-net.

Regards,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-17 11:23       ` Maxime Coquelin
@ 2016-10-17 13:21         ` Yuanhan Liu
  2016-10-17 14:14           ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-10-17 13:21 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: Wang, Zhihong, Xie, Huawei, dev, vkaplans, mst, stephen

On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> >On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> >I'll continue the investigation early next week.
> 
> The root cause is identified.
> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> virtio-net kernel driver) use indirect only for Tx.
> 
> I'll implement indirect support for the Rx path in vhost lib, but the
> change will be too big for -rc release.
> I propose in the mean time to disable INDIRECT_DESC feature in vhost
> lib, we can still enable it locally for testing.
> 
> Yuanhan, is it ok for you?

That's okay.

> 
> >Has anyone already tested Windows guest with vhost-net, which also has
> >indirect descs support?
> 
> I tested and confirm it works with vhost-net.

I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
for Rx path, right?

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-17 13:21         ` Yuanhan Liu
@ 2016-10-17 14:14           ` Maxime Coquelin
  2016-10-27  9:00             ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-17 14:14 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: Wang, Zhihong, Xie, Huawei, dev, vkaplans, mst, stephen



On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>> I'll continue the investigation early next week.
>>
>> The root cause is identified.
>> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>> virtio-net kernel driver) use indirect only for Tx.
>> I'll implement indirect support for the Rx path in vhost lib, but the
>> change will be too big for -rc release.
>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>> lib, we can still enable it locally for testing.
>>
>> Yuanhan, is it ok for you?
>
> That's okay.
I'll send a patch to disable it then.

>
>>
>>> Has anyone already tested Windows guest with vhost-net, which also has
>>> indirect descs support?
>>
>> I tested and confirm it works with vhost-net.
>
> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> for Rx path, right?

No, it does support it actually.
I thought it didn't support it either, but I had misread the kernel
implementation of vhost-net and virtio-net. Actually, virtio-net makes use
of indirect in the Rx path when mergeable buffers are disabled.

The confusion certainly comes from me, sorry about that.

Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-17 14:14           ` Maxime Coquelin
@ 2016-10-27  9:00             ` Wang, Zhihong
  2016-10-27  9:10               ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-27  9:00 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu, stephen, Pierre Pfister (ppfister)
  Cc: dev, vkaplans, mst

Hi Maxime,

Seems the indirect desc feature is causing serious performance
degradation on the Haswell platform: about a 20% drop for both
mrg=on and mrg=off (--txqflags=0xf00, non-vector version), for
both iofwd and macfwd.

I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.

Could you please verify if this is true in your test?


Thanks
Zhihong

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Monday, October 17, 2016 10:15 PM
> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
> mst@redhat.com; stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> > On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> >>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> >>> I'll continue the investigation early next week.
> >>
> >> The root cause is identified.
> >> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
> >> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> >> virtio-net kernel driver) use indirect only for Tx.
> >> I'll implement indirect support for the Rx path in vhost lib, but the
> >> change will be too big for -rc release.
> >> I propose in the mean time to disable INDIRECT_DESC feature in vhost
> >> lib, we can still enable it locally for testing.
> >>
> >> Yuanhan, is it ok for you?
> >
> > That's okay.
> I'll send a patch to disable it then.
> 
> >
> >>
> >>> Has anyone already tested Windows guest with vhost-net, which also
> has
> >>> indirect descs support?
> >>
> >> I tested and confirm it works with vhost-net.
> >
> > I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> > for Rx path, right?
> 
> No, it does support it actually.
> I thought it didn't support too, I misread the Kernel implementation of
> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
> in Rx path when mergeable buffers is disabled.
> 
> The confusion certainly comes from me, sorry about that.
> 
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27  9:00             ` Wang, Zhihong
@ 2016-10-27  9:10               ` Maxime Coquelin
  2016-10-27  9:55                 ` Maxime Coquelin
  2016-10-27 10:33                 ` Yuanhan Liu
  0 siblings, 2 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-27  9:10 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu, stephen, Pierre Pfister (ppfister)
  Cc: dev, vkaplans, mst

Hi Zhihong,

On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> Hi Maxime,
>
> Seems indirect desc feature is causing serious performance
> degradation on Haswell platform, about 20% drop for both
> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> both iofwd and macfwd.
I tested PVP (with macswap on the guest) and Txonly/Rxonly on an Ivy Bridge
platform, and didn't face such a drop.
Have you tried to pass indirect_desc=off to qemu cmdline to see if you
recover the performance?
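
(Something along these lines on the QEMU command line; the chardev path
and IDs are just placeholders:)

	-chardev socket,id=char0,path=/tmp/vhost-user0 \
	-netdev type=vhost-user,id=net0,chardev=char0 \
	-device virtio-net-pci,netdev=net0,indirect_desc=off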

Yuanhan, which platform did you use when you tested it with zero copy?

>
> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>
> Could you please verify if this is true in your test?
I'll try -rc1/-rc2 on my platform, and let you know.

Thanks,
Maxime

>
>
> Thanks
> Zhihong
>
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Monday, October 17, 2016 10:15 PM
>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>> mst@redhat.com; stephen@networkplumber.org
>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>> to the TX path
>>
>>
>>
>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>> I'll continue the investigation early next week.
>>>>
>>>> The root cause is identified.
>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>> virtio-net kernel driver) use indirect only for Tx.
>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>> change will be too big for -rc release.
>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>> lib, we can still enable it locally for testing.
>>>>
>>>> Yuanhan, is it ok for you?
>>>
>>> That's okay.
>> I'll send a patch to disable it then.
>>
>>>
>>>>
>>>>> Has anyone already tested Windows guest with vhost-net, which also
>> has
>>>>> indirect descs support?
>>>>
>>>> I tested and confirm it works with vhost-net.
>>>
>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>> for Rx path, right?
>>
>> No, it does support it actually.
>> I thought it didn't support too, I misread the Kernel implementation of
>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>> in Rx path when mergeable buffers is disabled.
>>
>> The confusion certainly comes from me, sorry about that.
>>
>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27  9:10               ` Maxime Coquelin
@ 2016-10-27  9:55                 ` Maxime Coquelin
  2016-10-27 10:19                   ` Wang, Zhihong
  2016-10-27 10:33                 ` Yuanhan Liu
  1 sibling, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-27  9:55 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu, stephen, Pierre Pfister (ppfister)
  Cc: dev, vkaplans, mst



On 10/27/2016 11:10 AM, Maxime Coquelin wrote:
> Hi Zhihong,
>
> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>> Hi Maxime,
>>
>> Seems indirect desc feature is causing serious performance
>> degradation on Haswell platform, about 20% drop for both
>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>> both iofwd and macfwd.
> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
> platform, and didn't faced such a drop.
> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
> recover the performance?
>
> Yuanhan, which platform did you use when you tested it with zero copy?
>
>>
>> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>>
>> Could you please verify if this is true in your test?
> I'll try -rc1/-rc2 on my platform, and let you know.
As a first test, I tried again Txonly from the guest to the host (Rxonly),
where Tx indirect descriptors are used, on my E5-2665 @2.40GHz:
v16.11-rc1: 10.81Mpps
v16.11-rc2: 10.91Mpps

-rc2 is even slightly better in my case.
Could you please run the same test on your platform?
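
(That test only uses testpmd's builtin forwarding modes, e.g.:)

	# in the guest
	testpmd> set fwd txonly
	testpmd> start

	# on the host
	testpmd> set fwd rxonly
	testpmd> start
	testpmd> show port stats all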

And could you provide me more info on your fwd bench?
Do you use dpdk-pktgen on the host, or do you do fwd on the host with a
real NIC as well?

Thanks,
Maxime
> Thanks,
> Maxime
>
>>
>>
>> Thanks
>> Zhihong
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Monday, October 17, 2016 10:15 PM
>>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>>> mst@redhat.com; stephen@networkplumber.org
>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>> support
>>> to the TX path
>>>
>>>
>>>
>>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>>> I'll continue the investigation early next week.
>>>>>
>>>>> The root cause is identified.
>>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
>>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>>> virtio-net kernel driver) use indirect only for Tx.
>>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>>> change will be too big for -rc release.
>>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>>> lib, we can still enable it locally for testing.
>>>>>
>>>>> Yuanhan, is it ok for you?
>>>>
>>>> That's okay.
>>> I'll send a patch to disable it then.
>>>
>>>>
>>>>>
>>>>>> Has anyone already tested Windows guest with vhost-net, which also
>>> has
>>>>>> indirect descs support?
>>>>>
>>>>> I tested and confirm it works with vhost-net.
>>>>
>>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>>> for Rx path, right?
>>>
>>> No, it does support it actually.
>>> I thought it didn't support too, I misread the Kernel implementation of
>>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>>> in Rx path when mergeable buffers is disabled.
>>>
>>> The confusion certainly comes from me, sorry about that.
>>>
>>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27  9:55                 ` Maxime Coquelin
@ 2016-10-27 10:19                   ` Wang, Zhihong
  2016-10-28  7:32                     ` Pierre Pfister (ppfister)
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-27 10:19 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu, stephen, Pierre Pfister (ppfister)
  Cc: dev, vkaplans, mst



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Thursday, October 27, 2016 5:55 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>; stephen@networkplumber.org; Pierre
> Pfister (ppfister) <ppfister@cisco.com>
> Cc: Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 10/27/2016 11:10 AM, Maxime Coquelin wrote:
> > Hi Zhihong,
> >
> > On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >> Hi Maxime,
> >>
> >> Seems indirect desc feature is causing serious performance
> >> degradation on Haswell platform, about 20% drop for both
> >> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> >> both iofwd and macfwd.
> > I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
> > platform, and didn't faced such a drop.
> > Have you tried to pass indirect_desc=off to qemu cmdline to see if you
> > recover the performance?
> >
> > Yuanhan, which platform did you use when you tested it with zero copy?
> >
> >>
> >> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
> >>
> >> Could you please verify if this is true in your test?
> > I'll try -rc1/-rc2 on my platform, and let you know.
> As a first test, I tried again Txonly from the guest to the host (Rxonly),
> where Tx indirect descriptors are used, on my E5-2665 @2.40GHz:
> v16.11-rc1: 10.81Mpps
> v16.11-rc2: 10.91Mpps
> 
> -rc2 is even slightly better in my case.
> Could you please run the same test on your platform?

What I mean is to use rc2 on both host and guest, and compare the
perf between indirect=0 and indirect=1.

I use PVP traffic, and tried both testpmd and OVS as the forwarding
engine on the host, with testpmd in the guest.

Thanks
Zhihong

> 
> And could you provide me more info on your fwd bench?
> Do you use dpdk-pktgen on host, or you do fwd on howt with a real NIC
> also?
> 
> Thanks,
> Maxime
> > Thanks,
> > Maxime
> >
> >>
> >>
> >> Thanks
> >> Zhihong
> >>
> >>> -----Original Message-----
> >>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>> Sent: Monday, October 17, 2016 10:15 PM
> >>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
> >>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
> >>> mst@redhat.com; stephen@networkplumber.org
> >>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> >>> support
> >>> to the TX path
> >>>
> >>>
> >>>
> >>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> >>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> >>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> >>>>>> I'll continue the investigation early next week.
> >>>>>
> >>>>> The root cause is identified.
> >>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses
> indirect
> >>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> >>>>> virtio-net kernel driver) use indirect only for Tx.
> >>>>> I'll implement indirect support for the Rx path in vhost lib, but the
> >>>>> change will be too big for -rc release.
> >>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
> >>>>> lib, we can still enable it locally for testing.
> >>>>>
> >>>>> Yuanhan, is it ok for you?
> >>>>
> >>>> That's okay.
> >>> I'll send a patch to disable it then.
> >>>
> >>>>
> >>>>>
> >>>>>> Has anyone already tested Windows guest with vhost-net, which
> also
> >>> has
> >>>>>> indirect descs support?
> >>>>>
> >>>>> I tested and confirm it works with vhost-net.
> >>>>
> >>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> >>>> for Rx path, right?
> >>>
> >>> No, it does support it actually.
> >>> I thought it didn't support too, I misread the Kernel implementation of
> >>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
> >>> in Rx path when mergeable buffers is disabled.
> >>>
> >>> The confusion certainly comes from me, sorry about that.
> >>>
> >>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27  9:10               ` Maxime Coquelin
  2016-10-27  9:55                 ` Maxime Coquelin
@ 2016-10-27 10:33                 ` Yuanhan Liu
  2016-10-27 10:35                   ` Maxime Coquelin
  2016-10-27 10:53                   ` Maxime Coquelin
  1 sibling, 2 replies; 52+ messages in thread
From: Yuanhan Liu @ 2016-10-27 10:33 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: mst, dev, vkaplans

On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
> Hi Zhihong,
> 
> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >Hi Maxime,
> >
> >Seems indirect desc feature is causing serious performance
> >degradation on Haswell platform, about 20% drop for both
> >mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> >both iofwd and macfwd.
> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
> platform, and didn't faced such a drop.

I was actually wondering whether that may be the cause. I tested it with
my IvyBridge server as well, and I saw no drop.

Maybe you should find a similar platform (Haswell) and have a try?

	--yliu

> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
> recover the performance?
> 
> Yuanhan, which platform did you use when you tested it with zero copy?
> 
> >
> >I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
> >
> >Could you please verify if this is true in your test?
> I'll try -rc1/-rc2 on my platform, and let you know.
> 
> Thanks,
> Maxime
> 
> >
> >
> >Thanks
> >Zhihong
> >
> >>-----Original Message-----
> >>From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>Sent: Monday, October 17, 2016 10:15 PM
> >>To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >>Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
> >><huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
> >>mst@redhat.com; stephen@networkplumber.org
> >>Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> >>to the TX path
> >>
> >>
> >>
> >>On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> >>>On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> >>>>>On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> >>>>>I'll continue the investigation early next week.
> >>>>
> >>>>The root cause is identified.
> >>>>When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
> >>>>for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> >>>>virtio-net kernel driver) use indirect only for Tx.
> >>>>I'll implement indirect support for the Rx path in vhost lib, but the
> >>>>change will be too big for -rc release.
> >>>>I propose in the mean time to disable INDIRECT_DESC feature in vhost
> >>>>lib, we can still enable it locally for testing.
> >>>>
> >>>>Yuanhan, is it ok for you?
> >>>
> >>>That's okay.
> >>I'll send a patch to disable it then.
> >>
> >>>
> >>>>
> >>>>>Has anyone already tested Windows guest with vhost-net, which also
> >>has
> >>>>>indirect descs support?
> >>>>
> >>>>I tested and confirm it works with vhost-net.
> >>>
> >>>I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> >>>for Rx path, right?
> >>
> >>No, it does support it actually.
> >>I thought it didn't support too, I misread the Kernel implementation of
> >>vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
> >>in Rx path when mergeable buffers is disabled.
> >>
> >>The confusion certainly comes from me, sorry about that.
> >>
> >>Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:33                 ` Yuanhan Liu
@ 2016-10-27 10:35                   ` Maxime Coquelin
  2016-10-27 10:46                     ` Yuanhan Liu
  2016-10-27 10:53                   ` Maxime Coquelin
  1 sibling, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-27 10:35 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: mst, dev, vkaplans



On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
>> Hi Zhihong,
>>
>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>> Hi Maxime,
>>>
>>> Seems indirect desc feature is causing serious performance
>>> degradation on Haswell platform, about 20% drop for both
>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>> both iofwd and macfwd.
>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
>> platform, and didn't faced such a drop.
>
> I was actually wondering that may be the cause. I tested it with
> my IvyBridge server as well, I saw no drop.
>
> Maybe you should find a similar platform (Haswell) and have a try?
Yes, that's why I asked Zhihong whether he could test Txonly in the guest
to see if the issue is reproducible like this.
It will be easier for me to find a Haswell machine if it does not have to
be connected back-to-back to a HW/SW packet generator.

Thanks,
Maxime

>
> 	--yliu
>
>> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
>> recover the performance?
>>
>> Yuanhan, which platform did you use when you tested it with zero copy?
>>
>>>
>>> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>>>
>>> Could you please verify if this is true in your test?
>> I'll try -rc1/-rc2 on my platform, and let you know.
>>
>> Thanks,
>> Maxime
>>
>>>
>>>
>>> Thanks
>>> Zhihong
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>> Sent: Monday, October 17, 2016 10:15 PM
>>>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>>>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>>>> mst@redhat.com; stephen@networkplumber.org
>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>>>> to the TX path
>>>>
>>>>
>>>>
>>>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>>>> I'll continue the investigation early next week.
>>>>>>
>>>>>> The root cause is identified.
>>>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
>>>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>>>> virtio-net kernel driver) use indirect only for Tx.
>>>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>>>> change will be too big for -rc release.
>>>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>>>> lib, we can still enable it locally for testing.
>>>>>>
>>>>>> Yuanhan, is it ok for you?
>>>>>
>>>>> That's okay.
>>>> I'll send a patch to disable it then.
>>>>
>>>>>
>>>>>>
>>>>>>> Has anyone already tested Windows guest with vhost-net, which also
>>>> has
>>>>>>> indirect descs support?
>>>>>>
>>>>>> I tested and confirm it works with vhost-net.
>>>>>
>>>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>>>> for Rx path, right?
>>>>
>>>> No, it does support it actually.
>>>> I thought it didn't support too, I misread the Kernel implementation of
>>>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>>>> in Rx path when mergeable buffers is disabled.
>>>>
>>>> The confusion certainly comes from me, sorry about that.
>>>>
>>>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:35                   ` Maxime Coquelin
@ 2016-10-27 10:46                     ` Yuanhan Liu
  2016-10-28  0:49                       ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-10-27 10:46 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: mst, dev, vkaplans

On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
> 
> 
> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> >On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
> >>Hi Zhihong,
> >>
> >>On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >>>Hi Maxime,
> >>>
> >>>Seems indirect desc feature is causing serious performance
> >>>degradation on Haswell platform, about 20% drop for both
> >>>mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> >>>both iofwd and macfwd.
> >>I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
> >>platform, and didn't faced such a drop.
> >
> >I was actually wondering that may be the cause. I tested it with
> >my IvyBridge server as well, I saw no drop.
> >
> >Maybe you should find a similar platform (Haswell) and have a try?
> Yes, that's why I asked Zhihong whether he could test Txonly in guest to
> see if issue is reproducible like this.

I have no Haswell box, otherwise I could do a quick test for you. IIRC,
he tried to disable the indirect_desc feature, then the performance
recovered. So, it's likely the indirect_desc is the culprit here.

> I will be easier for me to find an Haswell machine if it has not to be
> connected back to back to and HW/SW packet generator.

Makes sense.

	--yliu
> 
> Thanks,
> Maxime
> 
> >
> >	--yliu
> >
> >>Have you tried to pass indirect_desc=off to qemu cmdline to see if you
> >>recover the performance?
> >>
> >>Yuanhan, which platform did you use when you tested it with zero copy?
> >>
> >>>
> >>>I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
> >>>
> >>>Could you please verify if this is true in your test?
> >>I'll try -rc1/-rc2 on my platform, and let you know.
> >>
> >>Thanks,
> >>Maxime
> >>
> >>>
> >>>
> >>>Thanks
> >>>Zhihong
> >>>
> >>>>-----Original Message-----
> >>>>From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>>>Sent: Monday, October 17, 2016 10:15 PM
> >>>>To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> >>>>Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
> >>>><huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
> >>>>mst@redhat.com; stephen@networkplumber.org
> >>>>Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> >>>>to the TX path
> >>>>
> >>>>
> >>>>
> >>>>On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> >>>>>On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> >>>>>>>On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
> >>>>>>>I'll continue the investigation early next week.
> >>>>>>
> >>>>>>The root cause is identified.
> >>>>>>When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
> >>>>>>for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> >>>>>>virtio-net kernel driver) use indirect only for Tx.
> >>>>>>I'll implement indirect support for the Rx path in vhost lib, but the
> >>>>>>change will be too big for -rc release.
> >>>>>>I propose in the mean time to disable INDIRECT_DESC feature in vhost
> >>>>>>lib, we can still enable it locally for testing.
> >>>>>>
> >>>>>>Yuanhan, is it ok for you?
> >>>>>
> >>>>>That's okay.
> >>>>I'll send a patch to disable it then.
> >>>>
> >>>>>
> >>>>>>
> >>>>>>>Has anyone already tested Windows guest with vhost-net, which also
> >>>>has
> >>>>>>>indirect descs support?
> >>>>>>
> >>>>>>I tested and confirm it works with vhost-net.
> >>>>>
> >>>>>I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> >>>>>for Rx path, right?
> >>>>
> >>>>No, it does support it actually.
> >>>>I thought it didn't support too, I misread the Kernel implementation of
> >>>>vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
> >>>>in Rx path when mergeable buffers is disabled.
> >>>>
> >>>>The confusion certainly comes from me, sorry about that.
> >>>>
> >>>>Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:33                 ` Yuanhan Liu
  2016-10-27 10:35                   ` Maxime Coquelin
@ 2016-10-27 10:53                   ` Maxime Coquelin
  2016-10-28  6:05                     ` Xu, Qian Q
  1 sibling, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-27 10:53 UTC (permalink / raw)
  To: Yuanhan Liu; +Cc: mst, dev, vkaplans



On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
>> Hi Zhihong,
>>
>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>> Hi Maxime,
>>>
>>> Seems indirect desc feature is causing serious performance
>>> degradation on Haswell platform, about 20% drop for both
>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>> both iofwd and macfwd.
>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
>> platform, and didn't faced such a drop.
>
> I was actually wondering that may be the cause. I tested it with
> my IvyBridge server as well, I saw no drop.
Sorry, mine is a SandyBridge, not IvyBridge.

>
> Maybe you should find a similar platform (Haswell) and have a try?
>
> 	--yliu
>
>> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
>> recover the performance?
>>
>> Yuanhan, which platform did you use when you tested it with zero copy?
>>
>>>
>>> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>>>
>>> Could you please verify if this is true in your test?
>> I'll try -rc1/-rc2 on my platform, and let you know.
>>
>> Thanks,
>> Maxime
>>
>>>
>>>
>>> Thanks
>>> Zhihong
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>> Sent: Monday, October 17, 2016 10:15 PM
>>>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>>>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>>>> mst@redhat.com; stephen@networkplumber.org
>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>>>> to the TX path
>>>>
>>>>
>>>>
>>>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>>>> I'll continue the investigation early next week.
>>>>>>
>>>>>> The root cause is identified.
>>>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses indirect
>>>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>>>> virtio-net kernel driver) use indirect only for Tx.
>>>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>>>> change will be too big for -rc release.
>>>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>>>> lib, we can still enable it locally for testing.
>>>>>>
>>>>>> Yuanhan, is it ok for you?
>>>>>
>>>>> That's okay.
>>>> I'll send a patch to disable it then.
>>>>
>>>>>
>>>>>>
>>>>>>> Has anyone already tested Windows guest with vhost-net, which also
>>>> has
>>>>>>> indirect descs support?
>>>>>>
>>>>>> I tested and confirm it works with vhost-net.
>>>>>
>>>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>>>> for Rx path, right?
>>>>
>>>> No, it does support it actually.
>>>> I thought it didn't support too, I misread the Kernel implementation of
>>>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>>>> in Rx path when mergeable buffers is disabled.
>>>>
>>>> The confusion certainly comes from me, sorry about that.
>>>>
>>>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:46                     ` Yuanhan Liu
@ 2016-10-28  0:49                       ` Wang, Zhihong
  2016-10-28  7:42                         ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-28  0:49 UTC (permalink / raw)
  To: Yuanhan Liu, Maxime Coquelin; +Cc: mst, dev, vkaplans



> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Thursday, October 27, 2016 6:46 PM
> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
> stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
> >
> >
> > On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> > >On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
> > >>Hi Zhihong,
> > >>
> > >>On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> > >>>Hi Maxime,
> > >>>
> > >>>Seems indirect desc feature is causing serious performance
> > >>>degradation on Haswell platform, about 20% drop for both
> > >>>mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> > >>>both iofwd and macfwd.
> > >>I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy
> Bridge
> > >>platform, and didn't faced such a drop.
> > >
> > >I was actually wondering that may be the cause. I tested it with
> > >my IvyBridge server as well, I saw no drop.
> > >
> > >Maybe you should find a similar platform (Haswell) and have a try?
> > Yes, that's why I asked Zhihong whether he could test Txonly in guest to
> > see if issue is reproducible like this.
> 
> I have no Haswell box, otherwise I could do a quick test for you. IIRC,
> he tried to disable the indirect_desc feature, then the performance
> recovered. So, it's likely the indirect_desc is the culprit here.
> 
> > I will be easier for me to find an Haswell machine if it has not to be
> > connected back to back to and HW/SW packet generator.

In fact a simple loopback test will also do, without a packet generator.

Start testpmd in both the host and the guest, then run "start" in one
and "start tx_first 32" in the other.

The perf drop is about 24% in my test.

> 
> Makes sense.
> 
> 	--yliu
> >
> > Thanks,
> > Maxime
> >
> > >
> > >	--yliu
> > >
> > >>Have you tried to pass indirect_desc=off to qemu cmdline to see if you
> > >>recover the performance?
> > >>
> > >>Yuanhan, which platform did you use when you tested it with zero copy?
> > >>
> > >>>
> > >>>I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
> > >>>
> > >>>Could you please verify if this is true in your test?
> > >>I'll try -rc1/-rc2 on my platform, and let you know.
> > >>
> > >>Thanks,
> > >>Maxime
> > >>
> > >>>
> > >>>
> > >>>Thanks
> > >>>Zhihong
> > >>>
> > >>>>-----Original Message-----
> > >>>>From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> > >>>>Sent: Monday, October 17, 2016 10:15 PM
> > >>>>To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > >>>>Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
> > >>>><huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
> > >>>>mst@redhat.com; stephen@networkplumber.org
> > >>>>Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> support
> > >>>>to the TX path
> > >>>>
> > >>>>
> > >>>>
> > >>>>On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
> > >>>>>On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
> > >>>>>>>On my side, I just setup 2 Windows 2016 VMs, and confirm the
> issue.
> > >>>>>>>I'll continue the investigation early next week.
> > >>>>>>
> > >>>>>>The root cause is identified.
> > >>>>>>When INDIRECT_DESC feature is negotiated, Windows guest uses
> indirect
> > >>>>>>for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
> > >>>>>>virtio-net kernel driver) use indirect only for Tx.
> > >>>>>>I'll implement indirect support for the Rx path in vhost lib, but the
> > >>>>>>change will be too big for -rc release.
> > >>>>>>I propose in the mean time to disable INDIRECT_DESC feature in
> vhost
> > >>>>>>lib, we can still enable it locally for testing.
> > >>>>>>
> > >>>>>>Yuanhan, is it ok for you?
> > >>>>>
> > >>>>>That's okay.
> > >>>>I'll send a patch to disable it then.
> > >>>>
> > >>>>>
> > >>>>>>
> > >>>>>>>Has anyone already tested Windows guest with vhost-net, which
> also
> > >>>>has
> > >>>>>>>indirect descs support?
> > >>>>>>
> > >>>>>>I tested and confirm it works with vhost-net.
> > >>>>>
> > >>>>>I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
> > >>>>>for Rx path, right?
> > >>>>
> > >>>>No, it does support it actually.
> > >>>>I thought it didn't support too, I misread the Kernel implementation of
> > >>>>vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
> > >>>>in Rx path when mergeable buffers is disabled.
> > >>>>
> > >>>>The confusion certainly comes from me, sorry about that.
> > >>>>
> > >>>>Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:53                   ` Maxime Coquelin
@ 2016-10-28  6:05                     ` Xu, Qian Q
  0 siblings, 0 replies; 52+ messages in thread
From: Xu, Qian Q @ 2016-10-28  6:05 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu; +Cc: dev, vkaplans, mst

On my BDW-EP platform (similar to HSW), I can also see the performance drop. So what's the next step now?
Intel CPU generations:
SNB --> IVB --> HSW --> BDW-EP

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
Sent: Thursday, October 27, 2016 6:53 PM
To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: mst@redhat.com; dev@dpdk.org; vkaplans@redhat.com
Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to the TX path



On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
>> Hi Zhihong,
>>
>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>> Hi Maxime,
>>>
>>> Seems indirect desc feature is causing serious performance 
>>> degradation on Haswell platform, about 20% drop for both mrg=on and 
>>> mrg=off (--txqflags=0xf00, non-vector version), both iofwd and 
>>> macfwd.
>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy 
>> Bridge platform, and didn't faced such a drop.
>
> I was actually wondering that may be the cause. I tested it with my 
> IvyBridge server as well, I saw no drop.
Sorry, mine is a SandyBridge, not IvyBridge.

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-27 10:19                   ` Wang, Zhihong
@ 2016-10-28  7:32                     ` Pierre Pfister (ppfister)
  2016-10-28  7:58                       ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Pierre Pfister (ppfister) @ 2016-10-28  7:32 UTC (permalink / raw)
  To: Wang, Zhihong; +Cc: mst, dev, vkaplans


> Le 27 oct. 2016 à 12:19, Wang, Zhihong <zhihong.wang@intel.com> a écrit :
> 
> 
> 
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Thursday, October 27, 2016 5:55 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>; stephen@networkplumber.org; Pierre
>> Pfister (ppfister) <ppfister@cisco.com>
>> Cc: Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>> vkaplans@redhat.com; mst@redhat.com
>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>> to the TX path
>> 
>> 
>> 
>> On 10/27/2016 11:10 AM, Maxime Coquelin wrote:
>>> Hi Zhihong,
>>> 
>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>> Hi Maxime,
>>>> 
>>>> Seems indirect desc feature is causing serious performance
>>>> degradation on Haswell platform, about 20% drop for both
>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>>> both iofwd and macfwd.
>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
>>> platform, and didn't faced such a drop.
>>> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
>>> recover the performance?
>>> 
>>> Yuanhan, which platform did you use when you tested it with zero copy?
>>> 
>>>> 
>>>> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>>>> 
>>>> Could you please verify if this is true in your test?
>>> I'll try -rc1/-rc2 on my platform, and let you know.
>> As a first test, I tried again Txonly from the guest to the host (Rxonly),
>> where Tx indirect descriptors are used, on my E5-2665 @2.40GHz:
>> v16.11-rc1: 10.81Mpps
>> v16.11-rc2: 10.91Mpps
>> 
>> -rc2 is even slightly better in my case.
>> Could you please run the same test on your platform?
> 
> I mean to use rc2 as both host and guest, and compare the
> perf between indirect=0 and indirect=1.
> 
> I use PVP traffic, tried both testpmd and OvS as the forwarding
> engine in host, and testpmd in guest.
> 
> Thanks
> Zhihong

From my experience, and as Michael pointed out, the best mode for small packets is obviously
ANY_LAYOUT, so that a single descriptor is used per packet.
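To illustrate the point, a minimal sketch of the two Tx layouts (this is not Virtio PMD code; the buffer addresses and helper names here are assumptions):

#include <stdint.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>

/*
 * Sketch: a 2-descriptor chain (virtio-net header in its own buffer)
 * versus ANY_LAYOUT, where the header is prepended in the same buffer
 * as the frame so a single descriptor is enough.
 */
static void
fill_tx_two_descs(struct vring_desc *desc, uint16_t idx,
                  uint64_t hdr_gpa, uint64_t pkt_gpa, uint32_t pkt_len)
{
        desc[idx].addr  = hdr_gpa;
        desc[idx].len   = sizeof(struct virtio_net_hdr);
        desc[idx].flags = VRING_DESC_F_NEXT;
        desc[idx].next  = idx + 1;

        desc[idx + 1].addr  = pkt_gpa;
        desc[idx + 1].len   = pkt_len;
        desc[idx + 1].flags = 0;
}

static void
fill_tx_any_layout(struct vring_desc *desc, uint16_t idx,
                   uint64_t pkt_gpa, uint32_t pkt_len)
{
        /* The header sits right before the frame in the same buffer. */
        desc[idx].addr  = pkt_gpa - sizeof(struct virtio_net_hdr);
        desc[idx].len   = pkt_len + sizeof(struct virtio_net_hdr);
        desc[idx].flags = 0;
}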

So, disabling indirect descriptors may give you better pps for 64-byte packets, but that doesn't mean you should not implement, or enable, it in your driver. It just means that the guest is not making the right decision, and uses indirect descriptors when it should actually use any_layout.

Given the virtio/vhost design (most decisions come from the guest), the host should be liberal in what it accepts, and should not try to influence the guest implementation by carefully picking the features it supports. Otherwise guests will never get a chance to make the right decisions either.

- Pierre

> 
>> 
>> And could you provide me more info on your fwd bench?
>> Do you use dpdk-pktgen on host, or you do fwd on howt with a real NIC
>> also?
>> 
>> Thanks,
>> Maxime
>>> Thanks,
>>> Maxime
>>> 
>>>> 
>>>> 
>>>> Thanks
>>>> Zhihong
>>>> 
>>>>> -----Original Message-----
>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>> Sent: Monday, October 17, 2016 10:15 PM
>>>>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>>>>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>>>>> mst@redhat.com; stephen@networkplumber.org
>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>> support
>>>>> to the TX path
>>>>> 
>>>>> 
>>>>> 
>>>>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>>>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>>>>> I'll continue the investigation early next week.
>>>>>>> 
>>>>>>> The root cause is identified.
>>>>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses
>> indirect
>>>>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>>>>> virtio-net kernel driver) use indirect only for Tx.
>>>>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>>>>> change will be too big for -rc release.
>>>>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>>>>> lib, we can still enable it locally for testing.
>>>>>>> 
>>>>>>> Yuanhan, is it ok for you?
>>>>>> 
>>>>>> That's okay.
>>>>> I'll send a patch to disable it then.
>>>>> 
>>>>>> 
>>>>>>> 
>>>>>>>> Has anyone already tested Windows guest with vhost-net, which
>> also
>>>>> has
>>>>>>>> indirect descs support?
>>>>>>> 
>>>>>>> I tested and confirm it works with vhost-net.
>>>>>> 
>>>>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>>>>> for Rx path, right?
>>>>> 
>>>>> No, it does support it actually.
>>>>> I thought it didn't support too, I misread the Kernel implementation of
>>>>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>>>>> in Rx path when mergeable buffers is disabled.
>>>>> 
>>>>> The confusion certainly comes from me, sorry about that.
>>>>> 
>>>>> Maxime


^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-28  0:49                       ` Wang, Zhihong
@ 2016-10-28  7:42                         ` Maxime Coquelin
  2016-10-31 10:01                           ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-28  7:42 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: mst, dev, vkaplans



On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
>
>> > -----Original Message-----
>> > From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>> > Sent: Thursday, October 27, 2016 6:46 PM
>> > To: Maxime Coquelin <maxime.coquelin@redhat.com>
>> > Cc: Wang, Zhihong <zhihong.wang@intel.com>;
>> > stephen@networkplumber.org; Pierre Pfister (ppfister)
>> > <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>> > vkaplans@redhat.com; mst@redhat.com
>> > Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>> > to the TX path
>> >
>> > On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
>>> > >
>>> > >
>>> > > On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
>>>> > > >On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin wrote:
>>>>> > > >>Hi Zhihong,
>>>>> > > >>
>>>>> > > >>On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>>> > > >>>Hi Maxime,
>>>>>> > > >>>
>>>>>> > > >>>Seems indirect desc feature is causing serious performance
>>>>>> > > >>>degradation on Haswell platform, about 20% drop for both
>>>>>> > > >>>mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>>>>> > > >>>both iofwd and macfwd.
>>>>> > > >>I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy
>> > Bridge
>>>>> > > >>platform, and didn't faced such a drop.
>>>> > > >
>>>> > > >I was actually wondering that may be the cause. I tested it with
>>>> > > >my IvyBridge server as well, I saw no drop.
>>>> > > >
>>>> > > >Maybe you should find a similar platform (Haswell) and have a try?
>>> > > Yes, that's why I asked Zhihong whether he could test Txonly in guest to
>>> > > see if issue is reproducible like this.
>> >
>> > I have no Haswell box, otherwise I could do a quick test for you. IIRC,
>> > he tried to disable the indirect_desc feature, then the performance
>> > recovered. So, it's likely the indirect_desc is the culprit here.
>> >
>>> > > I will be easier for me to find an Haswell machine if it has not to be
>>> > > connected back to back to and HW/SW packet generator.
> In fact simple loopback test will also do, without pktgen.
>
> Start testpmd in both host and guest, and do "start" in one
> and "start tx_first 32" in another.
>
> Perf drop is about 24% in my test.
>

Thanks, I had never tried this test.
I managed to find a Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
@ 2.30GHz), and can reproduce the problem with the loopback test you
mention. I see a performance drop of about 10% (8.94Mpps/8.08Mpps).
Out of curiosity, what are the numbers you get with your setup?

Since I had never tried this test before, I also ran it on my Sandy Bridge
setup, and I see a performance regression there too, this time of 4%.

If I understand the test correctly, only 32 packets are allocated,
corresponding to a single burst, which is less than the queue size.
So it makes sense that the performance is lower with this test case.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-28  7:32                     ` Pierre Pfister (ppfister)
@ 2016-10-28  7:58                       ` Maxime Coquelin
  2016-11-01  8:15                         ` Yuanhan Liu
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-10-28  7:58 UTC (permalink / raw)
  To: Pierre Pfister (ppfister), Wang, Zhihong; +Cc: mst, dev, vkaplans



On 10/28/2016 09:32 AM, Pierre Pfister (ppfister) wrote:
>
>> Le 27 oct. 2016 à 12:19, Wang, Zhihong <zhihong.wang@intel.com> a écrit :
>>
>>
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Thursday, October 27, 2016 5:55 PM
>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>>> <yuanhan.liu@linux.intel.com>; stephen@networkplumber.org; Pierre
>>> Pfister (ppfister) <ppfister@cisco.com>
>>> Cc: Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>>> vkaplans@redhat.com; mst@redhat.com
>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>>> to the TX path
>>>
>>>
>>>
>>> On 10/27/2016 11:10 AM, Maxime Coquelin wrote:
>>>> Hi Zhihong,
>>>>
>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>> Hi Maxime,
>>>>>
>>>>> Seems indirect desc feature is causing serious performance
>>>>> degradation on Haswell platform, about 20% drop for both
>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>>>> both iofwd and macfwd.
>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an Ivy Bridge
>>>> platform, and didn't faced such a drop.
>>>> Have you tried to pass indirect_desc=off to qemu cmdline to see if you
>>>> recover the performance?
>>>>
>>>> Yuanhan, which platform did you use when you tested it with zero copy?
>>>>
>>>>>
>>>>> I'm using RC2, and the CPU is Xeon E5-2699 v3 @ 2.30GHz.
>>>>>
>>>>> Could you please verify if this is true in your test?
>>>> I'll try -rc1/-rc2 on my platform, and let you know.
>>> As a first test, I tried again Txonly from the guest to the host (Rxonly),
>>> where Tx indirect descriptors are used, on my E5-2665 @2.40GHz:
>>> v16.11-rc1: 10.81Mpps
>>> v16.11-rc2: 10.91Mpps
>>>
>>> -rc2 is even slightly better in my case.
>>> Could you please run the same test on your platform?
>>
>> I mean to use rc2 as both host and guest, and compare the
>> perf between indirect=0 and indirect=1.
>>
>> I use PVP traffic, tried both testpmd and OvS as the forwarding
>> engine in host, and testpmd in guest.
>>
>> Thanks
>> Zhihong
>
> From my experience, and as Michael pointed out, the best mode for small packets is obviously
> ANY_LAYOUT so it uses a single descriptor per packet.
Of course, having a single descriptor is in theory the best way.
But in the current Virtio PMD implementation, with no offloads supported,
we never access the virtio header at transmit time; it is allocated and
zeroed at startup.

In the ANY_LAYOUT case, the virtio header is prepended to the packet and
needs to be zeroed at packet transmit time. The performance impact is
quite significant, as shown by the measurements I made one month ago (txonly):
  - 2 descs per packet: 11.6Mpps
  - 1 desc per packet: 9.6Mpps

As Michael suggested, I tried to replace the memset with direct field
assignments, but it only recovers a small part of the drop.
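For reference, a minimal sketch of the two approaches (not the actual PMD code; the standard struct virtio_net_hdr layout is assumed):

#include <string.h>
#include <linux/virtio_net.h>

/* ANY_LAYOUT case: the header must be cleared for every packet sent. */
static inline void
clear_hdr_memset(struct virtio_net_hdr *hdr)
{
        memset(hdr, 0, sizeof(*hdr));
}

/* Alternative mentioned above: assign each field directly. */
static inline void
clear_hdr_fields(struct virtio_net_hdr *hdr)
{
        hdr->flags       = 0;
        hdr->gso_type    = VIRTIO_NET_HDR_GSO_NONE;
        hdr->hdr_len     = 0;
        hdr->gso_size    = 0;
        hdr->csum_start  = 0;
        hdr->csum_offset = 0;
}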

What I suggested is to introduce a new feature, so that we can skip the
virtio header when no offloads are negotiated.

Maybe you have other ideas?

> So, disabling indirect descriptors may give you better pps for 64 bytes packets, but that doesn't mean you should not implement, or enable, it in your driver. That just means that the guest is not taking the right decision, and uses indirect while it should actually use any_layout.
+1, it really depends on the use-case.
>
> Given virtio/vhost design (most decision comes from the guest), the host should be liberal in what it accepts, and not try to influence guest implementation by carefully picking the features it supports. Otherwise guests will never get a chance to make the right decisions either.
Agree, what we need is to be able to disable Virtio PMD features
without having to rebuild the PMD.
It will certainly require a new API change to add this option.

Thanks,
Maxime

>
> - Pierre
>
>>
>>>
>>> And could you provide me more info on your fwd bench?
>>> Do you use dpdk-pktgen on host, or you do fwd on howt with a real NIC
>>> also?
>>>
>>> Thanks,
>>> Maxime
>>>> Thanks,
>>>> Maxime
>>>>
>>>>>
>>>>>
>>>>> Thanks
>>>>> Zhihong
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>>> Sent: Monday, October 17, 2016 10:15 PM
>>>>>> To: Yuanhan Liu <yuanhan.liu@linux.intel.com>
>>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>; Xie, Huawei
>>>>>> <huawei.xie@intel.com>; dev@dpdk.org; vkaplans@redhat.com;
>>>>>> mst@redhat.com; stephen@networkplumber.org
>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>>> support
>>>>>> to the TX path
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 10/17/2016 03:21 PM, Yuanhan Liu wrote:
>>>>>>> On Mon, Oct 17, 2016 at 01:23:23PM +0200, Maxime Coquelin wrote:
>>>>>>>>> On my side, I just setup 2 Windows 2016 VMs, and confirm the issue.
>>>>>>>>> I'll continue the investigation early next week.
>>>>>>>>
>>>>>>>> The root cause is identified.
>>>>>>>> When INDIRECT_DESC feature is negotiated, Windows guest uses
>>> indirect
>>>>>>>> for both Tx and Rx descriptors, whereas Linux guests (Virtio PMD &
>>>>>>>> virtio-net kernel driver) use indirect only for Tx.
>>>>>>>> I'll implement indirect support for the Rx path in vhost lib, but the
>>>>>>>> change will be too big for -rc release.
>>>>>>>> I propose in the mean time to disable INDIRECT_DESC feature in vhost
>>>>>>>> lib, we can still enable it locally for testing.
>>>>>>>>
>>>>>>>> Yuanhan, is it ok for you?
>>>>>>>
>>>>>>> That's okay.
>>>>>> I'll send a patch to disable it then.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>> Has anyone already tested Windows guest with vhost-net, which
>>> also
>>>>>> has
>>>>>>>>> indirect descs support?
>>>>>>>>
>>>>>>>> I tested and confirm it works with vhost-net.
>>>>>>>
>>>>>>> I'm a bit confused then. IIRC, vhost-net also doesn't support indirect
>>>>>>> for Rx path, right?
>>>>>>
>>>>>> No, it does support it actually.
>>>>>> I thought it didn't support too, I misread the Kernel implementation of
>>>>>> vhost-net and virtio-net. Acutally, virtio-net makes use of indirect
>>>>>> in Rx path when mergeable buffers is disabled.
>>>>>>
>>>>>> The confusion certainly comes from me, sorry about that.
>>>>>>
>>>>>> Maxime
>

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-28  7:42                         ` Maxime Coquelin
@ 2016-10-31 10:01                           ` Wang, Zhihong
  2016-11-02 10:51                             ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-10-31 10:01 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu; +Cc: mst, dev, vkaplans



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Friday, October 28, 2016 3:42 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
> >
> >> > -----Original Message-----
> >> > From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> >> > Sent: Thursday, October 27, 2016 6:46 PM
> >> > To: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> > Cc: Wang, Zhihong <zhihong.wang@intel.com>;
> >> > stephen@networkplumber.org; Pierre Pfister (ppfister)
> >> > <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org;
> >> > vkaplans@redhat.com; mst@redhat.com
> >> > Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> support
> >> > to the TX path
> >> >
> >> > On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
> >>> > >
> >>> > >
> >>> > > On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> >>>> > > >On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
> wrote:
> >>>>> > > >>Hi Zhihong,
> >>>>> > > >>
> >>>>> > > >>On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >>>>>> > > >>>Hi Maxime,
> >>>>>> > > >>>
> >>>>>> > > >>>Seems indirect desc feature is causing serious performance
> >>>>>> > > >>>degradation on Haswell platform, about 20% drop for both
> >>>>>> > > >>>mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
> >>>>>> > > >>>both iofwd and macfwd.
> >>>>> > > >>I tested PVP (with macswap on guest) and Txonly/Rxonly on an
> Ivy
> >> > Bridge
> >>>>> > > >>platform, and didn't faced such a drop.
> >>>> > > >
> >>>> > > >I was actually wondering that may be the cause. I tested it with
> >>>> > > >my IvyBridge server as well, I saw no drop.
> >>>> > > >
> >>>> > > >Maybe you should find a similar platform (Haswell) and have a try?
> >>> > > Yes, that's why I asked Zhihong whether he could test Txonly in guest
> to
> >>> > > see if issue is reproducible like this.
> >> >
> >> > I have no Haswell box, otherwise I could do a quick test for you. IIRC,
> >> > he tried to disable the indirect_desc feature, then the performance
> >> > recovered. So, it's likely the indirect_desc is the culprit here.
> >> >
> >>> > > I will be easier for me to find an Haswell machine if it has not to be
> >>> > > connected back to back to and HW/SW packet generator.
> > In fact simple loopback test will also do, without pktgen.
> >
> > Start testpmd in both host and guest, and do "start" in one
> > and "start tx_first 32" in another.
> >
> > Perf drop is about 24% in my test.
> >
> 
> Thanks, I never tried this test.
> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
> @ 2.30GHz), and can reproduce the problem with the loop test you
> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
> Out of curiosity, what are the numbers you get with your setup?

Hi Maxime,

Let's align our test case to RC2, mrg=on, loopback, on Haswell.
My results below:
 1. indirect=1: 5.26 Mpps
 2. indirect=0: 6.54 Mpps

It's about a 24% drop.

> 
> As I never tried this test, I run it again on my Sandy Bridge setup, and
> I also see a performance regression, this time of 4%.
> 
> If I understand correctly the test, only 32 packets are allocated,
> corresponding to a single burst, which is less than the queue size.
> So it makes sense that the performance is lower with this test case.

Actually it's 32 bursts, so 1024 packets in total, enough to
fill the queue.

Thanks
Zhihong

> 
> Thanks,
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-28  7:58                       ` Maxime Coquelin
@ 2016-11-01  8:15                         ` Yuanhan Liu
  2016-11-01  9:39                           ` Thomas Monjalon
  0 siblings, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-11-01  8:15 UTC (permalink / raw)
  To: Maxime Coquelin; +Cc: mst, dev, vkaplans

On Fri, Oct 28, 2016 at 09:58:51AM +0200, Maxime Coquelin wrote:
> >From my experience, and as Michael pointed out, the best mode for small packets is obviously
> >ANY_LAYOUT so it uses a single descriptor per packet.
> Of course, having a single descriptor is in theory the best way.
> But, in current Virtio PMD implementation, with no offload supported, we
> never access the virtio header at transmit time, it is allocated and
> zeroed at startup.
> 
> For ANY_LAYOUT case, the virtio header is prepended to the packet, and
> need to be zeroed at packet transmit time. The performance impact is
> quite important, as show the measurements I made one month ago (txonly):
>  - 2 descs per packet: 11.6Mpps
>  - 1 desc per packet: 9.6Mpps
> 
> As Michael suggested, I tried to replace the memset by direct
> fields assignation, but it only recovers a small part of the drop.
> 
> What I suggested is to introduce a new feature, so that we can skip the
> virtio header when no offload is negotiated.
> 
> Maybe you have other ideas?
> 
> >So, disabling indirect descriptors may give you better pps for 64 bytes packets, but that doesn't mean you should not implement, or enable, it in your driver. That just means that the guest is not taking the right decision, and uses indirect while it should actually use any_layout.
> +1, it really depends on the use-case.
> >
> >Given virtio/vhost design (most decision comes from the guest), the host should be liberal in what it accepts, and not try to influence guest implementation by carefully picking the features it supports. Otherwise guests will never get a chance to make the right decisions either.
> Agree, what we need is to be able to disable Virtio PMD features
> without having to rebuild the PMD.

I have wanted this feature (or more precisely, this ability) for a long time.
For example, I wish there were an option like "force_legacy" when
both legacy and modern exist.

> It will certainly require an new API change to add this option.

I don't think it will work.

Firstly, generally speaking, as discussed before, it's not a good idea to
introduce PMD-specific APIs.

Secondly, even if it's okay to do so, you need to let the application
(say testpmd) invoke it. Once you have done that, you need to introduce
some CLI options to make sure that part is invoked.

And as stated before, it's also not a good idea to add virtio-PMD-specific
CLI options to testpmd.

For this case, however, I think it makes more sense if testpmd provides
some commands to enable/disable the csum and TSO stuff. With that, we
could enable them dynamically.

It has "tso set/show" commands though: it just doesn't look like the
right interface to me to enable/disable tso.

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-01  8:15                         ` Yuanhan Liu
@ 2016-11-01  9:39                           ` Thomas Monjalon
  2016-11-02  2:44                             ` Yuanhan Liu
  0 siblings, 1 reply; 52+ messages in thread
From: Thomas Monjalon @ 2016-11-01  9:39 UTC (permalink / raw)
  To: Yuanhan Liu, Maxime Coquelin; +Cc: dev, vkaplans, mst

2016-11-01 16:15, Yuanhan Liu:
> On Fri, Oct 28, 2016 at 09:58:51AM +0200, Maxime Coquelin wrote:
> > Agree, what we need is to be able to disable Virtio PMD features
> > without having to rebuild the PMD.
> 
> I want this feature (or more precisely, ability) long times ago.
> For example, I'd wish there is an option like "force_legacy" when
> both legacy and modern exist.

You can change the behaviour of the driver with a run-time parameter
passed as a struct rte_devargs.
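A minimal sketch of how a PMD could consume such a parameter via librte_kvargs (the "indirect" key name and its use in the virtio PMD are hypothetical, not an existing interface):

#include <rte_kvargs.h>

static int
parse_indirect_arg(const char *key, const char *value, void *opaque)
{
        int *enabled = opaque;

        (void)key;
        *enabled = (value != NULL && value[0] == '1');
        return 0;
}

/* devargs_str is the ",key=value,..." string attached to the device. */
static int
parse_runtime_args(const char *devargs_str, int *indirect_enabled)
{
        static const char * const keys[] = { "indirect", NULL };
        struct rte_kvargs *kvlist;

        *indirect_enabled = 1; /* keep the feature on by default */
        if (devargs_str == NULL)
                return 0;

        kvlist = rte_kvargs_parse(devargs_str, keys);
        if (kvlist == NULL)
                return -1;

        rte_kvargs_process(kvlist, "indirect", parse_indirect_arg,
                           indirect_enabled);
        rte_kvargs_free(kvlist);
        return 0;
}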

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-01  9:39                           ` Thomas Monjalon
@ 2016-11-02  2:44                             ` Yuanhan Liu
  0 siblings, 0 replies; 52+ messages in thread
From: Yuanhan Liu @ 2016-11-02  2:44 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, vkaplans, mst

On Tue, Nov 01, 2016 at 10:39:35AM +0100, Thomas Monjalon wrote:
> 2016-11-01 16:15, Yuanhan Liu:
> > On Fri, Oct 28, 2016 at 09:58:51AM +0200, Maxime Coquelin wrote:
> > > Agree, what we need is to be able to disable Virtio PMD features
> > > without having to rebuild the PMD.
> > 
> > I want this feature (or more precisely, ability) long times ago.
> > For example, I'd wish there is an option like "force_legacy" when
> > both legacy and modern exist.
> 
> You can change the behaviour of the driver with a run-time parameter
> as a struct rte_devargs.

Thanks for the tip! Yeah, it's a workable solution, not an ideal one though.

	--yliu

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-10-31 10:01                           ` Wang, Zhihong
@ 2016-11-02 10:51                             ` Maxime Coquelin
  2016-11-03  8:11                               ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-02 10:51 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: mst, dev, vkaplans



On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Friday, October 28, 2016 3:42 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>
>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>> vkaplans@redhat.com; mst@redhat.com
>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>> to the TX path
>>
>>
>>
>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
>>>
>>>>> -----Original Message-----
>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>>>>> Sent: Thursday, October 27, 2016 6:46 PM
>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>> dev@dpdk.org;
>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>> support
>>>>> to the TX path
>>>>>
>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
>> wrote:
>>>>>>>>>>> Hi Zhihong,
>>>>>>>>>>>
>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>>>>>>>>>> Hi Maxime,
>>>>>>>>>>>>>
>>>>>>>>>>>>> Seems indirect desc feature is causing serious performance
>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for both
>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>>>>>>>>>>>> both iofwd and macfwd.
>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an
>> Ivy
>>>>> Bridge
>>>>>>>>>>> platform, and didn't faced such a drop.
>>>>>>>>>
>>>>>>>>> I was actually wondering that may be the cause. I tested it with
>>>>>>>>> my IvyBridge server as well, I saw no drop.
>>>>>>>>>
>>>>>>>>> Maybe you should find a similar platform (Haswell) and have a try?
>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly in guest
>> to
>>>>>>> see if issue is reproducible like this.
>>>>>
>>>>> I have no Haswell box, otherwise I could do a quick test for you. IIRC,
>>>>> he tried to disable the indirect_desc feature, then the performance
>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
>>>>>
>>>>>>> I will be easier for me to find an Haswell machine if it has not to be
>>>>>>> connected back to back to and HW/SW packet generator.
>>> In fact simple loopback test will also do, without pktgen.
>>>
>>> Start testpmd in both host and guest, and do "start" in one
>>> and "start tx_first 32" in another.
>>>
>>> Perf drop is about 24% in my test.
>>>
>>
>> Thanks, I never tried this test.
>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
>> @ 2.30GHz), and can reproduce the problem with the loop test you
>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
>> Out of curiosity, what are the numbers you get with your setup?
>
> Hi Maxime,
>
> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
> My results below:
>  1. indirect=1: 5.26 Mpps
>  2. indirect=0: 6.54 Mpps
>
> It's about 24% drop.
OK, so on my side, same setup on Haswell:
1. indirect=1: 7.44 Mpps
2. indirect=0: 8.18 Mpps

Still a 10% drop in my case with mrg=on.

The strange thing with both of our figures is that they are below what I
obtain with my SandyBridge machine. The SB CPU frequency is 4% higher,
but that doesn't explain the gap between the measurements.

I'm continuing the investigations on my side.
Maybe we should fix a deadline, and decide to disable indirect in the
Virtio PMD if the root cause is not identified/fixed by then?

Yuanhan, what do you think?

Regards,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-02 10:51                             ` Maxime Coquelin
@ 2016-11-03  8:11                               ` Maxime Coquelin
  2016-11-04  6:18                                 ` Xu, Qian Q
  2016-11-04  7:20                                 ` Wang, Zhihong
  0 siblings, 2 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-03  8:11 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu; +Cc: mst, dev, vkaplans



On 11/02/2016 11:51 AM, Maxime Coquelin wrote:
>
>
> On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Friday, October 28, 2016 3:42 PM
>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>>> <yuanhan.liu@linux.intel.com>
>>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>>> vkaplans@redhat.com; mst@redhat.com
>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>> support
>>> to the TX path
>>>
>>>
>>>
>>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
>>>>
>>>>>> -----Original Message-----
>>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>>>>>> Sent: Thursday, October 27, 2016 6:46 PM
>>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
>>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>>> dev@dpdk.org;
>>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>> support
>>>>>> to the TX path
>>>>>>
>>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
>>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
>>> wrote:
>>>>>>>>>>>> Hi Zhihong,
>>>>>>>>>>>>
>>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>>>>>>>>>>> Hi Maxime,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Seems indirect desc feature is causing serious performance
>>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for both
>>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector version),
>>>>>>>>>>>>>> both iofwd and macfwd.
>>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on an
>>> Ivy
>>>>>> Bridge
>>>>>>>>>>>> platform, and didn't faced such a drop.
>>>>>>>>>>
>>>>>>>>>> I was actually wondering that may be the cause. I tested it with
>>>>>>>>>> my IvyBridge server as well, I saw no drop.
>>>>>>>>>>
>>>>>>>>>> Maybe you should find a similar platform (Haswell) and have a
>>>>>>>>>> try?
>>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly in
>>>>>>>> guest
>>> to
>>>>>>>> see if issue is reproducible like this.
>>>>>>
>>>>>> I have no Haswell box, otherwise I could do a quick test for you.
>>>>>> IIRC,
>>>>>> he tried to disable the indirect_desc feature, then the performance
>>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
>>>>>>
>>>>>>>> I will be easier for me to find an Haswell machine if it has not
>>>>>>>> to be
>>>>>>>> connected back to back to and HW/SW packet generator.
>>>> In fact simple loopback test will also do, without pktgen.
>>>>
>>>> Start testpmd in both host and guest, and do "start" in one
>>>> and "start tx_first 32" in another.
>>>>
>>>> Perf drop is about 24% in my test.
>>>>
>>>
>>> Thanks, I never tried this test.
>>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
>>> @ 2.30GHz), and can reproduce the problem with the loop test you
>>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
>>> Out of curiosity, what are the numbers you get with your setup?
>>
>> Hi Maxime,
>>
>> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
>> My results below:
>>  1. indirect=1: 5.26 Mpps
>>  2. indirect=0: 6.54 Mpps
>>
>> It's about 24% drop.
> OK, so on my side, same setup on Haswell:
> 1. indirect=1: 7.44 Mpps
> 2. indirect=0: 8.18 Mpps
>
> Still 10% drop in my case with mrg=on.
>
> The strange thing with both of our figures is that this is below from
> what I obtain with my SandyBridge machine. The SB cpu freq is 4% higher,
> but that doesn't explain the gap between the measurements.
>
> I'm continuing the investigations on my side.
> Maybe we should fix a deadline, and decide do disable indirect in
> Virtio PMD if root cause not identified/fixed at some point?
>
> Yuanhan, what do you think?

I have done some measurements using perf, and now understand better
what happens.

With indirect descriptors, I can see a cache miss when fetching the
descriptors in the indirect table. Actually, this is expected, so
we prefetch the first desc as soon as possible, but still not soon
enough to make it transparent.
In the direct descriptors case, the desc in the virtqueue seems to
remain in the cache from its previous use, so we have a hit.

That said, in a realistic use-case, I think we should not have a hit,
even with direct descriptors.
Indeed, the test case uses testpmd on the guest side with forwarding set
to IO mode, which means the packet content is never accessed by the guest.

In my experiments, I usually set the "macswap" forwarding mode, which
swaps the src and dest MAC addresses in the packet. I find it more
realistic, because I don't see the point in sending packets to the guest
if they are never accessed (not even their headers).

I tried the test case again, this time setting the forwarding mode
to macswap in the guest. Now I get the same performance with both
direct and indirect (indirect is even a little better with a small
optimization, consisting in systematically prefetching the first 2 descs,
since we know they are contiguous).
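For reference, a rough sketch of that prefetch idea (not the committed code; it assumes the vhost-internal struct virtio_net and gpa_to_vva() helper from vhost.h):

#include <stdint.h>
#include <linux/virtio_ring.h>
#include <rte_prefetch.h>
#include <rte_branch_prediction.h>
#include "vhost.h" /* struct virtio_net, gpa_to_vva() */

/* Map an indirect table and prefetch its first two (contiguous) entries. */
static int
map_and_prefetch_indirect(struct virtio_net *dev, struct vring_desc *desc,
                          struct vring_desc **table, uint32_t *nr_descs)
{
        struct vring_desc *idescs;

        if (!(desc->flags & VRING_DESC_F_INDIRECT))
                return 0;

        idescs = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev, desc->addr);
        if (unlikely(idescs == NULL))
                return -1;

        rte_prefetch0(&idescs[0]);
        if (desc->len > sizeof(struct vring_desc))
                rte_prefetch0(&idescs[1]);

        *table = idescs;
        *nr_descs = desc->len / sizeof(struct vring_desc);
        return 1;
}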

Do you agree we should assume that the packet (header and/or buf) will
always be accessed by the guest application?
If so, do you agree we should keep indirect descs enabled, and maybe
update the test cases?

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-03  8:11                               ` Maxime Coquelin
@ 2016-11-04  6:18                                 ` Xu, Qian Q
  2016-11-04  7:41                                   ` Maxime Coquelin
  2016-11-04  7:20                                 ` Wang, Zhihong
  1 sibling, 1 reply; 52+ messages in thread
From: Xu, Qian Q @ 2016-11-04  6:18 UTC (permalink / raw)
  To: Maxime Coquelin, Wang, Zhihong, Yuanhan Liu; +Cc: mst, dev, vkaplans



-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
Sent: Thursday, November 3, 2016 4:11 PM
To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu <yuanhan.liu@linux.intel.com>
Cc: mst@redhat.com; dev@dpdk.org; vkaplans@redhat.com
Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to the TX path


>
> The strange thing with both of our figures is that this is below from 
> what I obtain with my SandyBridge machine. The SB cpu freq is 4% 
> higher, but that doesn't explain the gap between the measurements.
>
> I'm continuing the investigations on my side.
> Maybe we should fix a deadline, and decide do disable indirect in 
> Virtio PMD if root cause not identified/fixed at some point?
>
> Yuanhan, what do you think?

I have done some measurements using perf, and know understand better what happens.

With indirect descriptors, I can see a cache miss when fetching the descriptors in the indirect table. Actually, this is expected, so we prefetch the first desc as soon as possible, but still not soon enough to make it transparent.
In direct descriptors case, the desc in the virtqueue seems to be remain in the cache from its previous use, so we have a hit.

That said, in realistic use-case, I think we should not have a hit, even with direct descriptors.
Indeed, the test case use testpmd on guest side with the forwarding set in IO mode. It means the packet content is never accessed by the guest.

In my experiments, I am used to set the "macswap" forwarding mode, which swaps src and dest MAC addresses in the packet. I find it more realistic, because I don't see the point in sending packets to the guest if it is not accessed (not even its header).

I tried again the test case, this time with setting the forwarding mode to macswap in the guest. This time, I get same performance with both direct and indirect (indirect even a little better with a small optimization, consisting in prefetching the 2 first descs systematically as we know there are contiguous).

Do you agree we should assume that the packet (header or/and buf) will always be accessed by the guest application?
----Maybe it's true in many real use cases. But we also need to ensure there is no performance drop for "io fwd". As far as I know, the OVS-DPDK team will do their performance benchmarking based on "IO fwd" for the virtio part, so they will also see some performance drop. And we were just wondering whether it's possible to make the feature off by default, so that someone who wants to use it can turn it on. People can choose whether they want to use the feature, just like vhost dequeue zero copy.

If so, do you agree we should keep indirect descs enabled, and maybe update the test cases?

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-03  8:11                               ` Maxime Coquelin
  2016-11-04  6:18                                 ` Xu, Qian Q
@ 2016-11-04  7:20                                 ` Wang, Zhihong
  2016-11-04  7:57                                   ` Maxime Coquelin
  1 sibling, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-11-04  7:20 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Thursday, November 3, 2016 4:11 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 11/02/2016 11:51 AM, Maxime Coquelin wrote:
> >
> >
> > On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>> Sent: Friday, October 28, 2016 3:42 PM
> >>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> >>> <yuanhan.liu@linux.intel.com>
> >>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> >>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org;
> >>> vkaplans@redhat.com; mst@redhat.com
> >>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> >>> support
> >>> to the TX path
> >>>
> >>>
> >>>
> >>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
> >>>>
> >>>>>> -----Original Message-----
> >>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> >>>>>> Sent: Thursday, October 27, 2016 6:46 PM
> >>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
> >>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
> >>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> >>> dev@dpdk.org;
> >>>>>> vkaplans@redhat.com; mst@redhat.com
> >>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> >>> support
> >>>>>> to the TX path
> >>>>>>
> >>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> >>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
> >>> wrote:
> >>>>>>>>>>>> Hi Zhihong,
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >>>>>>>>>>>>>> Hi Maxime,
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Seems indirect desc feature is causing serious
> performance
> >>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for both
> >>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector
> version),
> >>>>>>>>>>>>>> both iofwd and macfwd.
> >>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on
> an
> >>> Ivy
> >>>>>> Bridge
> >>>>>>>>>>>> platform, and didn't faced such a drop.
> >>>>>>>>>>
> >>>>>>>>>> I was actually wondering that may be the cause. I tested it with
> >>>>>>>>>> my IvyBridge server as well, I saw no drop.
> >>>>>>>>>>
> >>>>>>>>>> Maybe you should find a similar platform (Haswell) and have a
> >>>>>>>>>> try?
> >>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly in
> >>>>>>>> guest
> >>> to
> >>>>>>>> see if issue is reproducible like this.
> >>>>>>
> >>>>>> I have no Haswell box, otherwise I could do a quick test for you.
> >>>>>> IIRC,
> >>>>>> he tried to disable the indirect_desc feature, then the performance
> >>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
> >>>>>>
> >>>>>>>> I will be easier for me to find an Haswell machine if it has not
> >>>>>>>> to be
> >>>>>>>> connected back to back to and HW/SW packet generator.
> >>>> In fact simple loopback test will also do, without pktgen.
> >>>>
> >>>> Start testpmd in both host and guest, and do "start" in one
> >>>> and "start tx_first 32" in another.
> >>>>
> >>>> Perf drop is about 24% in my test.
> >>>>
> >>>
> >>> Thanks, I never tried this test.
> >>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
> >>> @ 2.30GHz), and can reproduce the problem with the loop test you
> >>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
> >>> Out of curiosity, what are the numbers you get with your setup?
> >>
> >> Hi Maxime,
> >>
> >> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
> >> My results below:
> >>  1. indirect=1: 5.26 Mpps
> >>  2. indirect=0: 6.54 Mpps
> >>
> >> It's about 24% drop.
> > OK, so on my side, same setup on Haswell:
> > 1. indirect=1: 7.44 Mpps
> > 2. indirect=0: 8.18 Mpps
> >
> > Still 10% drop in my case with mrg=on.
> >
> > The strange thing with both of our figures is that this is below from
> > what I obtain with my SandyBridge machine. The SB cpu freq is 4% higher,
> > but that doesn't explain the gap between the measurements.
> >
> > I'm continuing the investigations on my side.
> > Maybe we should fix a deadline, and decide do disable indirect in
> > Virtio PMD if root cause not identified/fixed at some point?
> >
> > Yuanhan, what do you think?
> 
> I have done some measurements using perf, and know understand better
> what happens.
> 
> With indirect descriptors, I can see a cache miss when fetching the
> descriptors in the indirect table. Actually, this is expected, so
> we prefetch the first desc as soon as possible, but still not soon
> enough to make it transparent.
> In direct descriptors case, the desc in the virtqueue seems to be
> remain in the cache from its previous use, so we have a hit.
> 
> That said, in realistic use-case, I think we should not have a hit,
> even with direct descriptors.
> Indeed, the test case use testpmd on guest side with the forwarding set
> in IO mode. It means the packet content is never accessed by the guest.
> 
> In my experiments, I am used to set the "macswap" forwarding mode, which
> swaps src and dest MAC addresses in the packet. I find it more
> realistic, because I don't see the point in sending packets to the guest
> if it is not accessed (not even its header).
> 
> I tried again the test case, this time with setting the forwarding mode
> to macswap in the guest. This time, I get same performance with both
> direct and indirect (indirect even a little better with a small
> optimization, consisting in prefetching the 2 first descs
> systematically as we know there are contiguous).


Hi Maxime,

I did some more macswap testing and found out more here:

 1. I did the loopback test on another HSW machine with the same H/W,
    and indirect_desc on and off seem to have close perf

 2. So I checked the gcc version:

     *  Previous: gcc version 6.2.1 20160916 (Fedora 24)

     *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)

    On the previous one, indirect_desc has a 20% drop

 3. Then I compiled the binary on Ubuntu and copied it to Fedora with
    scp, and as expected I got the same perf as on Ubuntu and the perf
    gap disappeared, so gcc is definitely one factor here

 4. Then I used the Ubuntu binary on Fedora for the PVP test, and the
    perf gap came back again, matching the Fedora binary results:
    indirect_desc causes about a 20% drop

So all in all, could you try PVP traffic on HSW to see how it works?


> 
> Do you agree we should assume that the packet (header or/and buf) will
> always be accessed by the guest application?
> If so, do you agree we should keep indirect descs enabled, and maybe
> update the test cases?


I agree with you that the mac/macswap test is more realistic and makes
more sense for real applications.


Thanks
Zhihong



> 
> Thanks,
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04  6:18                                 ` Xu, Qian Q
@ 2016-11-04  7:41                                   ` Maxime Coquelin
  0 siblings, 0 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04  7:41 UTC (permalink / raw)
  To: Xu, Qian Q, Wang, Zhihong, Yuanhan Liu; +Cc: mst, dev, vkaplans

Hi,

On 11/04/2016 07:18 AM, Xu, Qian Q wrote:
>
>
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Maxime Coquelin
> Sent: Thursday, November 3, 2016 4:11 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu <yuanhan.liu@linux.intel.com>
> Cc: mst@redhat.com; dev@dpdk.org; vkaplans@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to the TX path
>
>
>>
>> The strange thing with both of our figures is that this is below from
>> what I obtain with my SandyBridge machine. The SB cpu freq is 4%
>> higher, but that doesn't explain the gap between the measurements.
>>
>> I'm continuing the investigations on my side.
>> Maybe we should fix a deadline, and decide do disable indirect in
>> Virtio PMD if root cause not identified/fixed at some point?
>>
>> Yuanhan, what do you think?
>
> I have done some measurements using perf, and know understand better what happens.
>
> With indirect descriptors, I can see a cache miss when fetching the descriptors in the indirect table. Actually, this is expected, so we prefetch the first desc as soon as possible, but still not soon enough to make it transparent.
> In direct descriptors case, the desc in the virtqueue seems to be remain in the cache from its previous use, so we have a hit.
>
> That said, in realistic use-case, I think we should not have a hit, even with direct descriptors.
> Indeed, the test case use testpmd on guest side with the forwarding set in IO mode. It means the packet content is never accessed by the guest.
>
> In my experiments, I am used to set the "macswap" forwarding mode, which swaps src and dest MAC addresses in the packet. I find it more realistic, because I don't see the point in sending packets to the guest if it is not accessed (not even its header).
>
> I tried again the test case, this time with setting the forwarding mode to macswap in the guest. This time, I get same performance with both direct and indirect (indirect even a little better with a small optimization, consisting in prefetching the 2 first descs systematically as we know there are contiguous).
>
> Do you agree we should assume that the packet (header or/and buf) will always be accessed by the guest application?
> ----Maybe it's true in many real use case. But we also need ensure the performance for "io fwd" has no performance drop. As I know, OVS-DPDK team will do the performance benchmark based on "IO fwd" for virtio part, so they will also see some performance drop. And we just thought if it's possible to make the feature default off then if someone wanted to use it can turn it on. People can choose if they want to use the feature, just like vhost dequeue zero copy.

OVS adds overhead compared to testpmd on the host, and its cache
utilization might have the same effect as doing macswap.
Do you know who could test with OVS? I would be interested in the
results.
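For clarity, by the macswap effect I just mean that the guest really
touches the first cache line of each packet: the forwarding mode swaps
the source and destination MAC addresses, along the lines of this
simplified sketch (not testpmd's actual code; macswap_one() is only an
illustrative name):

#include <rte_ether.h>
#include <rte_mbuf.h>

/* Swap src/dst MAC, so the packet header cache line is both read and
 * written by the guest application.
 */
static void
macswap_one(struct rte_mbuf *m)
{
	struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
	struct ether_addr tmp;

	ether_addr_copy(&eth->d_addr, &tmp);
	ether_addr_copy(&eth->s_addr, &eth->d_addr);
	ether_addr_copy(&tmp, &eth->s_addr);
}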

And the difference with zero-copy today is that it can be enabled at
runtime, whereas for indirect we can only do it at build time on the
guest side.

It can be disabled on the QEMU command line, though.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04  7:20                                 ` Wang, Zhihong
@ 2016-11-04  7:57                                   ` Maxime Coquelin
  2016-11-04  7:59                                     ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04  7:57 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst

Hi Zhihong,

On 11/04/2016 08:20 AM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Thursday, November 3, 2016 4:11 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>
>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>> vkaplans@redhat.com; mst@redhat.com
>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
>> to the TX path
>>
>>
>>
>> On 11/02/2016 11:51 AM, Maxime Coquelin wrote:
>>>
>>>
>>> On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>> Sent: Friday, October 28, 2016 3:42 PM
>>>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>>>>> <yuanhan.liu@linux.intel.com>
>>>>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>> dev@dpdk.org;
>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>> support
>>>>> to the TX path
>>>>>
>>>>>
>>>>>
>>>>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>>>>>>>> Sent: Thursday, October 27, 2016 6:46 PM
>>>>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
>>>>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>>>>> dev@dpdk.org;
>>>>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>> support
>>>>>>>> to the TX path
>>>>>>>>
>>>>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
>>>>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
>>>>> wrote:
>>>>>>>>>>>>>> Hi Zhihong,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>>>>>>>>>>>>> Hi Maxime,
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Seems indirect desc feature is causing serious
>> performance
>>>>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for both
>>>>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector
>> version),
>>>>>>>>>>>>>>>> both iofwd and macfwd.
>>>>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on
>> an
>>>>> Ivy
>>>>>>>> Bridge
>>>>>>>>>>>>>> platform, and didn't faced such a drop.
>>>>>>>>>>>>
>>>>>>>>>>>> I was actually wondering that may be the cause. I tested it with
>>>>>>>>>>>> my IvyBridge server as well, I saw no drop.
>>>>>>>>>>>>
>>>>>>>>>>>> Maybe you should find a similar platform (Haswell) and have a
>>>>>>>>>>>> try?
>>>>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly in
>>>>>>>>>> guest
>>>>> to
>>>>>>>>>> see if issue is reproducible like this.
>>>>>>>>
>>>>>>>> I have no Haswell box, otherwise I could do a quick test for you.
>>>>>>>> IIRC,
>>>>>>>> he tried to disable the indirect_desc feature, then the performance
>>>>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
>>>>>>>>
>>>>>>>>>> I will be easier for me to find an Haswell machine if it has not
>>>>>>>>>> to be
>>>>>>>>>> connected back to back to and HW/SW packet generator.
>>>>>> In fact simple loopback test will also do, without pktgen.
>>>>>>
>>>>>> Start testpmd in both host and guest, and do "start" in one
>>>>>> and "start tx_first 32" in another.
>>>>>>
>>>>>> Perf drop is about 24% in my test.
>>>>>>
>>>>>
>>>>> Thanks, I never tried this test.
>>>>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU E5-2699 v3
>>>>> @ 2.30GHz), and can reproduce the problem with the loop test you
>>>>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
>>>>> Out of curiosity, what are the numbers you get with your setup?
>>>>
>>>> Hi Maxime,
>>>>
>>>> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
>>>> My results below:
>>>>  1. indirect=1: 5.26 Mpps
>>>>  2. indirect=0: 6.54 Mpps
>>>>
>>>> It's about 24% drop.
>>> OK, so on my side, same setup on Haswell:
>>> 1. indirect=1: 7.44 Mpps
>>> 2. indirect=0: 8.18 Mpps
>>>
>>> Still 10% drop in my case with mrg=on.
>>>
>>> The strange thing with both of our figures is that this is below from
>>> what I obtain with my SandyBridge machine. The SB cpu freq is 4% higher,
>>> but that doesn't explain the gap between the measurements.
>>>
>>> I'm continuing the investigations on my side.
>>> Maybe we should fix a deadline, and decide do disable indirect in
>>> Virtio PMD if root cause not identified/fixed at some point?
>>>
>>> Yuanhan, what do you think?
>>
>> I have done some measurements using perf, and know understand better
>> what happens.
>>
>> With indirect descriptors, I can see a cache miss when fetching the
>> descriptors in the indirect table. Actually, this is expected, so
>> we prefetch the first desc as soon as possible, but still not soon
>> enough to make it transparent.
>> In direct descriptors case, the desc in the virtqueue seems to be
>> remain in the cache from its previous use, so we have a hit.
>>
>> That said, in realistic use-case, I think we should not have a hit,
>> even with direct descriptors.
>> Indeed, the test case use testpmd on guest side with the forwarding set
>> in IO mode. It means the packet content is never accessed by the guest.
>>
>> In my experiments, I am used to set the "macswap" forwarding mode, which
>> swaps src and dest MAC addresses in the packet. I find it more
>> realistic, because I don't see the point in sending packets to the guest
>> if it is not accessed (not even its header).
>>
>> I tried again the test case, this time with setting the forwarding mode
>> to macswap in the guest. This time, I get same performance with both
>> direct and indirect (indirect even a little better with a small
>> optimization, consisting in prefetching the 2 first descs
>> systematically as we know there are contiguous).
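The small optimization mentioned above boils down to prefetching the two
first entries of the indirect table before walking the chain; roughly
(sketch only, assuming descs points to the already-resolved indirect
table, with rte_prefetch0() coming from rte_prefetch.h):

	/* The indirect descriptors are contiguous in the table, so
	 * prefetching the first two of them hides part of the cache miss
	 * seen when the table is fetched.
	 */
	rte_prefetch0(&descs[0]);
	rte_prefetch0(&descs[1]);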
>
>
> Hi Maxime,
>
> I did a little more macswap test and found out more stuff here:
Thanks for doing more tests.

>
>  1. I did loopback test on another HSW machine with the same H/W,
>     and indirect_desc on and off seems have close perf
>
>  2. So I checked the gcc version:
>
>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
>
>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)

On my side, I tested with RHEL7.3:
  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)

It certainly contains some backports from newer GCC versions.

>
>     On previous one indirect_desc has 20% drop
>
>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
>     expected I got the same perf as on Ubuntu, and the perf gap
>     disappeared, so gcc is definitely one factor here
>
>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
>     perf gap comes back again and the same with the Fedora binary
>     results, indirect_desc causes about 20% drop

Let me know if I understand correctly:
Loopback test with macswap:
  - gcc version 6.2.1 : 20% perf drop
  - gcc version 5.4.0 : No drop

PVP test with macswap:
  - gcc version 6.2.1 : 20% perf drop
  - gcc version 5.4.0 : 20% perf drop

>
> So in all, could you try PVP traffic on HSW to see how it works?
Sadly, the HSW machine I borrowed does not have another device connected
back to back on its 10G port. I can only test PVP with SNB machines
currently.

>
>
>>
>> Do you agree we should assume that the packet (header or/and buf) will
>> always be accessed by the guest application?
>> If so, do you agree we should keep indirect descs enabled, and maybe
>> update the test cases?
>
>
> I agree with you that mac/macswap test is more realistic and makes
> more sense for real applications.

Thanks,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04  7:57                                   ` Maxime Coquelin
@ 2016-11-04  7:59                                     ` Maxime Coquelin
  2016-11-04 10:43                                       ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04  7:59 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



On 11/04/2016 08:57 AM, Maxime Coquelin wrote:
> Hi Zhihong,
>
> On 11/04/2016 08:20 AM, Wang, Zhihong wrote:
>>
>>
>>> -----Original Message-----
>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>> Sent: Thursday, November 3, 2016 4:11 PM
>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>>> <yuanhan.liu@linux.intel.com>
>>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>>> vkaplans@redhat.com; mst@redhat.com
>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>> support
>>> to the TX path
>>>
>>>
>>>
>>> On 11/02/2016 11:51 AM, Maxime Coquelin wrote:
>>>>
>>>>
>>>> On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>>>>>> Sent: Friday, October 28, 2016 3:42 PM
>>>>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>>>>>> <yuanhan.liu@linux.intel.com>
>>>>>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>>> dev@dpdk.org;
>>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>>> support
>>>>>> to the TX path
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
>>>>>>>>> Sent: Thursday, October 27, 2016 6:46 PM
>>>>>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>>>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
>>>>>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
>>>>>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
>>>>>> dev@dpdk.org;
>>>>>>>>> vkaplans@redhat.com; mst@redhat.com
>>>>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
>>>>>> support
>>>>>>>>> to the TX path
>>>>>>>>>
>>>>>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
>>>>>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime Coquelin
>>>>>> wrote:
>>>>>>>>>>>>>>> Hi Zhihong,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
>>>>>>>>>>>>>>>>> Hi Maxime,
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Seems indirect desc feature is causing serious
>>> performance
>>>>>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for both
>>>>>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector
>>> version),
>>>>>>>>>>>>>>>>> both iofwd and macfwd.
>>>>>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly on
>>> an
>>>>>> Ivy
>>>>>>>>> Bridge
>>>>>>>>>>>>>>> platform, and didn't faced such a drop.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I was actually wondering that may be the cause. I tested it
>>>>>>>>>>>>> with
>>>>>>>>>>>>> my IvyBridge server as well, I saw no drop.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Maybe you should find a similar platform (Haswell) and have a
>>>>>>>>>>>>> try?
>>>>>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly in
>>>>>>>>>>> guest
>>>>>> to
>>>>>>>>>>> see if issue is reproducible like this.
>>>>>>>>>
>>>>>>>>> I have no Haswell box, otherwise I could do a quick test for you.
>>>>>>>>> IIRC,
>>>>>>>>> he tried to disable the indirect_desc feature, then the
>>>>>>>>> performance
>>>>>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
>>>>>>>>>
>>>>>>>>>>> I will be easier for me to find an Haswell machine if it has not
>>>>>>>>>>> to be
>>>>>>>>>>> connected back to back to and HW/SW packet generator.
>>>>>>> In fact simple loopback test will also do, without pktgen.
>>>>>>>
>>>>>>> Start testpmd in both host and guest, and do "start" in one
>>>>>>> and "start tx_first 32" in another.
>>>>>>>
>>>>>>> Perf drop is about 24% in my test.
>>>>>>>
>>>>>>
>>>>>> Thanks, I never tried this test.
>>>>>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU
>>>>>> E5-2699 v3
>>>>>> @ 2.30GHz), and can reproduce the problem with the loop test you
>>>>>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
>>>>>> Out of curiosity, what are the numbers you get with your setup?
>>>>>
>>>>> Hi Maxime,
>>>>>
>>>>> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
>>>>> My results below:
>>>>>  1. indirect=1: 5.26 Mpps
>>>>>  2. indirect=0: 6.54 Mpps
>>>>>
>>>>> It's about 24% drop.
>>>> OK, so on my side, same setup on Haswell:
>>>> 1. indirect=1: 7.44 Mpps
>>>> 2. indirect=0: 8.18 Mpps
>>>>
>>>> Still 10% drop in my case with mrg=on.
>>>>
>>>> The strange thing with both of our figures is that this is below from
>>>> what I obtain with my SandyBridge machine. The SB cpu freq is 4%
>>>> higher,
>>>> but that doesn't explain the gap between the measurements.
>>>>
>>>> I'm continuing the investigations on my side.
>>>> Maybe we should fix a deadline, and decide do disable indirect in
>>>> Virtio PMD if root cause not identified/fixed at some point?
>>>>
>>>> Yuanhan, what do you think?
>>>
>>> I have done some measurements using perf, and know understand better
>>> what happens.
>>>
>>> With indirect descriptors, I can see a cache miss when fetching the
>>> descriptors in the indirect table. Actually, this is expected, so
>>> we prefetch the first desc as soon as possible, but still not soon
>>> enough to make it transparent.
>>> In direct descriptors case, the desc in the virtqueue seems to be
>>> remain in the cache from its previous use, so we have a hit.
>>>
>>> That said, in realistic use-case, I think we should not have a hit,
>>> even with direct descriptors.
>>> Indeed, the test case use testpmd on guest side with the forwarding set
>>> in IO mode. It means the packet content is never accessed by the guest.
>>>
>>> In my experiments, I am used to set the "macswap" forwarding mode, which
>>> swaps src and dest MAC addresses in the packet. I find it more
>>> realistic, because I don't see the point in sending packets to the guest
>>> if it is not accessed (not even its header).
>>>
>>> I tried again the test case, this time with setting the forwarding mode
>>> to macswap in the guest. This time, I get same performance with both
>>> direct and indirect (indirect even a little better with a small
>>> optimization, consisting in prefetching the 2 first descs
>>> systematically as we know there are contiguous).
>>
>>
>> Hi Maxime,
>>
>> I did a little more macswap test and found out more stuff here:
> Thanks for doing more tests.
>
>>
>>  1. I did loopback test on another HSW machine with the same H/W,
>>     and indirect_desc on and off seems have close perf
>>
>>  2. So I checked the gcc version:
>>
>>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
>>
>>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
>
> On my side, I tested with RHEL7.3:
>  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
>
> It certainly contains some backports from newer GCC versions.
>
>>
>>     On previous one indirect_desc has 20% drop
>>
>>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
>>     expected I got the same perf as on Ubuntu, and the perf gap
>>     disappeared, so gcc is definitely one factor here
>>
>>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
>>     perf gap comes back again and the same with the Fedora binary
>>     results, indirect_desc causes about 20% drop
>
> Let me know if I understand correctly:
> Loopback test with macswap:
>  - gcc version 6.2.1 : 20% perf drop
>  - gcc version 5.4.0 : No drop
>
> PVP test with macswap:
>  - gcc version 6.2.1 : 20% perf drop
>  - gcc version 5.4.0 : 20% perf drop

I forgot to ask: did you recompile only the host testpmd, or both host
and guest testpmd in your test?

>
>>
>> So in all, could you try PVP traffic on HSW to see how it works?
> Sadly, the HSW machine I borrowed does not have other device connected
> back to back on its 10G port. I can only test PVP with SNB machines
> currently.
>
>>
>>
>>>
>>> Do you agree we should assume that the packet (header or/and buf) will
>>> always be accessed by the guest application?
>>> If so, do you agree we should keep indirect descs enabled, and maybe
>>> update the test cases?
>>
>>
>> I agree with you that mac/macswap test is more realistic and makes
>> more sense for real applications.
>
> Thanks,
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04  7:59                                     ` Maxime Coquelin
@ 2016-11-04 10:43                                       ` Wang, Zhihong
  2016-11-04 11:22                                         ` Maxime Coquelin
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-11-04 10:43 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Friday, November 4, 2016 4:00 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 11/04/2016 08:57 AM, Maxime Coquelin wrote:
> > Hi Zhihong,
> >
> > On 11/04/2016 08:20 AM, Wang, Zhihong wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>> Sent: Thursday, November 3, 2016 4:11 PM
> >>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> >>> <yuanhan.liu@linux.intel.com>
> >>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> >>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org;
> >>> vkaplans@redhat.com; mst@redhat.com
> >>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> >>> support
> >>> to the TX path
> >>>
> >>>
> >>>
> >>> On 11/02/2016 11:51 AM, Maxime Coquelin wrote:
> >>>>
> >>>>
> >>>> On 10/31/2016 11:01 AM, Wang, Zhihong wrote:
> >>>>>
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >>>>>> Sent: Friday, October 28, 2016 3:42 PM
> >>>>>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> >>>>>> <yuanhan.liu@linux.intel.com>
> >>>>>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> >>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> >>> dev@dpdk.org;
> >>>>>> vkaplans@redhat.com; mst@redhat.com
> >>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> >>>>>> support
> >>>>>> to the TX path
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>> On 10/28/2016 02:49 AM, Wang, Zhihong wrote:
> >>>>>>>
> >>>>>>>>> -----Original Message-----
> >>>>>>>>> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> >>>>>>>>> Sent: Thursday, October 27, 2016 6:46 PM
> >>>>>>>>> To: Maxime Coquelin <maxime.coquelin@redhat.com>
> >>>>>>>>> Cc: Wang, Zhihong <zhihong.wang@intel.com>;
> >>>>>>>>> stephen@networkplumber.org; Pierre Pfister (ppfister)
> >>>>>>>>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> >>>>>> dev@dpdk.org;
> >>>>>>>>> vkaplans@redhat.com; mst@redhat.com
> >>>>>>>>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect
> descriptors
> >>>>>> support
> >>>>>>>>> to the TX path
> >>>>>>>>>
> >>>>>>>>> On Thu, Oct 27, 2016 at 12:35:11PM +0200, Maxime Coquelin
> wrote:
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> On 10/27/2016 12:33 PM, Yuanhan Liu wrote:
> >>>>>>>>>>>>> On Thu, Oct 27, 2016 at 11:10:34AM +0200, Maxime
> Coquelin
> >>>>>> wrote:
> >>>>>>>>>>>>>>> Hi Zhihong,
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> On 10/27/2016 11:00 AM, Wang, Zhihong wrote:
> >>>>>>>>>>>>>>>>> Hi Maxime,
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> Seems indirect desc feature is causing serious
> >>> performance
> >>>>>>>>>>>>>>>>> degradation on Haswell platform, about 20% drop for
> both
> >>>>>>>>>>>>>>>>> mrg=on and mrg=off (--txqflags=0xf00, non-vector
> >>> version),
> >>>>>>>>>>>>>>>>> both iofwd and macfwd.
> >>>>>>>>>>>>>>> I tested PVP (with macswap on guest) and Txonly/Rxonly
> on
> >>> an
> >>>>>> Ivy
> >>>>>>>>> Bridge
> >>>>>>>>>>>>>>> platform, and didn't faced such a drop.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I was actually wondering that may be the cause. I tested it
> >>>>>>>>>>>>> with
> >>>>>>>>>>>>> my IvyBridge server as well, I saw no drop.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Maybe you should find a similar platform (Haswell) and
> have a
> >>>>>>>>>>>>> try?
> >>>>>>>>>>> Yes, that's why I asked Zhihong whether he could test Txonly
> in
> >>>>>>>>>>> guest
> >>>>>> to
> >>>>>>>>>>> see if issue is reproducible like this.
> >>>>>>>>>
> >>>>>>>>> I have no Haswell box, otherwise I could do a quick test for you.
> >>>>>>>>> IIRC,
> >>>>>>>>> he tried to disable the indirect_desc feature, then the
> >>>>>>>>> performance
> >>>>>>>>> recovered. So, it's likely the indirect_desc is the culprit here.
> >>>>>>>>>
> >>>>>>>>>>> I will be easier for me to find an Haswell machine if it has not
> >>>>>>>>>>> to be
> >>>>>>>>>>> connected back to back to and HW/SW packet generator.
> >>>>>>> In fact simple loopback test will also do, without pktgen.
> >>>>>>>
> >>>>>>> Start testpmd in both host and guest, and do "start" in one
> >>>>>>> and "start tx_first 32" in another.
> >>>>>>>
> >>>>>>> Perf drop is about 24% in my test.
> >>>>>>>
> >>>>>>
> >>>>>> Thanks, I never tried this test.
> >>>>>> I managed to find an Haswell platform (Intel(R) Xeon(R) CPU
> >>>>>> E5-2699 v3
> >>>>>> @ 2.30GHz), and can reproduce the problem with the loop test you
> >>>>>> mention. I see a performance drop about 10% (8.94Mpps/8.08Mpps).
> >>>>>> Out of curiosity, what are the numbers you get with your setup?
> >>>>>
> >>>>> Hi Maxime,
> >>>>>
> >>>>> Let's align our test case to RC2, mrg=on, loopback, on Haswell.
> >>>>> My results below:
> >>>>>  1. indirect=1: 5.26 Mpps
> >>>>>  2. indirect=0: 6.54 Mpps
> >>>>>
> >>>>> It's about 24% drop.
> >>>> OK, so on my side, same setup on Haswell:
> >>>> 1. indirect=1: 7.44 Mpps
> >>>> 2. indirect=0: 8.18 Mpps
> >>>>
> >>>> Still 10% drop in my case with mrg=on.
> >>>>
> >>>> The strange thing with both of our figures is that this is below from
> >>>> what I obtain with my SandyBridge machine. The SB cpu freq is 4%
> >>>> higher,
> >>>> but that doesn't explain the gap between the measurements.
> >>>>
> >>>> I'm continuing the investigations on my side.
> >>>> Maybe we should fix a deadline, and decide do disable indirect in
> >>>> Virtio PMD if root cause not identified/fixed at some point?
> >>>>
> >>>> Yuanhan, what do you think?
> >>>
> >>> I have done some measurements using perf, and know understand
> better
> >>> what happens.
> >>>
> >>> With indirect descriptors, I can see a cache miss when fetching the
> >>> descriptors in the indirect table. Actually, this is expected, so
> >>> we prefetch the first desc as soon as possible, but still not soon
> >>> enough to make it transparent.
> >>> In direct descriptors case, the desc in the virtqueue seems to be
> >>> remain in the cache from its previous use, so we have a hit.
> >>>
> >>> That said, in realistic use-case, I think we should not have a hit,
> >>> even with direct descriptors.
> >>> Indeed, the test case use testpmd on guest side with the forwarding set
> >>> in IO mode. It means the packet content is never accessed by the guest.
> >>>
> >>> In my experiments, I am used to set the "macswap" forwarding mode,
> which
> >>> swaps src and dest MAC addresses in the packet. I find it more
> >>> realistic, because I don't see the point in sending packets to the guest
> >>> if it is not accessed (not even its header).
> >>>
> >>> I tried again the test case, this time with setting the forwarding mode
> >>> to macswap in the guest. This time, I get same performance with both
> >>> direct and indirect (indirect even a little better with a small
> >>> optimization, consisting in prefetching the 2 first descs
> >>> systematically as we know there are contiguous).
> >>
> >>
> >> Hi Maxime,
> >>
> >> I did a little more macswap test and found out more stuff here:
> > Thanks for doing more tests.
> >
> >>
> >>  1. I did loopback test on another HSW machine with the same H/W,
> >>     and indirect_desc on and off seems have close perf
> >>
> >>  2. So I checked the gcc version:
> >>
> >>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
> >>
> >>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
> >
> > On my side, I tested with RHEL7.3:
> >  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
> >
> > It certainly contains some backports from newer GCC versions.
> >
> >>
> >>     On previous one indirect_desc has 20% drop
> >>
> >>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
> >>     expected I got the same perf as on Ubuntu, and the perf gap
> >>     disappeared, so gcc is definitely one factor here
> >>
> >>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
> >>     perf gap comes back again and the same with the Fedora binary
> >>     results, indirect_desc causes about 20% drop
> >
> > Let me know if I understand correctly:

Yes, and it's hard to break it down further at this time.

Also, we may need to check whether it's caused by a certain NIC
model. Unfortunately I don't have the right setup right now.

> > Loopback test with macswap:
> >  - gcc version 6.2.1 : 20% perf drop
> >  - gcc version 5.4.0 : No drop
> >
> > PVP test with macswap:
> >  - gcc version 6.2.1 : 20% perf drop
> >  - gcc version 5.4.0 : 20% perf drop
> 
> I forgot to ask, did you recompile only host, or both host and guest
> testmpd's in your test?

Both.

> 
> >
> >>
> >> So in all, could you try PVP traffic on HSW to see how it works?
> > Sadly, the HSW machine I borrowed does not have other device connected
> > back to back on its 10G port. I can only test PVP with SNB machines
> > currently.
> >
> >>
> >>
> >>>
> >>> Do you agree we should assume that the packet (header or/and buf) will
> >>> always be accessed by the guest application?
> >>> If so, do you agree we should keep indirect descs enabled, and maybe
> >>> update the test cases?
> >>
> >>
> >> I agree with you that mac/macswap test is more realistic and makes
> >> more sense for real applications.
> >
> > Thanks,
> > Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 10:43                                       ` Wang, Zhihong
@ 2016-11-04 11:22                                         ` Maxime Coquelin
  2016-11-04 11:36                                           ` Yuanhan Liu
  2016-11-04 12:30                                           ` Wang, Zhihong
  0 siblings, 2 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04 11:22 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



>>>> Hi Maxime,
>>>>
>>>> I did a little more macswap test and found out more stuff here:
>>> Thanks for doing more tests.
>>>
>>>>
>>>>  1. I did loopback test on another HSW machine with the same H/W,
>>>>     and indirect_desc on and off seems have close perf
>>>>
>>>>  2. So I checked the gcc version:
>>>>
>>>>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
>>>>
>>>>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
>>>
>>> On my side, I tested with RHEL7.3:
>>>  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
>>>
>>> It certainly contains some backports from newer GCC versions.
>>>
>>>>
>>>>     On previous one indirect_desc has 20% drop
>>>>
>>>>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
>>>>     expected I got the same perf as on Ubuntu, and the perf gap
>>>>     disappeared, so gcc is definitely one factor here
>>>>
>>>>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
>>>>     perf gap comes back again and the same with the Fedora binary
>>>>     results, indirect_desc causes about 20% drop
>>>
>>> Let me know if I understand correctly:
>
> Yes, and it's hard to breakdown further at this time.
>
> Also we may need to check whether it's caused by certain NIC
> model. Unfortunately I don't have the right setup right now.
>
>>> Loopback test with macswap:
>>>  - gcc version 6.2.1 : 20% perf drop
>>>  - gcc version 5.4.0 : No drop
>>>
>>> PVP test with macswap:
>>>  - gcc version 6.2.1 : 20% perf drop
>>>  - gcc version 5.4.0 : 20% perf drop
>>
>> I forgot to ask, did you recompile only host, or both host and guest
>> testmpd's in your test?

> Both.

I recompiled testpmd on a Fedora 24 machine using GCC6:
gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
Testing loopback with macswap on my Haswell RHEL7.3 machine gives me the 
following results:
  - indirect on: 7.75Mpps
  - indirect off: 7.35Mpps

Surprisingly, I get better results with indirect on my setup (I
reproduced the tests multiple times).

Do you have a document explaining the tuning/config you apply to both 
the host and the guest (isolation, HT, hugepage size, ...) in your
setup?

Regards,
Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 11:22                                         ` Maxime Coquelin
@ 2016-11-04 11:36                                           ` Yuanhan Liu
  2016-11-04 11:39                                             ` Maxime Coquelin
  2016-11-04 12:30                                           ` Wang, Zhihong
  1 sibling, 1 reply; 52+ messages in thread
From: Yuanhan Liu @ 2016-11-04 11:36 UTC (permalink / raw)
  To: Maxime Coquelin
  Cc: Wang, Zhihong, stephen, Pierre Pfister (ppfister),
	Xie, Huawei, dev, vkaplans, mst

On Fri, Nov 04, 2016 at 12:22:47PM +0100, Maxime Coquelin wrote:
> 
> 
> >>>>Hi Maxime,
> >>>>
> >>>>I did a little more macswap test and found out more stuff here:
> >>>Thanks for doing more tests.
> >>>
> >>>>
> >>>> 1. I did loopback test on another HSW machine with the same H/W,
> >>>>    and indirect_desc on and off seems have close perf
> >>>>
> >>>> 2. So I checked the gcc version:
> >>>>
> >>>>     *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
> >>>>
> >>>>     *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
> >>>
> >>>On my side, I tested with RHEL7.3:
> >>> - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
> >>>
> >>>It certainly contains some backports from newer GCC versions.
> >>>
> >>>>
> >>>>    On previous one indirect_desc has 20% drop
> >>>>
> >>>> 3. Then I compiled binary on Ubuntu and scp to Fedora, and as
> >>>>    expected I got the same perf as on Ubuntu, and the perf gap
> >>>>    disappeared, so gcc is definitely one factor here
> >>>>
> >>>> 4. Then I use the Ubuntu binary on Fedora for PVP test, then the
> >>>>    perf gap comes back again and the same with the Fedora binary
> >>>>    results, indirect_desc causes about 20% drop
> >>>
> >>>Let me know if I understand correctly:
> >
> >Yes, and it's hard to breakdown further at this time.
> >
> >Also we may need to check whether it's caused by certain NIC
> >model. Unfortunately I don't have the right setup right now.
> >
> >>>Loopback test with macswap:
> >>> - gcc version 6.2.1 : 20% perf drop
> >>> - gcc version 5.4.0 : No drop
> >>>
> >>>PVP test with macswap:
> >>> - gcc version 6.2.1 : 20% perf drop
> >>> - gcc version 5.4.0 : 20% perf drop
> >>
> >>I forgot to ask, did you recompile only host, or both host and guest
> >>testmpd's in your test?
> 
> >Both.
> 
> I recompiled testpmd on a Fedora 24 machine using GCC6:

Have you built host DPDK with gcc6 as well?

	--yliu

> gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
> Testing loopback with macswap on my Haswell RHEL7.3 machine gives me the
> following results:
>  - indirect on: 7.75Mpps
>  - indirect off: 7.35Mpps
> 
> Surprisingly, I get better results with indirect on my setup (I
> reproduced the tests multiple times).
> 
> Do you have a document explaining the tuning/config you apply to both the
> host and the guest (isolation, HT, hugepage size, ...) in your
> setup?
> 
> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 11:36                                           ` Yuanhan Liu
@ 2016-11-04 11:39                                             ` Maxime Coquelin
  0 siblings, 0 replies; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04 11:39 UTC (permalink / raw)
  To: Yuanhan Liu
  Cc: Wang, Zhihong, stephen, Pierre Pfister (ppfister),
	Xie, Huawei, dev, vkaplans, mst



On 11/04/2016 12:36 PM, Yuanhan Liu wrote:
> On Fri, Nov 04, 2016 at 12:22:47PM +0100, Maxime Coquelin wrote:
>>
>>
>>>>>> Hi Maxime,
>>>>>>
>>>>>> I did a little more macswap test and found out more stuff here:
>>>>> Thanks for doing more tests.
>>>>>
>>>>>>
>>>>>> 1. I did loopback test on another HSW machine with the same H/W,
>>>>>>    and indirect_desc on and off seems have close perf
>>>>>>
>>>>>> 2. So I checked the gcc version:
>>>>>>
>>>>>>     *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
>>>>>>
>>>>>>     *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
>>>>>
>>>>> On my side, I tested with RHEL7.3:
>>>>> - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
>>>>>
>>>>> It certainly contains some backports from newer GCC versions.
>>>>>
>>>>>>
>>>>>>    On previous one indirect_desc has 20% drop
>>>>>>
>>>>>> 3. Then I compiled binary on Ubuntu and scp to Fedora, and as
>>>>>>    expected I got the same perf as on Ubuntu, and the perf gap
>>>>>>    disappeared, so gcc is definitely one factor here
>>>>>>
>>>>>> 4. Then I use the Ubuntu binary on Fedora for PVP test, then the
>>>>>>    perf gap comes back again and the same with the Fedora binary
>>>>>>    results, indirect_desc causes about 20% drop
>>>>>
>>>>> Let me know if I understand correctly:
>>>
>>> Yes, and it's hard to breakdown further at this time.
>>>
>>> Also we may need to check whether it's caused by certain NIC
>>> model. Unfortunately I don't have the right setup right now.
>>>
>>>>> Loopback test with macswap:
>>>>> - gcc version 6.2.1 : 20% perf drop
>>>>> - gcc version 5.4.0 : No drop
>>>>>
>>>>> PVP test with macswap:
>>>>> - gcc version 6.2.1 : 20% perf drop
>>>>> - gcc version 5.4.0 : 20% perf drop
>>>>
>>>> I forgot to ask, did you recompile only host, or both host and guest
>>>> testmpd's in your test?
>>
>>> Both.
>>
>> I recompiled testpmd on a Fedora 24 machine using GCC6:
>
> Have you built host DPDK with gcc6 as well?

Yes, I use the same build based on GCC6 on both sides.

>
> 	--yliu
>
>> gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
>> Testing loopback with macswap on my Haswell RHEL7.3 machine gives me the
>> following results:
>>  - indirect on: 7.75Mpps
>>  - indirect off: 7.35Mpps
>>
>> Surprisingly, I get better results with indirect on my setup (I
>> reproduced the tests multiple times).
>>
>> Do you have a document explaining the tuning/config you apply to both the
>> host and the guest (isolation, HT, hugepage size, ...) in your
>> setup?
>>
>> Regards,
>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 11:22                                         ` Maxime Coquelin
  2016-11-04 11:36                                           ` Yuanhan Liu
@ 2016-11-04 12:30                                           ` Wang, Zhihong
  2016-11-04 12:54                                             ` Maxime Coquelin
  1 sibling, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-11-04 12:30 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Friday, November 4, 2016 7:23 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to the
> TX path
> 
> 
> 
> >>>> Hi Maxime,
> >>>>
> >>>> I did a little more macswap test and found out more stuff here:
> >>> Thanks for doing more tests.
> >>>
> >>>>
> >>>>  1. I did loopback test on another HSW machine with the same H/W,
> >>>>     and indirect_desc on and off seems have close perf
> >>>>
> >>>>  2. So I checked the gcc version:
> >>>>
> >>>>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
> >>>>
> >>>>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
> >>>
> >>> On my side, I tested with RHEL7.3:
> >>>  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
> >>>
> >>> It certainly contains some backports from newer GCC versions.
> >>>
> >>>>
> >>>>     On previous one indirect_desc has 20% drop
> >>>>
> >>>>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
> >>>>     expected I got the same perf as on Ubuntu, and the perf gap
> >>>>     disappeared, so gcc is definitely one factor here
> >>>>
> >>>>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
> >>>>     perf gap comes back again and the same with the Fedora binary
> >>>>     results, indirect_desc causes about 20% drop
> >>>
> >>> Let me know if I understand correctly:
> >
> > Yes, and it's hard to breakdown further at this time.
> >
> > Also we may need to check whether it's caused by certain NIC
> > model. Unfortunately I don't have the right setup right now.
> >
> >>> Loopback test with macswap:
> >>>  - gcc version 6.2.1 : 20% perf drop
> >>>  - gcc version 5.4.0 : No drop
> >>>
> >>> PVP test with macswap:
> >>>  - gcc version 6.2.1 : 20% perf drop
> >>>  - gcc version 5.4.0 : 20% perf drop
> >>
> >> I forgot to ask, did you recompile only host, or both host and guest
> >> testmpd's in your test?
> 
> > Both.
> 
> I recompiled testpmd on a Fedora 24 machine using GCC6:
> gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
> Testing loopback with macswap on my Haswell RHEL7.3 machine gives me the
> following results:
>   - indirect on: 7.75Mpps
>   - indirect off: 7.35Mpps
> 
> Surprisingly, I get better results with indirect on my setup (I
> reproduced the tests multiple times).
> 
> Do you have a document explaining the tuning/config you apply to both
> the host and the guest (isolation, HT, hugepage size, ...) in your
> setup?


The setup where it goes wrong:
 1. Xeon E5-2699, HT on, turbo off, 1GB hugepages for both host and guest
 2. Fortville 40G
 3. Fedora 24, kernel 4.7.5-200.fc24.x86_64
 4. gcc version 6.2.1
 5. 16.11 RC2 for both host and guest
 6. PVP, testpmd macswap for both host and guest

BTW, I do see indirect_desc give slightly better performance for loopback
in tests on other platforms, but I don't know how PVP performs yet.


> 
> Regards,
> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 12:30                                           ` Wang, Zhihong
@ 2016-11-04 12:54                                             ` Maxime Coquelin
  2016-11-04 13:09                                               ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Maxime Coquelin @ 2016-11-04 12:54 UTC (permalink / raw)
  To: Wang, Zhihong, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



On 11/04/2016 01:30 PM, Wang, Zhihong wrote:
>
>
>> -----Original Message-----
>> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
>> Sent: Friday, November 4, 2016 7:23 PM
>> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
>> <yuanhan.liu@linux.intel.com>
>> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
>> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
>> vkaplans@redhat.com; mst@redhat.com
>> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support to the
>> TX path
>>
>>
>>
>>>>>> Hi Maxime,
>>>>>>
>>>>>> I did a little more macswap test and found out more stuff here:
>>>>> Thanks for doing more tests.
>>>>>
>>>>>>
>>>>>>  1. I did loopback test on another HSW machine with the same H/W,
>>>>>>     and indirect_desc on and off seems have close perf
>>>>>>
>>>>>>  2. So I checked the gcc version:
>>>>>>
>>>>>>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
>>>>>>
>>>>>>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
>>>>>
>>>>> On my side, I tested with RHEL7.3:
>>>>>  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
>>>>>
>>>>> It certainly contains some backports from newer GCC versions.
>>>>>
>>>>>>
>>>>>>     On previous one indirect_desc has 20% drop
>>>>>>
>>>>>>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
>>>>>>     expected I got the same perf as on Ubuntu, and the perf gap
>>>>>>     disappeared, so gcc is definitely one factor here
>>>>>>
>>>>>>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
>>>>>>     perf gap comes back again and the same with the Fedora binary
>>>>>>     results, indirect_desc causes about 20% drop
>>>>>
>>>>> Let me know if I understand correctly:
>>>
>>> Yes, and it's hard to breakdown further at this time.
>>>
>>> Also we may need to check whether it's caused by certain NIC
>>> model. Unfortunately I don't have the right setup right now.
>>>
>>>>> Loopback test with macswap:
>>>>>  - gcc version 6.2.1 : 20% perf drop
>>>>>  - gcc version 5.4.0 : No drop
>>>>>
>>>>> PVP test with macswap:
>>>>>  - gcc version 6.2.1 : 20% perf drop
>>>>>  - gcc version 5.4.0 : 20% perf drop
>>>>
>>>> I forgot to ask, did you recompile only host, or both host and guest
>>>> testmpd's in your test?
>>
>>> Both.
>>
>> I recompiled testpmd on a Fedora 24 machine using GCC6:
>> gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
>> Testing loopback with macswap on my Haswell RHEL7.3 machine gives me the
>> following results:
>>   - indirect on: 7.75Mpps
>>   - indirect off: 7.35Mpps
>>
>> Surprisingly, I get better results with indirect on my setup (I
>> reproduced the tests multiple times).
>>
>> Do you have a document explaining the tuning/config you apply to both
>> the host and the guest (isolation, HT, hugepage size, ...) in your
>> setup?
>
>
> The setup where it goes wrong:
>  1. Xeon E5-2699, HT on, turbo off, 1GB hugepage for both host and guest
On the Haswell machine (on which I don't have BIOS access), HT is on,
but I unplug the siblings at runtime.
I also have 1G pages on both sides, and I isolate the cores used by both
testpmd and the vCPUs.

>  2. Fortville 40G
>  3. Fedora 4.7.5-200.fc24.x86_64
>  4. gcc version 6.2.1
>  5. 16.11 RC2 for both host and guest
>  6. PVP, testpmd macswap for both host and guest
>
> BTW, I do see indirect_desc gives slightly better performance for loopback
> in tests on other platforms, but don't know how PVP performs yet.
Interesting; are the other platforms also Haswell/Broadwell?

For PVP benchmarks, are your figures with 0% pkt loss?

Thanks,
Maxime

>
>
>>
>> Regards,
>> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 12:54                                             ` Maxime Coquelin
@ 2016-11-04 13:09                                               ` Wang, Zhihong
  2016-11-08 10:51                                                 ` Wang, Zhihong
  0 siblings, 1 reply; 52+ messages in thread
From: Wang, Zhihong @ 2016-11-04 13:09 UTC (permalink / raw)
  To: Maxime Coquelin, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst



> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Friday, November 4, 2016 8:54 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> <yuanhan.liu@linux.intel.com>
> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>; dev@dpdk.org;
> vkaplans@redhat.com; mst@redhat.com
> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors support
> to the TX path
> 
> 
> 
> On 11/04/2016 01:30 PM, Wang, Zhihong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >> Sent: Friday, November 4, 2016 7:23 PM
> >> To: Wang, Zhihong <zhihong.wang@intel.com>; Yuanhan Liu
> >> <yuanhan.liu@linux.intel.com>
> >> Cc: stephen@networkplumber.org; Pierre Pfister (ppfister)
> >> <ppfister@cisco.com>; Xie, Huawei <huawei.xie@intel.com>;
> dev@dpdk.org;
> >> vkaplans@redhat.com; mst@redhat.com
> >> Subject: Re: [dpdk-dev] [PATCH v4] vhost: Add indirect descriptors
> support to the
> >> TX path
> >>
> >>
> >>
> >>>>>> Hi Maxime,
> >>>>>>
> >>>>>> I did a little more macswap test and found out more stuff here:
> >>>>> Thanks for doing more tests.
> >>>>>
> >>>>>>
> >>>>>>  1. I did loopback test on another HSW machine with the same H/W,
> >>>>>>     and indirect_desc on and off seems have close perf
> >>>>>>
> >>>>>>  2. So I checked the gcc version:
> >>>>>>
> >>>>>>      *  Previous: gcc version 6.2.1 20160916 (Fedora 24)
> >>>>>>
> >>>>>>      *  New: gcc version 5.4.0 20160609 (Ubuntu 16.04.1 LTS)
> >>>>>
> >>>>> On my side, I tested with RHEL7.3:
> >>>>>  - gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-11)
> >>>>>
> >>>>> It certainly contains some backports from newer GCC versions.
> >>>>>
> >>>>>>
> >>>>>>     On previous one indirect_desc has 20% drop
> >>>>>>
> >>>>>>  3. Then I compiled binary on Ubuntu and scp to Fedora, and as
> >>>>>>     expected I got the same perf as on Ubuntu, and the perf gap
> >>>>>>     disappeared, so gcc is definitely one factor here
> >>>>>>
> >>>>>>  4. Then I use the Ubuntu binary on Fedora for PVP test, then the
> >>>>>>     perf gap comes back again and the same with the Fedora binary
> >>>>>>     results, indirect_desc causes about 20% drop
> >>>>>
> >>>>> Let me know if I understand correctly:
> >>>
> >>> Yes, and it's hard to breakdown further at this time.
> >>>
> >>> Also we may need to check whether it's caused by certain NIC
> >>> model. Unfortunately I don't have the right setup right now.
> >>>
> >>>>> Loopback test with macswap:
> >>>>>  - gcc version 6.2.1 : 20% perf drop
> >>>>>  - gcc version 5.4.0 : No drop
> >>>>>
> >>>>> PVP test with macswap:
> >>>>>  - gcc version 6.2.1 : 20% perf drop
> >>>>>  - gcc version 5.4.0 : 20% perf drop
> >>>>
> >>>> I forgot to ask, did you recompile only host, or both host and guest
> >>>> testmpd's in your test?
> >>
> >>> Both.
> >>
> >> I recompiled testpmd on a Fedora 24 machine using GCC6:
> >> gcc (GCC) 6.1.1 20160621 (Red Hat 6.1.1-3)
> >> Testing loopback with macswap on my Haswell RHEL7.3 machine gives me
> the
> >> following results:
> >>   - indirect on: 7.75Mpps
> >>   - indirect off: 7.35Mpps
> >>
> >> Surprisingly, I get better results with indirect on my setup (I
> >> reproduced the tests multiple times).
> >>
> >> Do you have a document explaining the tuning/config you apply to both
> >> the host and the guest (isolation, HT, hugepage size, ...) in your
> >> setup?
> >
> >
> > The setup where it goes wrong:
> >  1. Xeon E5-2699, HT on, turbo off, 1GB hugepage for both host and guest
> On the Haswell machine (on which I don't have BIOS access), HT is on,
> but I unplug siblings at runtime.
> I also have 1G pages on both sides, and I isolate the cores used by both
> testpmd and vCPUS.
> 
> >  2. Fortville 40G
> >  3. Fedora 4.7.5-200.fc24.x86_64
> >  4. gcc version 6.2.1
> >  5. 16.11 RC2 for both host and guest
> >  6. PVP, testpmd macswap for both host and guest
> >
> > BTW, I do see indirect_desc gives slightly better performance for loopback
> > in tests on other platforms, but don't know how PVP performs yet.
> Interesting, other platforms are also Haswell/Broadwell?

Yes, but with a different OS.

If you don't have the setup, I can do more detailed profiling for the
root cause next week, since my platform is the only one reporting the
drop right now.


> 
> For PVP benchmarks, are your figures with 0% pkt loss?

No, for testpmd perf analysis it's not necessary in my opinion.

I did try a low rate though; the result is the same.

> 
> Thanks,
> Maxime
> 
> >
> >
> >>
> >> Regards,
> >> Maxime

^ permalink raw reply	[flat|nested] 52+ messages in thread

* Re: [PATCH v4] vhost: Add indirect descriptors support to the TX path
  2016-11-04 13:09                                               ` Wang, Zhihong
@ 2016-11-08 10:51                                                 ` Wang, Zhihong
  0 siblings, 0 replies; 52+ messages in thread
From: Wang, Zhihong @ 2016-11-08 10:51 UTC (permalink / raw)
  To: Wang, Zhihong, Maxime Coquelin, Yuanhan Liu
  Cc: stephen, Pierre Pfister (ppfister), Xie, Huawei, dev, vkaplans, mst

> > > The setup where it goes wrong:
> > >  1. Xeon E5-2699, HT on, turbo off, 1GB hugepage for both host and guest
> > On the Haswell machine (on which I don't have BIOS access), HT is on,
> > but I unplug siblings at runtime.
> > I also have 1G pages on both sides, and I isolate the cores used by both
> > testpmd and vCPUS.
> >
> > >  2. Fortville 40G
> > >  3. Fedora 4.7.5-200.fc24.x86_64
> > >  4. gcc version 6.2.1
> > >  5. 16.11 RC2 for both host and guest
> > >  6. PVP, testpmd macswap for both host and guest
> > >
> > > BTW, I do see indirect_desc gives slightly better performance for
> loopback
> > > in tests on other platforms, but don't know how PVP performs yet.
> > Interesting, other platforms are also Haswell/Broadwell?
> 
> Yes, but with different OS.
> 
> If you don't have the setup I can do more detailed profiling for the
> root cause next week, since my platform is the only one right now that
> reporting the drop.
> 
> 

Hi Maxime,

I just did some profiling and I see a much higher L2 miss rate in the
vhost dequeue path with indirect_desc on my platform, which indicates
increased memory access contention.

I can keep digging further, but I might not be able to fix it in time
due to limited bandwidth.

Hope this helps.


Thanks
Zhihong

^ permalink raw reply	[flat|nested] 52+ messages in thread

end of thread, other threads:[~2016-11-08 10:51 UTC | newest]

Thread overview: 52+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-23  8:28 [PATCH v3] vhost: Add indirect descriptors support to the TX path Maxime Coquelin
2016-09-23 15:49 ` Michael S. Tsirkin
2016-09-23 18:02   ` Maxime Coquelin
2016-09-23 18:06     ` Michael S. Tsirkin
2016-09-23 18:16       ` Maxime Coquelin
2016-09-23 18:22         ` Michael S. Tsirkin
2016-09-23 20:24           ` Stephen Hemminger
2016-09-26  3:03             ` Yuanhan Liu
2016-09-26 12:25               ` Michael S. Tsirkin
2016-09-26 13:04                 ` Yuanhan Liu
2016-09-27  4:15 ` Yuanhan Liu
2016-09-27  7:25   ` Maxime Coquelin
2016-09-27  8:42 ` [PATCH v4] " Maxime Coquelin
2016-09-27 12:18   ` Yuanhan Liu
2016-10-14  7:24   ` Wang, Zhihong
2016-10-14  7:34     ` Wang, Zhihong
2016-10-14 15:50     ` Maxime Coquelin
2016-10-17 11:23       ` Maxime Coquelin
2016-10-17 13:21         ` Yuanhan Liu
2016-10-17 14:14           ` Maxime Coquelin
2016-10-27  9:00             ` Wang, Zhihong
2016-10-27  9:10               ` Maxime Coquelin
2016-10-27  9:55                 ` Maxime Coquelin
2016-10-27 10:19                   ` Wang, Zhihong
2016-10-28  7:32                     ` Pierre Pfister (ppfister)
2016-10-28  7:58                       ` Maxime Coquelin
2016-11-01  8:15                         ` Yuanhan Liu
2016-11-01  9:39                           ` Thomas Monjalon
2016-11-02  2:44                             ` Yuanhan Liu
2016-10-27 10:33                 ` Yuanhan Liu
2016-10-27 10:35                   ` Maxime Coquelin
2016-10-27 10:46                     ` Yuanhan Liu
2016-10-28  0:49                       ` Wang, Zhihong
2016-10-28  7:42                         ` Maxime Coquelin
2016-10-31 10:01                           ` Wang, Zhihong
2016-11-02 10:51                             ` Maxime Coquelin
2016-11-03  8:11                               ` Maxime Coquelin
2016-11-04  6:18                                 ` Xu, Qian Q
2016-11-04  7:41                                   ` Maxime Coquelin
2016-11-04  7:20                                 ` Wang, Zhihong
2016-11-04  7:57                                   ` Maxime Coquelin
2016-11-04  7:59                                     ` Maxime Coquelin
2016-11-04 10:43                                       ` Wang, Zhihong
2016-11-04 11:22                                         ` Maxime Coquelin
2016-11-04 11:36                                           ` Yuanhan Liu
2016-11-04 11:39                                             ` Maxime Coquelin
2016-11-04 12:30                                           ` Wang, Zhihong
2016-11-04 12:54                                             ` Maxime Coquelin
2016-11-04 13:09                                               ` Wang, Zhihong
2016-11-08 10:51                                                 ` Wang, Zhihong
2016-10-27 10:53                   ` Maxime Coquelin
2016-10-28  6:05                     ` Xu, Qian Q
