* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
@ 2010-10-06 13:34 Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
` (3 more replies)
0 siblings, 4 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> for degradation for 1 stream case:
I thought about possible RX/TX contention reasons, and I realized that
we get/put the mm counter all the time. So I wrote the following: I
haven't seen any performance gain from this in the single-queue case, but
maybe it will help multiqueue?
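In sketch form, the change the two patches below make (condensed,
illustrative code only, not the exact diff):

        #include <linux/kthread.h>
        #include <linux/mmu_context.h>
        #include "vhost.h"              /* struct vhost_dev */

        /* Take the mm reference once for the worker thread's lifetime
         * instead of around every handle_tx()/handle_rx() invocation. */
        static int vhost_worker_sketch(void *data)
        {
                struct vhost_dev *dev = data;

                use_mm(dev->mm);                /* once, at thread start */
                while (!kthread_should_stop()) {
                        /* ... dequeue and run queued vhost work ... */
                        schedule();
                }
                unuse_mm(dev->mm);              /* once, on thread exit */
                return 0;
        }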
Thanks,
Michael S. Tsirkin (2):
vhost: put mm after thread stop
vhost-net: batch use/unuse mm
drivers/vhost/net.c | 7 -------
drivers/vhost/vhost.c | 16 ++++++++++------
2 files changed, 10 insertions(+), 13 deletions(-)
--
1.7.3-rc1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH 1/2] vhost: put mm after thread stop
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
@ 2010-10-06 13:34 ` Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
` (2 subsequent siblings)
3 siblings, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
This makes it possible to batch use/unuse mm.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/vhost.c | 9 ++++-----
1 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 677d112..8b9d474 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -207,7 +207,7 @@ static int vhost_worker(void *data)
                 if (work) {
                         __set_current_state(TASK_RUNNING);
                         work->fn(work);
-                        if (n++) {
+                        if (dev->nvqs <= ++n) {
                                 __set_current_state(TASK_RUNNING);
                                 schedule();
                                 n = 0;
@@ -409,15 +409,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
         /* No one will access memory at this point */
         kfree(dev->memory);
         dev->memory = NULL;
-        if (dev->mm)
-                mmput(dev->mm);
-        dev->mm = NULL;
-
         WARN_ON(!list_empty(&dev->work_list));
         if (dev->worker) {
                 kthread_stop(dev->worker);
                 dev->worker = NULL;
         }
+        if (dev->mm)
+                mmput(dev->mm);
+        dev->mm = NULL;
 }
 
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
--
1.7.3-rc1
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH 2/2] vhost-net: batch use/unuse mm
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
@ 2010-10-06 13:34 ` Michael S. Tsirkin
2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
2010-10-11 7:21 ` Krishna Kumar2
3 siblings, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 13:34 UTC (permalink / raw)
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
Move use/unuse mm to vhost.c, which makes it possible to batch these
operations.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
drivers/vhost/net.c | 7 -------
drivers/vhost/vhost.c | 7 ++++++-
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 271678e..ff02ea4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -10,7 +10,6 @@
 #include <linux/eventfd.h>
 #include <linux/vhost.h>
 #include <linux/virtio_net.h>
-#include <linux/mmu_context.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -136,7 +135,6 @@ static void handle_tx(struct vhost_net *net)
                 return;
         }
 
-        use_mm(net->dev.mm);
         mutex_lock(&vq->mutex);
         vhost_disable_notify(vq);
 
@@ -197,7 +195,6 @@ static void handle_tx(struct vhost_net *net)
         }
 
         mutex_unlock(&vq->mutex);
-        unuse_mm(net->dev.mm);
 }
 
 static int peek_head_len(struct sock *sk)
@@ -302,7 +299,6 @@ static void handle_rx_big(struct vhost_net *net)
         if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
                 return;
 
-        use_mm(net->dev.mm);
         mutex_lock(&vq->mutex);
         vhost_disable_notify(vq);
         hdr_size = vq->vhost_hlen;
@@ -381,7 +377,6 @@ static void handle_rx_big(struct vhost_net *net)
         }
 
         mutex_unlock(&vq->mutex);
-        unuse_mm(net->dev.mm);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -413,7 +408,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
         if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
                 return;
 
-        use_mm(net->dev.mm);
         mutex_lock(&vq->mutex);
         vhost_disable_notify(vq);
         vhost_hlen = vq->vhost_hlen;
@@ -490,7 +484,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
         }
 
         mutex_unlock(&vq->mutex);
-        unuse_mm(net->dev.mm);
 }
 
 static void handle_rx(struct vhost_net *net)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8b9d474..c83d1c2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -15,6 +15,7 @@
 #include <linux/vhost.h>
 #include <linux/virtio_net.h>
 #include <linux/mm.h>
+#include <linux/mmu_context.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
@@ -179,6 +180,8 @@ static int vhost_worker(void *data)
         unsigned uninitialized_var(seq);
         int n = 0;
 
+        use_mm(dev->mm);
+
         for (;;) {
                 /* mb paired w/ kthread_stop */
                 set_current_state(TASK_INTERRUPTIBLE);
@@ -193,7 +196,7 @@ static int vhost_worker(void *data)
                 if (kthread_should_stop()) {
                         spin_unlock_irq(&dev->work_lock);
                         __set_current_state(TASK_RUNNING);
-                        return 0;
+                        break;
                 }
                 if (!list_empty(&dev->work_list)) {
                         work = list_first_entry(&dev->work_list,
@@ -218,6 +221,8 @@ static int vhost_worker(void *data)
                 }
         }
 
+        unuse_mm(dev->mm);
+        return 0;
 }
 
 /* Helper to allocate iovec buffers for all vqs. */
--
1.7.3-rc1
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
@ 2010-10-06 17:02 ` Krishna Kumar2
2010-10-11 7:21 ` Krishna Kumar2
3 siblings, 0 replies; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-06 17:02 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time. So I wrote the following: I
> haven't seen any performance gain from this in the single-queue case, but
> maybe it will help multiqueue?
Great! I am on vacation tomorrow, but will test with this patch
tomorrow night.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
` (2 preceding siblings ...)
2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
@ 2010-10-11 7:21 ` Krishna Kumar2
2010-10-12 17:09 ` Michael S. Tsirkin
3 siblings, 1 reply; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-11 7:21 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
> On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > for degradation for 1 stream case:
>
> I thought about possible RX/TX contention reasons, and I realized that
> we get/put the mm counter all the time. So I wrote the following: I
> haven't seen any performance gain from this in the single-queue case, but
> maybe it will help multiqueue?
Sorry for the delay, I was sick the last couple of days. The results
with your patch are (%'s over original code):
Code BW% CPU% RemoteCPU
MQ (#txq=16) 31.4% 38.42% 6.41%
MQ+MST (#txq=16) 28.3% 18.9% -10.77%
The patch helps CPU utilization but didn't help the single-stream
drop.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-11 7:21 ` Krishna Kumar2
@ 2010-10-12 17:09 ` Michael S. Tsirkin
2010-10-14 7:58 ` Krishna Kumar2
0 siblings, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-12 17:09 UTC (permalink / raw)
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
On Mon, Oct 11, 2010 at 12:51:27PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/06/2010 07:04:31 PM:
>
> > On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> > > For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> > > for degradation for 1 stream case:
> >
> > I thought about possible RX/TX contention reasons, and I realized that
> > we get/put the mm counter all the time. So I wrote the following: I
> > haven't seen any performance gain from this in the single-queue case, but
> > maybe it will help multiqueue?
>
> Sorry for the delay, I was sick the last couple of days. The results
> with your patch are (%'s over original code):
>
> Code BW% CPU% RemoteCPU
> MQ (#txq=16) 31.4% 38.42% 6.41%
> MQ+MST (#txq=16) 28.3% 18.9% -10.77%
>
> The patch helps CPU utilization but didn't help the single-stream
> drop.
>
> Thanks,
What other shared TX/RX locks are there? In your setup, is the same
macvtap socket structure used for RX and TX? If yes, this will create
cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
there might also be contention on the lock in the sk_sleep waitqueue.
Anything else?
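As a minimal sketch of the false sharing in question (the struct below
is illustrative only, not the actual struct sock layout):

        #include <linux/cache.h>
        #include <linux/types.h>

        /* When a TX counter and an RX counter share one cache line, TX
         * and RX threads on different CPUs bounce that line on every
         * update.  Padding each counter to its own line avoids the
         * false sharing, at some memory cost. */
        struct example_sock_counters {
                atomic_t wmem_alloc ____cacheline_aligned_in_smp; /* TX side */
                atomic_t rmem_alloc ____cacheline_aligned_in_smp; /* RX side */
        };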
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-12 17:09 ` Michael S. Tsirkin
@ 2010-10-14 7:58 ` Krishna Kumar2
2010-10-14 8:17 ` Michael S. Tsirkin
0 siblings, 1 reply; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 7:58 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:
> > Sorry for the delay, I was sick the last couple of days. The results
> > with your patch are (%'s over original code):
> >
> > Code BW% CPU% RemoteCPU
> > MQ (#txq=16) 31.4% 38.42% 6.41%
> > MQ+MST (#txq=16) 28.3% 18.9% -10.77%
> >
> > The patch helps CPU utilization but didn't help the single-stream
> > drop.
> >
> > Thanks,
>
> What other shared TX/RX locks are there? In your setup, is the same
> macvtap socket structure used for RX and TX? If yes, this will create
> cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> there might also be contention on the lock in the sk_sleep waitqueue.
> Anything else?
The patch does not introduce any locking (in either vhost or virtio-net).
The single-stream drop is due to different vhost threads handling the
RX/TX traffic.

I added a heuristic (fuzzy) to determine if more than one flow
is being used on the device, and if not, use vhost[0] for both
TX and RX (vhost_poll_queue figures this out before waking up
the suitable vhost thread). Testing shows that single-stream
performance is as good as the original code.
__________________________________________________________________________
#txqs = 2 (#vhosts = 3)
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
__________________________________________________________________________
1 77344 74973 (-3.06) 172 143 (-16.86) 358 324 (-9.49)
2 20924 21107 (.87) 107 103 (-3.73) 220 217 (-1.36)
4 21629 32911 (52.16) 214 391 (82.71) 446 616 (38.11)
8 21678 34359 (58.49) 428 845 (97.42) 892 1286 (44.17)
16 22046 34401 (56.04) 841 1677 (99.40) 1785 2585 (44.81)
24 22396 35117 (56.80) 1272 2447 (92.37) 2667 3863 (44.84)
32 22750 35158 (54.54) 1719 3233 (88.07) 3569 5143 (44.10)
40 23041 35345 (53.40) 2219 3970 (78.90) 4478 6410 (43.14)
48 23209 35219 (51.74) 2707 4685 (73.06) 5386 7684 (42.66)
64 23215 35209 (51.66) 3639 6195 (70.23) 7206 10218 (41.79)
80 23443 35179 (50.06) 4633 7625 (64.58) 9051 12745 (40.81)
96 24006 36108 (50.41) 5635 9096 (61.41) 10864 15283 (40.67)
128 23601 35744 (51.45) 7475 12104 (61.92) 14495 20405 (40.77)
__________________________________________________________________________
SUM: BW: (37.6) CPU: (69.0) RCPU: (41.2)
__________________________________________________________________________
#txqs = 8 (#vhosts = 5)
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
__________________________________________________________________________
1 77344 75341 (-2.58) 172 171 (-.58) 358 356 (-.55)
2 20924 26872 (28.42) 107 135 (26.16) 220 262 (19.09)
4 21629 33594 (55.31) 214 394 (84.11) 446 615 (37.89)
8 21678 39714 (83.19) 428 949 (121.72) 892 1358 (52.24)
16 22046 39879 (80.88) 841 1791 (112.96) 1785 2737 (53.33)
24 22396 38436 (71.61) 1272 2111 (65.95) 2667 3453 (29.47)
32 22750 38776 (70.44) 1719 3594 (109.07) 3569 5421 (51.89)
40 23041 38023 (65.02) 2219 4358 (96.39) 4478 6507 (45.31)
48 23209 33811 (45.68) 2707 4047 (49.50) 5386 6222 (15.52)
64 23215 30212 (30.13) 3639 3858 (6.01) 7206 5819 (-19.24)
80 23443 34497 (47.15) 4633 7214 (55.70) 9051 10776 (19.05)
96 24006 30990 (29.09) 5635 5731 (1.70) 10864 8799 (-19.00)
128 23601 29413 (24.62) 7475 7804 (4.40) 14495 11638 (-19.71)
__________________________________________________________________________
SUM: BW: (40.1) CPU: (35.7) RCPU: (4.1)
_______________________________________________________________________________
The SD numbers are also good (same table as before, but SD
instead of CPU):
__________________________________________________________________________
#txqs = 2 (#vhosts = 3)
# BW% SD1 SD2 (%) RSD1 RSD2 (%)
__________________________________________________________________________
1 -3.06 5 4 (-20.00) 21 19 (-9.52)
2 .87 6 6 (0) 27 27 (0)
4 52.16 26 32 (23.07) 108 103 (-4.62)
8 58.49 103 146 (41.74) 431 445 (3.24)
16 56.04 407 514 (26.28) 1729 1586 (-8.27)
24 56.80 934 1161 (24.30) 3916 3665 (-6.40)
32 54.54 1668 2160 (29.49) 6925 6872 (-.76)
40 53.40 2655 3317 (24.93) 10712 10707 (-.04)
48 51.74 3920 4486 (14.43) 15598 14715 (-5.66)
64 51.66 7096 8250 (16.26) 28099 27211 (-3.16)
80 50.06 11240 12586 (11.97) 43913 42070 (-4.19)
96 50.41 16342 16976 (3.87) 63017 57048 (-9.47)
128 51.45 29254 32069 (9.62) 113451 108113 (-4.70)
__________________________________________________________________________
SUM: BW: (37.6) SD: (10.9) RSD: (-5.3)
__________________________________________________________________________
#txqs = 8 (#vhosts = 5)
# BW% SD1 SD2 (%) RSD1 RSD2 (%)
__________________________________________________________________________
1 -2.58 5 5 (0) 21 21 (0)
2 28.42 6 6 (0) 27 25 (-7.40)
4 55.31 26 32 (23.07) 108 102 (-5.55)
8 83.19 103 128 (24.27) 431 368 (-14.61)
16 80.88 407 593 (45.70) 1729 1814 (4.91)
24 71.61 934 965 (3.31) 3916 3156 (-19.40)
32 70.44 1668 3232 (93.76) 6925 9752 (40.82)
40 65.02 2655 5134 (93.37) 10712 15340 (43.20)
48 45.68 3920 4592 (17.14) 15598 14122 (-9.46)
64 30.13 7096 3928 (-44.64) 28099 11880 (-57.72)
80 47.15 11240 18389 (63.60) 43913 55154 (25.59)
96 29.09 16342 21695 (32.75) 63017 66892 (6.14)
128 24.62 29254 36371 (24.32) 113451 109219 (-3.73)
__________________________________________________________________________
SUM: BW: (40.1) SD: (29.0) RSD: (0)
This approach works nicely for both single and multiple streams.
Does this look good?
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-14 7:58 ` Krishna Kumar2
@ 2010-10-14 8:17 ` Michael S. Tsirkin
2010-10-14 9:04 ` Krishna Kumar2
[not found] ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
0 siblings, 2 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-14 8:17 UTC (permalink / raw)
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
On Thu, Oct 14, 2010 at 01:28:58PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/12/2010 10:39:07 PM:
>
> > > Sorry for the delay, I was sick the last couple of days. The results
> > > with your patch are (%'s over original code):
> > >
> > > Code BW% CPU% RemoteCPU
> > > MQ (#txq=16) 31.4% 38.42% 6.41%
> > > MQ+MST (#txq=16) 28.3% 18.9% -10.77%
> > >
> > > The patch helps CPU utilization but didn't help the single-stream
> > > drop.
> > >
> > > Thanks,
> >
> > What other shared TX/RX locks are there? In your setup, is the same
> > macvtap socket structure used for RX and TX? If yes, this will create
> > cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> > there might also be contention on the lock in the sk_sleep waitqueue.
> > Anything else?
>
> The patch does not introduce any locking (in either vhost or virtio-net).
> The single-stream drop is due to different vhost threads handling the
> RX/TX traffic.
>
> I added a heuristic (fuzzy) to determine if more than one flow
> is being used on the device, and if not, use vhost[0] for both
> TX and RX (vhost_poll_queue figures this out before waking up
> the suitable vhost thread). Testing shows that single-stream
> performance is as good as the original code.
...
> This approach works nicely for both single and multiple streams.
> Does this look good?
>
> Thanks,
>
> - KK
Yes, but I guess it depends on the heuristic :) What's the logic?
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-14 8:17 ` Michael S. Tsirkin
@ 2010-10-14 9:04 ` Krishna Kumar2
[not found] ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
1 sibling, 0 replies; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 9:04 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
> "Michael S. Tsirkin" <mst@redhat.com>
> > > What other shared TX/RX locks are there? In your setup, is the same
> > > macvtap socket structure used for RX and TX? If yes, this will create
> > > cacheline bounces, as sk_wmem_alloc/sk_rmem_alloc share a cache line;
> > > there might also be contention on the lock in the sk_sleep waitqueue.
> > > Anything else?
> >
> > The patch does not introduce any locking (in either vhost or virtio-net).
> > The single-stream drop is due to different vhost threads handling the
> > RX/TX traffic.
> >
> > I added a heuristic (fuzzy) to determine if more than one flow
> > is being used on the device, and if not, use vhost[0] for both
> > TX and RX (vhost_poll_queue figures this out before waking up
> > the suitable vhost thread). Testing shows that single-stream
> > performance is as good as the original code.
>
> ...
>
> > This approach works nicely for both single and multiple streams.
> > Does this look good?
> >
> > Thanks,
> >
> > - KK
>
> Yes, but I guess it depends on the heuristic :) What's the logic?
I define how recently a txq was used. If 0 or 1 txqs were used
recently, use vq[0] (which also handles RX). Otherwise, use
multiple txqs (vq[1-n]). The code is:
/*
 * Algorithm for selecting vq:
 *
 * Condition                                    Return
 * RX vq                                        vq[0]
 * If all txqs unused                           vq[0]
 * If one txq used, and new txq is same         vq[0]
 * If one txq used, and new txq is different    vq[vq->qnum]
 * If > 1 txqs used                             vq[vq->qnum]
 * Where "used" means the txq was used in the last 'n' jiffies.
 *
 * Note: locking is not required as an update race will only result in
 * a different worker being woken up.
 */
static inline struct vhost_virtqueue *vhost_find_vq(struct vhost_poll *poll)
{
        if (poll->vq->qnum) {
                struct vhost_dev *dev = poll->vq->dev;
                struct vhost_virtqueue *vq = &dev->vqs[0];
                unsigned long max_time = jiffies - 5; /* Some macro needed */
                unsigned long *table = dev->jiffies;
                int i, used = 0;

                for (i = 0; i < dev->nvqs - 1; i++) {
                        if (time_after_eq(table[i], max_time) && ++used > 1) {
                                vq = poll->vq;
                                break;
                        }
                }
                table[poll->vq->qnum - 1] = jiffies;
                return vq;
        }

        /* RX is handled by the same worker thread */
        return poll->vq;
}

void vhost_poll_queue(struct vhost_poll *poll)
{
        struct vhost_virtqueue *vq = vhost_find_vq(poll);

        vhost_work_queue(vq, &poll->work);
}
Since poll batches packets, find_vq does not seem to add much
to the CPU utilization (or BW). I am sure that code can be
optimized much better.
The results I sent in my last mail were without your use_mm
patch, and the only tuning was to make vhost threads run on
only cpus 0-3 (though the performance is good even without
that). I will test it later today with the use_mm patch too.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
[not found] ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
@ 2010-10-14 12:17 ` Krishna Kumar2
[not found] ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
1 sibling, 0 replies; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 12:17 UTC (permalink / raw)
To: Krishna Kumar2
Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty
Krishna Kumar2/India/IBM wrote on 10/14/2010 02:34:01 PM:
> void vhost_poll_queue(struct vhost_poll *poll)
> {
>         struct vhost_virtqueue *vq = vhost_find_vq(poll);
>
>         vhost_work_queue(vq, &poll->work);
> }
>
> Since poll batches packets, find_vq does not seem to add much
> to the CPU utilization (or BW). I am sure that code can be
> optimized much better.
>
> The results I sent in my last mail were without your use_mm
> patch, and the only tuning was to make vhost threads run on
> only cpus 0-3 (though the performance is good even without
> that). I will test it later today with the use_mm patch too.
There's a significant reduction in CPU/SD utilization with your
patch. Following is the performance of ORG vs MQ+mm patch:
_________________________________________________
Org vs MQ+mm patch txq=2
# BW% CPU% RCPU% SD% RSD%
_________________________________________________
1 2.26 -1.16 .27 -20.00 0
2 35.07 29.90 21.81 0 -11.11
4 55.03 84.57 37.66 26.92 -4.62
8 73.16 118.69 49.21 45.63 -.46
16 77.43 98.81 47.89 24.07 -7.80
24 71.59 105.18 48.44 62.84 18.18
32 70.91 102.38 47.15 49.22 8.54
40 63.26 90.58 41.00 85.27 37.33
48 45.25 45.99 11.23 14.31 -12.91
64 42.78 41.82 5.50 .43 -25.12
80 31.40 7.31 -18.69 15.78 -11.93
96 27.60 7.79 -18.54 17.39 -10.98
128 23.46 -11.89 -34.41 -.41 -25.53
_________________________________________________
BW: 40.2 CPU/RCPU: 29.9,-2.2 SD/RSD: 12.0,-15.6
Following is the performance of MQ vs MQ+mm patch:
_____________________________________________________
MQ vs MQ+mm patch
# BW% CPU% RCPU% SD% RSD%
_____________________________________________________
1 4.98 -.58 .84 -20.00 0
2 5.17 2.96 2.29 0 -4.00
4 -.18 .25 -.16 3.12 .98
8 -5.47 -1.36 -1.98 17.18 16.57
16 -1.90 -6.64 -3.54 -14.83 -12.12
24 -.01 23.63 14.65 57.61 46.64
32 .27 -3.19 -3.11 -22.98 -22.91
40 -1.06 -2.96 -2.96 -4.18 -4.10
48 -.28 -2.34 -3.71 -2.41 -3.81
64 9.71 33.77 30.65 81.44 77.09
80 -10.69 -31.07 -31.70 -29.22 -29.88
96 -1.14 5.98 .56 -11.57 -16.14
128 -.93 -15.60 -18.31 -19.89 -22.65
_____________________________________________________
BW: 0 CPU/RCPU: -4.2,-6.1 SD/RSD: -13.1,-15.6
_____________________________________________________
Each test case is for 60 secs, summed over two runs (except
when the number of netperf sessions is 1, which has 7 runs
of 10 secs each), numcpus=4, numtxqs=8, etc. No tuning
other than tasksetting each vhost to cpus 0-3.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
[not found] ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
@ 2010-10-14 12:47 ` Krishna Kumar2
0 siblings, 0 replies; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-14 12:47 UTC (permalink / raw)
To: Krishna Kumar2
Cc: anthony, arnd, avi, davem, kvm, Michael S. Tsirkin, netdev, rusty
Krishna Kumar2/India/IBM wrote on 10/14/2010 05:47:54 PM:
Sorry, it should read "txq=8" below.
- KK
> There's a significant reduction in CPU/SD utilization with your
> patch. Following is the performance of ORG vs MQ+mm patch:
>
> _________________________________________________
> Org vs MQ+mm patch txq=2
> # BW% CPU% RCPU% SD% RSD%
> _________________________________________________
> 1 2.26 -1.16 .27 -20.00 0
> 2 35.07 29.90 21.81 0 -11.11
> 4 55.03 84.57 37.66 26.92 -4.62
> 8 73.16 118.69 49.21 45.63 -.46
> 16 77.43 98.81 47.89 24.07 -7.80
> 24 71.59 105.18 48.44 62.84 18.18
> 32 70.91 102.38 47.15 49.22 8.54
> 40 63.26 90.58 41.00 85.27 37.33
> 48 45.25 45.99 11.23 14.31 -12.91
> 64 42.78 41.82 5.50 .43 -25.12
> 80 31.40 7.31 -18.69 15.78 -11.93
> 96 27.60 7.79 -18.54 17.39 -10.98
> 128 23.46 -11.89 -34.41 -.41 -25.53
> _________________________________________________
> BW: 40.2 CPU/RCPU: 29.9,-2.2 SD/RSD: 12.0,-15.6
>
> Following is the performance of MQ vs MQ+mm patch:
> _____________________________________________________
> MQ vs MQ+mm patch
> # BW% CPU% RCPU% SD% RSD%
> _____________________________________________________
> 1 4.98 -.58 .84 -20.00 0
> 2 5.17 2.96 2.29 0 -4.00
> 4 -.18 .25 -.16 3.12 .98
> 8 -5.47 -1.36 -1.98 17.18 16.57
> 16 -1.90 -6.64 -3.54 -14.83 -12.12
> 24 -.01 23.63 14.65 57.61 46.64
> 32 .27 -3.19 -3.11 -22.98 -22.91
> 40 -1.06 -2.96 -2.96 -4.18 -4.10
> 48 -.28 -2.34 -3.71 -2.41 -3.81
> 64 9.71 33.77 30.65 81.44 77.09
> 80 -10.69 -31.07 -31.70 -29.22 -29.88
> 96 -1.14 5.98 .56 -11.57 -16.14
> 128 -.93 -15.60 -18.31 -19.89 -22.65
> _____________________________________________________
> BW: 0 CPU/RCPU: -4.2,-6.1 SD/RSD: -13.1,-15.6
> _____________________________________________________
>
> Each test case is for 60 secs, summed over two runs (except
> when the number of netperf sessions is 1, which has 7 runs
> of 10 secs each), numcpus=4, numtxqs=8, etc. No tuning
> other than tasksetting each vhost to cpus 0-3.
>
> Thanks,
>
> - KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-06 17:43 ` Krishna Kumar2
@ 2010-10-06 19:03 ` Michael S. Tsirkin
0 siblings, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-06 19:03 UTC (permalink / raw)
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty, herbert
On Wed, Oct 06, 2010 at 11:13:31PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 10/05/2010 11:53:23 PM:
>
> > > > Any idea where this comes from?
> > > > Do you see more TX interrupts? RX interrupts? Exits?
> > > > Do interrupts bounce more between guest CPUs?
> > > > 4. Identify reasons for single netperf BW regression.
> > >
> > > After testing various combinations of #txqs, #vhosts, #netperf
> > > sessions, I think the drop for 1 stream is due to TX and RX for
> > > a flow being processed on different cpus.
> >
> > Right. Can we fix it?
>
> I am not sure how to. My initial patch had one thread but gave
> small gains and ran into limitations once the number of sessions
> became large.
Sure. We will need multiple RX queues, and have a single
thread handle a TX and RX pair. Then we need to make sure packets
from a given flow on TX land on the same thread on RX.
As flows can be hashed differently, for this to work we'll have to
expose this info in the host/guest interface.
But since multiqueue implies host/guest ABI changes anyway,
this point is moot.

BTW, an interesting approach could be using bonding
and multiple virtio-net interfaces.
What are the disadvantages of such a setup? One advantage
is that it can be made to work in existing guests.
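In sketch form, the pairing could look like this (names made up for
illustration; this is not part of the posted patches):

        #include <linux/types.h>

        /* Give each TX/RX queue pair its own vhost thread and hash every
         * flow to one pair, so TX and RX of a flow stay on one thread.
         * Guest and host must compute the same hash for this to hold end
         * to end, which is why it touches the host/guest interface. */
        static inline u16 example_flow_to_pair(u32 flow_hash, u16 npairs)
        {
                return (u16)(flow_hash % npairs); /* pair index == thread index */
        }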
> > > I did two more tests:
> > > 1. Pin vhosts to same CPU:
> > > - BW drop is much lower for 1 stream case (- 5 to -8% range)
> > > - But performance is not so high for more sessions.
> > > 2. Changed vhost to be single threaded:
> > > - No degradation for 1 session, and improvement for up to
> > > 8, sometimes 16 streams (5-12%).
> > > - BW degrades after that, all the way till 128 netperf
> sessions.
> > > - But overall CPU utilization improves.
> > > Summary of the entire run (for 1-128 sessions):
> > > txq=4: BW: (-2.3) CPU: (-16.5) RCPU: (-5.3)
> > > txq=16: BW: (-1.9) CPU: (-24.9) RCPU: (-9.6)
> > >
> > > I don't see any reasons mentioned above. However, for higher
> > > number of netperf sessions, I see a big increase in retransmissions:
> >
> > Hmm, ok, and do you see any errors?
>
> I haven't seen any in any statistics, messages, etc.
Herbert, could you help out debugging this increase in retransmissions,
please? An older mail on netdev in this thread has some numbers that seem
to imply that we start hitting retransmissions much more as the # of flows
goes up.

> Also no
> retransmissions for txq=1.
While it's nice that we have this parameter, the need to choose between
single-stream and multi-stream performance when you start the VM makes
this patch much less interesting IMHO.
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-06 17:14 ` Krishna Kumar2
@ 2010-10-06 17:50 ` Arnd Bergmann
0 siblings, 0 replies; 21+ messages in thread
From: Arnd Bergmann @ 2010-10-06 17:50 UTC (permalink / raw)
To: Krishna Kumar2
Cc: anthony, avi, davem, Ben Greear, kvm, Michael S. Tsirkin, netdev, rusty
On Wednesday 06 October 2010 19:14:42 Krishna Kumar2 wrote:
> Arnd Bergmann <arnd@arndb.de> wrote on 10/06/2010 05:49:00 PM:
>
> > > I don't see any reasons mentioned above. However, for higher
> > > number of netperf sessions, I see a big increase in retransmissions:
> > > _______________________________________
> > > #netperf ORG NEW
> > > BW (#retr) BW (#retr)
> > > _______________________________________
> > > 1 70244 (0) 64102 (0)
> > > 4 21421 (0) 36570 (416)
> > > 8 21746 (0) 38604 (148)
> > > 16 21783 (0) 40632 (464)
> > > 32 22677 (0) 37163 (1053)
> > > 64 23648 (4) 36449 (2197)
> > > 128 23251 (2) 31676 (3185)
> > > _______________________________________
> >
> >
> > This smells like it could be related to a problem that Ben Greear found
> > recently (see "macvlan: Enable qdisc backoff logic"). When the hardware
> > is busy, we used to just drop the packet. With Ben's patch, we return
> > -EAGAIN to qemu (or vhost-net) to trigger a resend.
> >
> > I suppose what we really should do is feed that condition back to the
> > guest network stack and implement the backoff in there.
>
> Thanks for the pointer. I will take a look at this as I hadn't seen
> this patch earlier. Is there any way to figure out if this is the
> issue?
I think a good indication would be if this changes with/without the
patch, and if you see -EAGAIN in qemu with the patch applied.
Arnd
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-05 18:23 ` Michael S. Tsirkin
@ 2010-10-06 17:43 ` Krishna Kumar2
2010-10-06 19:03 ` Michael S. Tsirkin
0 siblings, 1 reply; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-06 17:43 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 10/05/2010 11:53:23 PM:
> > > Any idea where this comes from?
> > > Do you see more TX interrupts? RX interrupts? Exits?
> > > Do interrupts bounce more between guest CPUs?
> > > 4. Identify reasons for single netperf BW regression.
> >
> > After testing various combinations of #txqs, #vhosts, #netperf
> > sessions, I think the drop for 1 stream is due to TX and RX for
> > a flow being processed on different cpus.
>
> Right. Can we fix it?
I am not sure how to. My initial patch had one thread but gave
small gains and ran into limitations once the number of sessions
became large.
> > I did two more tests:
> > 1. Pin vhosts to same CPU:
> > - BW drop is much lower for 1 stream case (- 5 to -8% range)
> > - But performance is not so high for more sessions.
> > 2. Changed vhost to be single threaded:
> > - No degradation for 1 session, and improvement for up to
> > 8, sometimes 16 streams (5-12%).
> > - BW degrades after that, all the way till 128 netperf
sessions.
> > - But overall CPU utilization improves.
> > Summary of the entire run (for 1-128 sessions):
> > txq=4: BW: (-2.3) CPU: (-16.5) RCPU: (-5.3)
> > txq=16: BW: (-1.9) CPU: (-24.9) RCPU: (-9.6)
> >
> > I don't see any reasons mentioned above. However, for higher
> > number of netperf sessions, I see a big increase in retransmissions:
>
> Hmm, ok, and do you see any errors?
I haven't seen any in any statistics, messages, etc. Also no
retransmissions for txq=1.
> > Single netperf case didn't have any retransmissions so that is not
> > the cause for drop. I tested ixgbe (MQ):
> > ___________________________________________________________
> > #netperf ixgbe ixgbe (pin intrs to cpu#0 on
> > both server/client)
> > BW (#retr) BW (#retr)
> > ___________________________________________________________
> > 1 3567 (117) 6000 (251)
> > 2 4406 (477) 6298 (725)
> > 4 6119 (1085) 7208 (3387)
> > 8 6595 (4276) 7381 (15296)
> > 16 6651 (11651) 6856 (30394)
>
> Interesting.
> You are saying we get much more retransmissions with physical nic as
> well?
Yes, with ixgbe. I re-ran with 16 netperfs running for 15 secs on
both ixgbe and cxgb3 just now to reconfirm:

ixgbe: BW: 6186.85, SD/Remote: 135.711/339.376, CPU/Remote: 79.99/200.00, Retrans: 545
cxgb3: BW: 8051.07, SD/Remote: 144.416/260.487, CPU/Remote: 110.88/200.00, Retrans: 0

However, 64 netperfs for 30 secs gave:

ixgbe: BW: 6691.12, SD/Remote: 8046.617/5259.992, CPU/Remote: 1223.86/799.97, Retrans: 1424
cxgb3: BW: 7799.16, SD/Remote: 2589.875/4317.013, CPU/Remote: 480.39/800.64, Retrans: 649
# ethtool -i eth4
driver: ixgbe
version: 2.0.84-k2
firmware-version: 0.9-3
bus-info: 0000:1f:00.1
# ifconfig output:
RX packets:783241 errors:0 dropped:0 overruns:0 frame:0
TX packets:689533 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
# lspci output:
1f:00.0 Ethernet controller: Intel Corporation 82599EB 10-Gigabit Network Connection (rev 01)
Subsystem: Intel Corporation Ethernet Server Adapter X520-2
Flags: bus master, fast devsel, latency 0, IRQ 30
Memory at 98900000 (64-bit, prefetchable) [size=512K]
I/O ports at 2020 [size=32]
Memory at 98a00000 (64-bit, prefetchable) [size=16K]
Capabilities: [40] Power Management version 3
Capabilities: [50] MSI: Enable- Count=1/1 Maskable+ 64bit+
Capabilities: [70] MSI-X: Enable+ Count=64 Masked-
Capabilities: [a0] Express Endpoint, MSI 00
Capabilities: [100] Advanced Error Reporting
Capabilities: [140] Device Serial Number 00-1b-21-ff-ff-40-4a-b4
Capabilities: [150] Alternative Routing-ID Interpretation (ARI)
Capabilities: [160] Single Root I/O Virtualization (SR-IOV)
Kernel driver in use: ixgbe
Kernel modules: ixgbe
> > I haven't done this right now since I don't have a setup. I guess
> > it would be limited by wire speed and gains may not be there. I
> > will try to do this later when I get the setup.
>
> OK but at least need to check that it does not hurt things.
Yes, sure.
> > Summary:
> >
> > 1. Average BW increase for regular I/O is best for #txq=16 with the
> > least CPU utilization increase.
> > 2. The average BW for 512 byte I/O is best for lower #txq=2. For higher
> > #txqs, BW increased only after a particular #netperf sessions - in
> > my testing that limit was 32 netperf sessions.
> > 3. Multiple txq for guest by itself doesn't seem to have any issues.
> > Guest CPU% increase is slightly higher than BW improvement. I
> > think it is true for all mq drivers since more paths run in parallel
> > up to the device instead of sleeping and allowing one thread to send
> > all packets via qdisc_restart.
> > 4. Having high number of txqs gives better gains and reduces cpu util
> > on the guest and the host.
> > 5. MQ is intended for server loads. MQ should probably not be explicitly
> > specified for client systems.
> > 6. No regression with numtxqs=1 (or if mq option is not used) in any
> > testing scenario.
>
> Of course txq=1 can be considered a kind of fix, but if we know the
> issue is TX/RX flows getting bounced between CPUs, can we fix this?
> Workload-specific optimizations can only get us this far.
I will test with your patch tomorrow night once I am back.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-06 12:19 ` Arnd Bergmann
@ 2010-10-06 17:14 ` Krishna Kumar2
2010-10-06 17:50 ` Arnd Bergmann
0 siblings, 1 reply; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-06 17:14 UTC (permalink / raw)
To: Arnd Bergmann
Cc: anthony, avi, davem, Ben Greear, kvm, Michael S. Tsirkin, netdev, rusty
Arnd Bergmann <arnd@arndb.de> wrote on 10/06/2010 05:49:00 PM:
> > I don't see any reasons mentioned above. However, for higher
> > number of netperf sessions, I see a big increase in retransmissions:
> > _______________________________________
> > #netperf ORG NEW
> > BW (#retr) BW (#retr)
> > _______________________________________
> > 1 70244 (0) 64102 (0)
> > 4 21421 (0) 36570 (416)
> > 8 21746 (0) 38604 (148)
> > 16 21783 (0) 40632 (464)
> > 32 22677 (0) 37163 (1053)
> > 64 23648 (4) 36449 (2197)
> > 128 23251 (2) 31676 (3185)
> > _______________________________________
>
>
> This smells like it could be related to a problem that Ben Greear found
> recently (see "macvlan: Enable qdisc backoff logic"). When the hardware
> is busy, we used to just drop the packet. With Ben's patch, we return
> -EAGAIN to qemu (or vhost-net) to trigger a resend.
>
> I suppose what we really should do is feed that condition back to the
> guest network stack and implement the backoff in there.
Thanks for the pointer. I will take a look at this as I hadn't seen
this patch earlier. Is there any way to figure out if this is the
issue?
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-05 10:40 ` Krishna Kumar2
2010-10-05 18:23 ` Michael S. Tsirkin
@ 2010-10-06 12:19 ` Arnd Bergmann
2010-10-06 17:14 ` Krishna Kumar2
1 sibling, 1 reply; 21+ messages in thread
From: Arnd Bergmann @ 2010-10-06 12:19 UTC (permalink / raw)
To: Krishna Kumar2, Ben Greear
Cc: Michael S. Tsirkin, anthony, avi, davem, kvm, netdev, rusty
On Tuesday 05 October 2010, Krishna Kumar2 wrote:
> After testing various combinations of #txqs, #vhosts, #netperf
> sessions, I think the drop for 1 stream is due to TX and RX for
> a flow being processed on different cpus. I did two more tests:
> 1. Pin vhosts to same CPU:
> - BW drop is much lower for 1 stream case (- 5 to -8% range)
> - But performance is not so high for more sessions.
> 2. Changed vhost to be single threaded:
> - No degradation for 1 session, and improvement for up to
> 8, sometimes 16 streams (5-12%).
> - BW degrades after that, all the way till 128 netperf sessions.
> - But overall CPU utilization improves.
> Summary of the entire run (for 1-128 sessions):
> txq=4: BW: (-2.3) CPU: (-16.5) RCPU: (-5.3)
> txq=16: BW: (-1.9) CPU: (-24.9) RCPU: (-9.6)
>
> I don't see any reasons mentioned above. However, for higher
> number of netperf sessions, I see a big increase in retransmissions:
> _______________________________________
> #netperf ORG NEW
> BW (#retr) BW (#retr)
> _______________________________________
> 1 70244 (0) 64102 (0)
> 4 21421 (0) 36570 (416)
> 8 21746 (0) 38604 (148)
> 16 21783 (0) 40632 (464)
> 32 22677 (0) 37163 (1053)
> 64 23648 (4) 36449 (2197)
> 128 23251 (2) 31676 (3185)
> _______________________________________
This smells like it could be related to a problem that Ben Greear found
recently (see "macvlan: Enable qdisc backoff logic"). When the hardware
is busy, we used to just drop the packet. With Ben's patch, we return -EAGAIN
to qemu (or vhost-net) to trigger a resend.

I suppose what we really should do is feed that condition back to the
guest network stack and implement the backoff in there.
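In sketch form, the difference is roughly this (hypothetical code with
made-up helpers; not Ben's actual patch):

        #include <linux/netdevice.h>
        #include <linux/skbuff.h>

        /* Made-up stand-in for "the lower device cannot take the packet". */
        static bool example_lower_dev_busy(struct net_device *lowerdev)
        {
                return netif_queue_stopped(lowerdev);
        }

        static netdev_tx_t example_xmit(struct sk_buff *skb,
                                        struct net_device *lowerdev)
        {
                /* Old behaviour: drop silently when busy.  New behaviour:
                 * report congestion so the qdisc requeues the skb and the
                 * sender backs off instead of retransmitting later. */
                if (example_lower_dev_busy(lowerdev))
                        return NETDEV_TX_BUSY;

                skb->dev = lowerdev;
                dev_queue_xmit(skb);    /* consumes the skb either way */
                return NETDEV_TX_OK;
        }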
Arnd
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-10-05 10:40 ` Krishna Kumar2
@ 2010-10-05 18:23 ` Michael S. Tsirkin
2010-10-06 17:43 ` Krishna Kumar2
2010-10-06 12:19 ` Arnd Bergmann
1 sibling, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-10-05 18:23 UTC (permalink / raw)
To: Krishna Kumar2; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
On Tue, Oct 05, 2010 at 04:10:00PM +0530, Krishna Kumar2 wrote:
> "Michael S. Tsirkin" <mst@redhat.com> wrote on 09/19/2010 06:14:43 PM:
>
> > Could you document how exactly you measure multistream bandwidth:
> > netperf flags, etc?
>
> All results were without any netperf flags or system tuning:
> for i in $list
> do
> netperf -c -C -l 60 -H 192.168.122.1 > /tmp/netperf.$$.$i &
> done
> wait
> Another script processes the result files. It also displays the
> start time/end time of each iteration to make sure skew due to
> parallel netperfs is minimal.
>
> I changed the vhost functionality once more to try to get the
> best model, the new model being:
> 1. #numtxqs=1 -> #vhosts=1, this thread handles both RX/TX.
> 2. #numtxqs>1 -> vhost[0] handles RX and vhost[1-MAX] handles
> TX[0-n], where MAX is 4. Beyond numtxqs=4, the remaining TX
> queues are handled by vhost threads in round-robin fashion.
>
> Results from here on are with these changes, and only "tuning" is
> to set each vhost's affinity to CPUs[0-3] ("taskset -p f <vhost-pids>").
>
> > Any idea where this comes from?
> > Do you see more TX interrupts? RX interrupts? Exits?
> > Do interrupts bounce more between guest CPUs?
> > 4. Identify reasons for single netperf BW regression.
>
> After testing various combinations of #txqs, #vhosts, #netperf
> sessions, I think the drop for 1 stream is due to TX and RX for
> a flow being processed on different cpus.
Right. Can we fix it?
> I did two more tests:
> 1. Pin vhosts to same CPU:
> - BW drop is much lower for 1 stream case (- 5 to -8% range)
> - But performance is not so high for more sessions.
> 2. Changed vhost to be single threaded:
> > - No degradation for 1 session, and improvement for up to
> 8, sometimes 16 streams (5-12%).
> - BW degrades after that, all the way till 128 netperf sessions.
> - But overall CPU utilization improves.
> Summary of the entire run (for 1-128 sessions):
> txq=4: BW: (-2.3) CPU: (-16.5) RCPU: (-5.3)
> txq=16: BW: (-1.9) CPU: (-24.9) RCPU: (-9.6)
>
> I don't see any reasons mentioned above. However, for higher
> number of netperf sessions, I see a big increase in retransmissions:
Hmm, ok, and do you see any errors?
> _______________________________________
> #netperf ORG NEW
> BW (#retr) BW (#retr)
> _______________________________________
> 1 70244 (0) 64102 (0)
> 4 21421 (0) 36570 (416)
> 8 21746 (0) 38604 (148)
> 16 21783 (0) 40632 (464)
> 32 22677 (0) 37163 (1053)
> 64 23648 (4) 36449 (2197)
> 128 23251 (2) 31676 (3185)
> _______________________________________
>
> Single netperf case didn't have any retransmissions so that is not
> the cause for drop. I tested ixgbe (MQ):
> ___________________________________________________________
> #netperf ixgbe ixgbe (pin intrs to cpu#0 on
> both server/client)
> BW (#retr) BW (#retr)
> ___________________________________________________________
> 1 3567 (117) 6000 (251)
> 2 4406 (477) 6298 (725)
> 4 6119 (1085) 7208 (3387)
> 8 6595 (4276) 7381 (15296)
> 16 6651 (11651) 6856 (30394)
Interesting.
You are saying we get much more retransmissions with physical nic as
well?
> ___________________________________________________________
>
> > 5. Test perf in more scenarious:
> > small packets
>
> 512-byte packets - BW drop for up to 8 (sometimes 16) netperf sessions,
> but increases with #sessions:
> _______________________________________________________________________________
> # BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
> _______________________________________________________________________________
> 1 4043 3800 (-6.0) 50 50 (0) 86 98 (13.9)
> 2 8358 7485 (-10.4) 153 178 (16.3) 230 264 (14.7)
> 4 20664 13567 (-34.3) 448 490 (9.3) 530 624 (17.7)
> 8 25198 17590 (-30.1) 967 1021 (5.5) 1085 1257 (15.8)
> 16 23791 24057 (1.1) 1904 2220 (16.5) 2156 2578 (19.5)
> 24 23055 26378 (14.4) 2807 3378 (20.3) 3225 3901 (20.9)
> 32 22873 27116 (18.5) 3748 4525 (20.7) 4307 5239 (21.6)
> 40 22876 29106 (27.2) 4705 5717 (21.5) 5388 6591 (22.3)
> 48 23099 31352 (35.7) 5642 6986 (23.8) 6475 8085 (24.8)
> 64 22645 30563 (34.9) 7527 9027 (19.9) 8619 10656 (23.6)
> 80 22497 31922 (41.8) 9375 11390 (21.4) 10736 13485 (25.6)
> 96 22509 32718 (45.3) 11271 13710 (21.6) 12927 16269 (25.8)
> 128 22255 32397 (45.5) 15036 18093 (20.3) 17144 21608 (26.0)
> _______________________________________________________________________________
> SUM: BW: (16.7) CPU: (20.6) RCPU: (24.3)
> _______________________________________________________________________________
>
> > host -> guest
> _______________________________________________________________________________
> # BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
> _______________________________________________________________________________
> *1 70706 90398 (27.8) 300 327 (9.0) 140 175 (25.0)
> 2 20951 21937 (4.7) 188 196 (4.2) 93 103 (10.7)
> 4 19952 25281 (26.7) 397 496 (24.9) 210 304 (44.7)
> 8 18559 24992 (34.6) 802 1010 (25.9) 439 659 (50.1)
> 16 18882 25608 (35.6) 1642 2082 (26.7) 953 1454 (52.5)
> 24 19012 26955 (41.7) 2465 3153 (27.9) 1452 2254 (55.2)
> 32 19846 26894 (35.5) 3278 4238 (29.2) 1914 3081 (60.9)
> 40 19704 27034 (37.2) 4104 5303 (29.2) 2409 3866 (60.4)
> 48 19721 26832 (36.0) 4924 6418 (30.3) 2898 4701 (62.2)
> 64 19650 26849 (36.6) 6595 8611 (30.5) 3975 6433 (61.8)
> 80 19432 26823 (38.0) 8244 10817 (31.2) 4985 8165 (63.7)
> 96 20347 27886 (37.0) 9913 13017 (31.3) 5982 9860 (64.8)
> 128 19108 27715 (45.0) 13254 17546 (32.3) 8153 13589 (66.6)
> _______________________________________________________________________________
> SUM: BW: (32.4) CPU: (30.4) RCPU: (62.6)
> _______________________________________________________________________________
> *: Sum over 7 iterations, remaining test cases are sum over 2 iterations
>
> > guest <-> external
>
> I haven't done this right now since I don't have a setup. I guess
> it would be limited by wire speed and gains may not be there. I
> will try to do this later when I get the setup.
OK but at least need to check that it does not hurt things.
> > in last case:
> > find some other way to measure host CPU utilization,
> > try multiqueue and single queue devices
> > 6. Use above to figure out what is a sane default for numtxqs
>
> A. Summary for default I/O (16K):
> #txqs=2 (#vhost=3): BW: (37.6) CPU: (69.2) RCPU: (40.8)
> #txqs=4 (#vhost=5): BW: (36.9) CPU: (60.9) RCPU: (25.2)
> #txqs=8 (#vhost=5): BW: (41.8) CPU: (50.0) RCPU: (15.2)
> #txqs=16 (#vhost=5): BW: (40.4) CPU: (49.9) RCPU: (10.0)
>
> B. Summary for 512 byte I/O:
> #txqs=2 (#vhost=3): BW: (31.6) CPU: (35.7) RCPU: (28.6)
> #txqs=4 (#vhost=5): BW: (5.7) CPU: (27.2) RCPU: (22.7)
> #txqs=8 (#vhost=5): BW: (-.6) CPU: (25.1) RCPU: (22.5)
> #txqs=16 (#vhost=5): BW: (-6.6) CPU: (24.7) RCPU: (21.7)
>
> Summary:
>
> 1. Average BW increase for regular I/O is best for #txq=16 with the
> least CPU utilization increase.
> 2. The average BW for 512 byte I/O is best for lower #txq=2. For higher
> #txqs, BW increased only after a particular #netperf sessions - in
> my testing that limit was 32 netperf sessions.
> 3. Multiple txq for guest by itself doesn't seem to have any issues.
> Guest CPU% increase is slightly higher than BW improvement. I
> think it is true for all mq drivers since more paths run in parallel
> up to the device instead of sleeping and allowing one thread to send
> all packets via qdisc_restart.
> 4. Having high number of txqs gives better gains and reduces cpu util
> on the guest and the host.
> 5. MQ is intended for server loads. MQ should probably not be explicitly
> specified for client systems.
> 6. No regression with numtxqs=1 (or if mq option is not used) in any
> testing scenario.
Of course txq=1 can be considered a kind of fix, but if we know the
issue is TX/RX flows getting bounced between CPUs, can we fix this?
Workload-specific optimizations can only get us this far.
>
> I will send the v3 patch within a day after some more testing.
>
> Thanks,
>
> - KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-09-19 12:44 ` Michael S. Tsirkin
@ 2010-10-05 10:40 ` Krishna Kumar2
2010-10-05 18:23 ` Michael S. Tsirkin
2010-10-06 12:19 ` Arnd Bergmann
0 siblings, 2 replies; 21+ messages in thread
From: Krishna Kumar2 @ 2010-10-05 10:40 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: anthony, arnd, avi, davem, kvm, netdev, rusty
"Michael S. Tsirkin" <mst@redhat.com> wrote on 09/19/2010 06:14:43 PM:
> Could you document how exactly you measure multistream bandwidth:
> netperf flags, etc?
All results were without any netperf flags or system tuning:
for i in $list
do
netperf -c -C -l 60 -H 192.168.122.1 > /tmp/netperf.$$.$i &
done
wait
Another script processes the result files. It also displays the
start time/end time of each iteration to make sure skew due to
parallel netperfs is minimal.
I changed the vhost functionality once more to try to get the
best model, the new model being:
1. #numtxqs=1 -> #vhosts=1, this thread handles both RX/TX.
2. #numtxqs>1 -> vhost[0] handles RX and vhost[1-MAX] handles
TX[0-n], where MAX is 4. Beyond numtxqs=4, the remaining TX
queues are handled by vhost threads in round-robin fashion.
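In sketch form, the vq -> vhost-thread mapping is (names made up for
illustration, not the patch itself):

        #include <linux/types.h>

        #define MAX_TX_VHOSTS   4

        static int example_vhost_for_vq(int numtxqs, bool is_rx, int txq)
        {
                if (numtxqs == 1 || is_rx)
                        return 0; /* vhost[0]: RX, plus TX when single queue */
                /* TX queues beyond 4 wrap around over vhost[1..MAX_TX_VHOSTS] */
                return 1 + txq % MAX_TX_VHOSTS;
        }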
Results from here on are with these changes, and only "tuning" is
to set each vhost's affinity to CPUs[0-3] ("taskset -p f <vhost-pids>").
> Any idea where this comes from?
> Do you see more TX interrupts? RX interrupts? Exits?
> Do interrupts bounce more between guest CPUs?
> 4. Identify reasons for single netperf BW regression.
After testing various combinations of #txqs, #vhosts, #netperf
sessions, I think the drop for 1 stream is due to TX and RX for
a flow being processed on different cpus. I did two more tests:
1. Pin vhosts to same CPU:
- BW drop is much lower for 1 stream case (- 5 to -8% range)
- But performance is not so high for more sessions.
2. Changed vhost to be single threaded:
- No degradation for 1 session, and improvement for up to
8, sometimes 16 streams (5-12%).
- BW degrades after that, all the way till 128 netperf sessions.
- But overall CPU utilization improves.
Summary of the entire run (for 1-128 sessions):
txq=4: BW: (-2.3) CPU: (-16.5) RCPU: (-5.3)
txq=16: BW: (-1.9) CPU: (-24.9) RCPU: (-9.6)
I don't see any reasons mentioned above. However, for higher
number of netperf sessions, I see a big increase in retransmissions:
_______________________________________
#netperf ORG NEW
BW (#retr) BW (#retr)
_______________________________________
1 70244 (0) 64102 (0)
4 21421 (0) 36570 (416)
8 21746 (0) 38604 (148)
16 21783 (0) 40632 (464)
32 22677 (0) 37163 (1053)
64 23648 (4) 36449 (2197)
128 23251 (2) 31676 (3185)
_______________________________________
Single netperf case didn't have any retransmissions so that is not
the cause for drop. I tested ixgbe (MQ):
___________________________________________________________
#netperf ixgbe ixgbe (pin intrs to cpu#0 on
both server/client)
BW (#retr) BW (#retr)
___________________________________________________________
1 3567 (117) 6000 (251)
2 4406 (477) 6298 (725)
4 6119 (1085) 7208 (3387)
8 6595 (4276) 7381 (15296)
16 6651 (11651) 6856 (30394)
___________________________________________________________
> 5. Test perf in more scenarious:
> small packets
512-byte packets - BW drop for up to 8 (sometimes 16) netperf sessions,
but increases with #sessions:
_______________________________________________________________________________
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
_______________________________________________________________________________
1 4043 3800 (-6.0) 50 50 (0) 86 98 (13.9)
2 8358 7485 (-10.4) 153 178 (16.3) 230 264 (14.7)
4 20664 13567 (-34.3) 448 490 (9.3) 530 624 (17.7)
8 25198 17590 (-30.1) 967 1021 (5.5) 1085 1257 (15.8)
16 23791 24057 (1.1) 1904 2220 (16.5) 2156 2578 (19.5)
24 23055 26378 (14.4) 2807 3378 (20.3) 3225 3901 (20.9)
32 22873 27116 (18.5) 3748 4525 (20.7) 4307 5239 (21.6)
40 22876 29106 (27.2) 4705 5717 (21.5) 5388 6591 (22.3)
48 23099 31352 (35.7) 5642 6986 (23.8) 6475 8085 (24.8)
64 22645 30563 (34.9) 7527 9027 (19.9) 8619 10656 (23.6)
80 22497 31922 (41.8) 9375 11390 (21.4) 10736 13485 (25.6)
96 22509 32718 (45.3) 11271 13710 (21.6) 12927 16269 (25.8)
128 22255 32397 (45.5) 15036 18093 (20.3) 17144 21608 (26.0)
_______________________________________________________________________________
SUM: BW: (16.7) CPU: (20.6) RCPU: (24.3)
_______________________________________________________________________________
> host -> guest
_______________________________________________________________________________
# BW1 BW2 (%) CPU1 CPU2 (%) RCPU1 RCPU2 (%)
_______________________________________________________________________________
*1 70706 90398 (27.8) 300 327 (9.0) 140 175 (25.0)
2 20951 21937 (4.7) 188 196 (4.2) 93 103 (10.7)
4 19952 25281 (26.7) 397 496 (24.9) 210 304 (44.7)
8 18559 24992 (34.6) 802 1010 (25.9) 439 659 (50.1)
16 18882 25608 (35.6) 1642 2082 (26.7) 953 1454 (52.5)
24 19012 26955 (41.7) 2465 3153 (27.9) 1452 2254 (55.2)
32 19846 26894 (35.5) 3278 4238 (29.2) 1914 3081 (60.9)
40 19704 27034 (37.2) 4104 5303 (29.2) 2409 3866 (60.4)
48 19721 26832 (36.0) 4924 6418 (30.3) 2898 4701 (62.2)
64 19650 26849 (36.6) 6595 8611 (30.5) 3975 6433 (61.8)
80 19432 26823 (38.0) 8244 10817 (31.2) 4985 8165 (63.7)
96 20347 27886 (37.0) 9913 13017 (31.3) 5982 9860 (64.8)
128 19108 27715 (45.0) 13254 17546 (32.3) 8153 13589 (66.6)
_______________________________________________________________________________
SUM: BW: (32.4) CPU: (30.4) RCPU: (62.6)
_______________________________________________________________________________
*: Sum over 7 iterations, remaining test cases are sum over 2 iterations
> guest <-> external
I haven't done this right now since I don't have a setup. I guess
it would be limited by wire speed and gains may not be there. I
will try to do this later when I get the setup.
> in last case:
> find some other way to measure host CPU utilization,
> try multiqueue and single queue devices
> 6. Use above to figure out what is a sane default for numtxqs
A. Summary for default I/O (16K):
#txqs=2 (#vhost=3): BW: (37.6) CPU: (69.2) RCPU: (40.8)
#txqs=4 (#vhost=5): BW: (36.9) CPU: (60.9) RCPU: (25.2)
#txqs=8 (#vhost=5): BW: (41.8) CPU: (50.0) RCPU: (15.2)
#txqs=16 (#vhost=5): BW: (40.4) CPU: (49.9) RCPU: (10.0)
B. Summary for 512 byte I/O:
#txqs=2 (#vhost=3): BW: (31.6) CPU: (35.7) RCPU: (28.6)
#txqs=4 (#vhost=5): BW: (5.7) CPU: (27.2) RCPU: (22.7)
#txqs=8 (#vhost=5): BW: (-.6) CPU: (25.1) RCPU: (22.5)
#txqs=16 (#vhost=5): BW: (-6.6) CPU: (24.7) RCPU: (21.7)
Summary:
1. Average BW increase for regular I/O is best for #txq=16 with the
least CPU utilization increase.
2. The average BW for 512 byte I/O is best for lower #txq=2. For higher
#txqs, BW increased only after a particular #netperf sessions - in
my testing that limit was 32 netperf sessions.
3. Multiple txq for guest by itself doesn't seem to have any issues.
Guest CPU% increase is slightly higher than BW improvement. I
think it is true for all mq drivers since more paths run in parallel
up to the device instead of sleeping and allowing one thread to send
all packets via qdisc_restart.
4. Having high number of txqs gives better gains and reduces cpu util
on the guest and the host.
5. MQ is intended for server loads. MQ should probably not be explicitly
specified for client systems.
6. No regression with numtxqs=1 (or if mq option is not used) in any
testing scenario.
I will send the v3 patch within a day after some more testing.
Thanks,
- KK
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-09-17 10:03 Krishna Kumar
2010-09-17 15:42 ` Sridhar Samudrala
@ 2010-09-19 12:44 ` Michael S. Tsirkin
2010-10-05 10:40 ` Krishna Kumar2
1 sibling, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2010-09-19 12:44 UTC (permalink / raw)
To: Krishna Kumar; +Cc: rusty, davem, kvm, arnd, netdev, avi, anthony
On Fri, Sep 17, 2010 at 03:33:07PM +0530, Krishna Kumar wrote:
> For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
> for degradation for 1 stream case:
Could you document how exactly you measure multistream bandwidth
(netperf flags, etc.)?
> 1. Without any tuning, BW falls -6.5%.
Any idea where this comes from?
Do you see more TX interrupts? RX interrupts? Exits?
Do interrupts bounce more between guest CPUs?
> 2. When vhosts on the server were bound to CPU0, BW was as good
> as with the original code.
> 3. When new code was started with numtxqs=1 (or mq=off, which
> is the default), there was no degradation.
>
> Next steps:
> -----------
> 1. MQ RX patch is also complete - plan to submit once TX is OK (as
> well as after identifying bandwidth degradations for some test
> cases).
> 2. Cache-align data structures: I didn't see any BW/SD improvement
> after making the sq's (and similarly for vhost) cache-aligned
> statically:
> struct virtnet_info {
>         ...
>         struct send_queue sq[16] ____cacheline_aligned_in_smp;
>         ...
> };
> 3. Migration is not tested.
4. Identify reasons for single netperf BW regression.
5. Test perf in more scenarios:
small packets
host -> guest
guest <-> external
in last case:
find some other way to measure host CPU utilization,
try multiqueue and single queue devices
6. Use above to figure out what is a sane default for numtxqs.
>
> Review/feedback appreciated.
>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> ---
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
2010-09-17 10:03 Krishna Kumar
@ 2010-09-17 15:42 ` Sridhar Samudrala
2010-09-19 12:44 ` Michael S. Tsirkin
1 sibling, 0 replies; 21+ messages in thread
From: Sridhar Samudrala @ 2010-09-17 15:42 UTC (permalink / raw)
To: Krishna Kumar; +Cc: rusty, davem, mst, kvm, arnd, netdev, avi, anthony
On Fri, 2010-09-17 at 15:33 +0530, Krishna Kumar wrote:
> The following patches implement transmit MQ in virtio-net. Also
> included are the userspace qemu changes. MQ is disabled by default
> unless qemu specifies it.
>
> 1. This feature was first implemented with a single vhost.
> Testing showed 3-8% performance gain for up to 8 netperf
> sessions (and sometimes 16), but BW dropped with more
> sessions. However, adding more vhosts improved BW
> significantly all the way to 128 sessions. Multiple
> vhosts are implemented in-kernel by passing an argument
> to SET_OWNER (retaining backward compatibility). The
> vhost patch adds 173 source lines (incl comments).
> 2. BW -> CPU/SD equation: Average TCP performance increased
> 23% compared to almost 70% for the earlier patch (with
> unrestricted #vhosts). SD improved -4.2% while it had
> increased 55% for the earlier patch. Increasing #vhosts
> has its pros and cons, but this patch lays emphasis on
> reducing CPU utilization. Another option could be a
> tunable to select the number of vhost threads.
> 3. Interoperability: Many, but not all, combinations of qemu,
> host and guest were tested together. Tested with multiple i/f's
> on the guest, with both mq=on/off, vhost=on/off, etc.
>
> Changes from rev1:
> ------------------
> 1. Move queue_index from virtio_pci_vq_info to virtqueue,
> and resulting changes to existing code and to the patch.
> 2. virtio-net probe uses virtio_config_val.
> 3. Remove constants: VIRTIO_MAX_TXQS, MAX_VQS, all arrays
> allocated on stack, etc.
> 4. Restrict number of vhost threads to 2 - I get much better
> cpu/sd results (without any tuning) with a low number of vhost
> threads. More vhost threads give better average BW performance
> (average of 45%), but SD increases significantly (90%).
> 5. The way work is split among vhost threads changes, e.g. for numtxqs=4:
> vhost-0: handles RX
> vhost-1: handles TX[0]
> vhost-0: handles TX[1]
> vhost-1: handles TX[2]
> vhost-0: handles TX[3]
This doesn't look symmetrical.
TCP flows that go via TX(1,3) use the same vhost thread for RX packets,
whereas flows via TX(0,2) use a different vhost thread.
Thanks
Sridhar
^ permalink raw reply [flat|nested] 21+ messages in thread
* [v2 RFC PATCH 0/4] Implement multiqueue virtio-net
@ 2010-09-17 10:03 Krishna Kumar
2010-09-17 15:42 ` Sridhar Samudrala
2010-09-19 12:44 ` Michael S. Tsirkin
0 siblings, 2 replies; 21+ messages in thread
From: Krishna Kumar @ 2010-09-17 10:03 UTC (permalink / raw)
To: rusty, davem, mst; +Cc: kvm, arnd, netdev, avi, anthony, Krishna Kumar
The following patches implement transmit MQ in virtio-net. Also
included are the userspace qemu changes. MQ is disabled by default
unless qemu specifies it.
1. This feature was first implemented with a single vhost.
Testing showed 3-8% performance gain for up to 8 netperf
sessions (and sometimes 16), but BW dropped with more
sessions. However, adding more vhosts improved BW
significantly all the way to 128 sessions. Multiple
vhosts are implemented in-kernel by passing an argument
to SET_OWNER (retaining backward compatibility); a sketch
of the call follows this list. The vhost patch adds 173
source lines (incl comments).
2. BW -> CPU/SD equation: Average TCP performance increased
23% compared to almost 70% for the earlier patch (with
unrestricted #vhosts). SD improved -4.2% while it had
increased 55% for the earlier patch. Increasing #vhosts
has its pros and cons, but this patch lays emphasis on
reducing CPU utilization. Another option could be a
tunable to select the number of vhost threads.
3. Interoperability: Many, but not all, combinations of qemu,
host and guest were tested together. Tested with multiple i/f's
on the guest, with both mq=on/off, vhost=on/off, etc.
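A minimal userspace sketch of the extended SET_OWNER from point 1
(the argument form shown is an assumption for illustration, not
necessarily this patch's actual ABI; calling with no argument keeps
the old single-thread behaviour):

        #include <sys/ioctl.h>
        #include <linux/vhost.h>

        /* Ask for 'nthreads' vhost worker threads on an open
         * /dev/vhost-net descriptor; returns 0 on success. */
        static int set_vhost_threads(int vhost_fd, unsigned long nthreads)
        {
                return ioctl(vhost_fd, VHOST_SET_OWNER, nthreads);
        }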
Changes from rev1:
------------------
1. Move queue_index from virtio_pci_vq_info to virtqueue,
and resulting changes to existing code and to the patch.
2. virtio-net probe uses virtio_config_val.
3. Remove constants: VIRTIO_MAX_TXQS, MAX_VQS, all arrays
allocated on stack, etc.
4. Restrict number of vhost threads to 2 - I get much better
cpu/sd results (without any tuning) with a low number of vhost
threads. More vhost threads give better average BW performance
(average of 45%), but SD increases significantly (90%).
5. The way work is split among vhost threads changes, e.g. for
numtxqs=4 (a C restatement of this mapping follows the listing):
vhost-0: handles RX
vhost-1: handles TX[0]
vhost-0: handles TX[1]
vhost-1: handles TX[2]
vhost-0: handles TX[3]
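The listing above amounts to a simple modulo mapping of virtqueues
to worker threads; restated in C (illustrative, not code from the
patch):

        /* vq 0 is RX and vq i (i >= 1) is TX[i-1]; with nthreads == 2
         * this gives RX -> vhost-0, TX[0] -> vhost-1, TX[1] -> vhost-0,
         * and so on, alternating as listed above. */
        static int vq_to_thread(int vq, int nthreads)
        {
                return vq % nthreads;
        }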
Enabling MQ on virtio:
-----------------------
When the following options are passed to qemu:
- smp > 1
- vhost=on
- mq=on (new option, default:off)
then #txqueues = #cpus. The number of txqueues can be changed
with the optional 'numtxqs' option, e.g. for an smp=4 guest:
vhost=on -> #txqueues = 1
vhost=on,mq=on -> #txqueues = 4
vhost=on,mq=on,numtxqs=8 -> #txqueues = 8
vhost=on,mq=on,numtxqs=2 -> #txqueues = 2
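For illustration, a full invocation might look as follows (only the
mq/numtxqs/vhost option names above come from these patches; their
exact placement on the qemu command line here is an assumption of
this sketch):

        qemu-system-x86_64 -smp 4 -m 2048 -drive file=guest.img \
                -netdev tap,id=hn0,vhost=on \
                -device virtio-net-pci,netdev=hn0,mq=on,numtxqs=4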
Performance (guest -> local host):
-----------------------------------
System configuration:
Host: 8 Intel Xeon, 8 GB memory
Guest: 4 cpus, 2 GB memory, numtxqs=4
All testing was done without any system tuning, using default netperf.
Results are split across two tables to show SD and CPU usage:
________________________________________________________________________
TCP: BW vs CPU/Remote CPU utilization:
#     BW1     BW2     (%)       CPU1    CPU2    (%)       RCPU1   RCPU2   (%)
________________________________________________________________________
1     69971   65376   (-6.56)   134     170     (26.86)   322     376     (16.77)
2     20911   24839   (18.78)   107     139     (29.90)   217     264     (21.65)
4     21431   28912   (34.90)   213     318     (49.29)   444     541     (21.84)
8     21857   34592   (58.26)   444     859     (93.46)   901     1247    (38.40)
16    22368   33083   (47.90)   899     1523    (69.41)   1813    2410    (32.92)
24    22556   32578   (44.43)   1347    2249    (66.96)   2712    3606    (32.96)
32    22727   30923   (36.06)   1806    2506    (38.75)   3622    3952    (9.11)
40    23054   29334   (27.24)   2319    2872    (23.84)   4544    4551    (0.15)
48    23006   28800   (25.18)   2827    2990    (5.76)    5465    4718    (-13.66)
64    23411   27661   (18.15)   3708    3306    (-10.84)  7231    5218    (-27.83)
80    23175   27141   (17.11)   4796    4509    (-5.98)   9152    7182    (-21.52)
96    23337   26759   (14.66)   5603    4543    (-18.91)  10890   7162    (-34.23)
128   22726   28339   (24.69)   7559    6395    (-15.39)  14600   10169   (-30.34)
________________________________________________________________________
Summary: BW: 22.8% CPU: 1.9% RCPU: -17.0%
________________________________________________________________________
TCP: BW vs SD/Remote SD:
#     BW1     BW2     (%)       SD1     SD2     (%)       RSD1    RSD2    (%)
________________________________________________________________________
1     69971   65376   (-6.56)   4       6       (50.00)   21      26      (23.80)
2     20911   24839   (18.78)   6       7       (16.66)   27      28      (3.70)
4     21431   28912   (34.90)   26      31      (19.23)   108     111     (2.77)
8     21857   34592   (58.26)   106     135     (27.35)   432     393     (-9.02)
16    22368   33083   (47.90)   431     577     (33.87)   1742    1828    (4.93)
24    22556   32578   (44.43)   972     1393    (43.31)   3915    4479    (14.40)
32    22727   30923   (36.06)   1723    2165    (25.65)   6908    6842    (-0.95)
40    23054   29334   (27.24)   2774    2761    (-0.46)   10874   8764    (-19.40)
48    23006   28800   (25.18)   4126    3847    (-6.76)   15953   12172   (-23.70)
64    23411   27661   (18.15)   7216    6035    (-16.36)  28146   19078   (-32.21)
80    23175   27141   (17.11)   11729   12454   (6.18)    44765   39750   (-11.20)
96    23337   26759   (14.66)   16745   15905   (-5.01)   65099   50261   (-22.79)
128   22726   28339   (24.69)   30571   27893   (-8.76)   118089  89994   (-23.79)
________________________________________________________________________
Summary: BW: 22.8% SD: -4.21% RSD: -21.06%
________________________________________________________________________
UDP: BW vs CPU/SD:
#     BW1     BW2     (%)       CPU1    CPU2    (%)       SD1     SD2     (%)
_____________________________________________________________________________
1     36521   37415   (2.44)    61      61      (0)       2       2       (0)
4     28585   46903   (64.08)   397     546     (37.53)   72      68      (-5.55)
8     26649   44694   (67.71)   851     1243    (46.06)   334     339     (1.49)
16    25905   43385   (67.47)   1740    2631    (51.20)   1409    1572    (11.56)
32    24980   40448   (61.92)   3502    5360    (53.05)   5881    6401    (8.84)
48    27439   39451   (43.77)   5410    8324    (53.86)   12475   14855   (19.07)
64    25682   39915   (55.42)   7165    10825   (51.08)   23404   25982   (11.01)
96    26205   40190   (53.36)   10855   16283   (50.00)   52124   75014   (43.91)
128   25741   40252   (56.37)   14448   22186   (53.55)   133922  96843   (-27.68)
____________________________________________________________________________
Summary: BW: 50.4% CPU: 51.8% SD: -27.68%
_____________________________________________________________________________
#: Number of netperf sessions (60 sec runs)
BW1, SD1, RSD1: Bandwidth (sum across 2 runs in mbps), SD and Remote
SD for the original code
BW2, SD2, RSD2: Bandwidth (sum across 2 runs in mbps), SD and Remote
SD for the new code
CPU1, CPU2, RCPU1, RCPU2: Similar to SD.
For 1 TCP netperf, I ran 7 iterations and summed it. Explanation
for degradation for 1 stream case:
1. Without any tuning, BW falls -6.5%.
2. When vhosts on the server were bound to CPU0, BW was as good
as with the original code.
3. When new code was started with numtxqs=1 (or mq=off, which
is the default), there was no degradation.
Next steps:
-----------
1. MQ RX patch is also complete - plan to submit once TX is OK (as
well as after identifying bandwidth degradations for some test
cases).
2. Cache-align data structures: I didn't see any BW/SD improvement
after making the sq's (and similarly for vhost) cache-aligned
statically (see the note after this list):
        struct virtnet_info {
                ...
                struct send_queue sq[16] ____cacheline_aligned_in_smp;
                ...
        };
3. Migration is not tested.
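A note on item 2: a member attribute like the one above aligns only
the start of the sq[] array, not each element, so adjacent queues
can still share cachelines. A hedged variant that pads every queue
(standard kernel idiom, not something the posting reports trying):

        /* Aligning the type itself rounds sizeof(struct send_queue)
         * up to a cacheline multiple, so each sq[i] starts on its
         * own cacheline. */
        struct send_queue {
                /* ... queue fields ... */
        } ____cacheline_aligned_in_smp;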
Review/feedback appreciated.
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2010-10-14 12:47 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-06 13:34 [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 1/2] vhost: put mm after thread stop Michael S. Tsirkin
2010-10-06 13:34 ` [PATCH 2/2] vhost-net: batch use/unuse mm Michael S. Tsirkin
2010-10-06 17:02 ` [v2 RFC PATCH 0/4] Implement multiqueue virtio-net Krishna Kumar2
2010-10-11 7:21 ` Krishna Kumar2
2010-10-12 17:09 ` Michael S. Tsirkin
2010-10-14 7:58 ` Krishna Kumar2
2010-10-14 8:17 ` Michael S. Tsirkin
2010-10-14 9:04 ` Krishna Kumar2
[not found] ` <OFEC86A094.39835EBF-ON652577BC.002F9AAF-652577BC.003186B5@LocalDomain>
2010-10-14 12:17 ` Krishna Kumar2
[not found] ` <OF0BDA6B3A.F673A449-ON652577BC.00422911-652577BC.0043474B@LocalDomain>
2010-10-14 12:47 ` Krishna Kumar2
-- strict thread matches above, loose matches on Subject: below --
2010-09-17 10:03 Krishna Kumar
2010-09-17 15:42 ` Sridhar Samudrala
2010-09-19 12:44 ` Michael S. Tsirkin
2010-10-05 10:40 ` Krishna Kumar2
2010-10-05 18:23 ` Michael S. Tsirkin
2010-10-06 17:43 ` Krishna Kumar2
2010-10-06 19:03 ` Michael S. Tsirkin
2010-10-06 12:19 ` Arnd Bergmann
2010-10-06 17:14 ` Krishna Kumar2
2010-10-06 17:50 ` Arnd Bergmann