* [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
@ 2012-03-22 23:48 Shirley Ma
2012-03-23 0:16 ` Shirley Ma
0 siblings, 1 reply; 8+ messages in thread
From: Shirley Ma @ 2012-03-22 23:48 UTC (permalink / raw)
To: Michael S. Tsirkin, netdev, tahm, kvm
Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Tested-by: Tom Lendacky <toml@us.ibm.com>
---
drivers/vhost/net.c | 26 ++-
drivers/vhost/vhost.c | 300 ++++++++++++++++++++++++----------
drivers/vhost/vhost.h | 16 ++-
3 files changed, 243 insertions(+), 103 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9dab1f5..4664e63 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256
-enum {
- VHOST_NET_VQ_RX = 0,
- VHOST_NET_VQ_TX = 1,
- VHOST_NET_VQ_MAX = 2,
-};
-
enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -510,8 +504,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
return r;
}
- vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
- vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT,
+ &n->vqs[VHOST_NET_VQ_TX]);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN,
+ &n->vqs[VHOST_NET_VQ_RX]);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
f->private_data = n;
@@ -863,15 +859,27 @@ static struct miscdevice vhost_net_misc = {
static int vhost_net_init(void)
{
+ int ret;
+
if (experimental_zcopytx)
vhost_enable_zcopy(VHOST_NET_VQ_TX);
- return misc_register(&vhost_net_misc);
+
+ ret = misc_register(&vhost_net_misc);
+ if (ret)
+ return ret;
+
+ ret = vhost_init();
+ if (ret)
+ misc_deregister(&vhost_net_misc);
+
+ return ret;
}
module_init(vhost_net_init);
static void vhost_net_exit(void)
{
misc_deregister(&vhost_net_misc);
+ vhost_cleanup();
}
module_exit(vhost_net_exit);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c14c42b..9fabc5a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -24,7 +24,7 @@
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/kthread.h>
-#include <linux/cgroup.h>
+#include <linux/cpu.h>
#include <linux/net.h>
#include <linux/if_packet.h>
@@ -42,6 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly;
#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+/* per cpu vhost struct */
+struct vhost {
+ struct task_struct *worker;
+ spinlock_t lock;
+ struct list_head work_list;
+};
+
+static DEFINE_PER_CPU(struct vhost, vhosts);
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -64,25 +73,28 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
return 0;
}
-static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
+static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn,
+ struct vhost_virtqueue *vq)
{
INIT_LIST_HEAD(&work->node);
work->fn = fn;
init_waitqueue_head(&work->done);
work->flushing = 0;
work->queue_seq = work->done_seq = 0;
+ work->vq = vq;
+ spin_lock_init(&work->lock);
}
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- unsigned long mask, struct vhost_dev *dev)
+ unsigned long mask, struct vhost_virtqueue *vq)
{
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
- poll->dev = dev;
+ poll->dev = vq->dev;
- vhost_work_init(&poll->work, fn);
+ vhost_work_init(&poll->work, fn, vq);
}
/* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -108,25 +120,30 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
{
int left;
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&work->lock);
left = seq - work->done_seq;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
return left <= 0;
}
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+/* only flushing this work? */
+static void vhost_work_flush(struct vhost_poll *poll)
{
unsigned seq;
int flushing;
+ struct vhost_dev *dev = poll->dev;
+ struct vhost_work *work = &poll->work;
- spin_lock_irq(&dev->work_lock);
+ if (list_empty(&work->node))
+ return;
+ spin_lock_irq(&work->lock);
seq = work->queue_seq;
work->flushing++;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
wait_event(work->done, vhost_work_seq_done(dev, work, seq));
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&work->lock);
flushing = --work->flushing;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
BUG_ON(flushing < 0);
}
@@ -134,21 +151,59 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
* locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
- vhost_work_flush(poll->dev, &poll->work);
+ vhost_work_flush(poll);
+}
+
+/* schedule the cpu on the same socket but different cpu with the given one */
+static unsigned long sched_node_cpu(unsigned long cpu)
+{
+ int node, ncpus_node;
+ unsigned long sched_cpu = cpu;
+
+ node = cpu_to_node(cpu);
+ ncpus_node = nr_cpus_node(node);
+ if (ncpus_node != 1) {
+ /* pick up a random cpu on the same node, exclude
+ * the input one
+ */
+ sched_cpu = node * ncpus_node + random32() % (ncpus_node - 1);
+ if (sched_cpu >= cpu)
+ ++sched_cpu;
+ /* todo hotplug cpu race */
+ if (!cpu_online(sched_cpu))
+ sched_cpu = cpu;
+ }
+ return sched_cpu;
}
static inline void vhost_work_queue(struct vhost_dev *dev,
struct vhost_work *work)
{
- unsigned long flags;
-
- spin_lock_irqsave(&dev->work_lock, flags);
+ unsigned long cpu = work->vq->cpu;
+ struct vhost *vhost;
+
+ /* Is it safe to disable vq notify here ? */
+ vhost_disable_notify(dev, work->vq);
+
+ /* schedule the work on the cpu socket as the work has been delivered
+ * but different with the cpu the work is delivered on
+ */
+ preempt_disable();
+ if (cpu_to_node(cpu) != cpu_to_node(smp_processor_id())) {
+ cpu = sched_node_cpu(smp_processor_id());
+ work->vq->cpu = cpu;
+ }
+ preempt_enable();
+ vhost = &per_cpu(vhosts, cpu);
+ spin_lock_irq(&vhost->lock);
+ spin_lock(&work->lock);
if (list_empty(&work->node)) {
- list_add_tail(&work->node, &dev->work_list);
+ list_add_tail(&work->node, &vhost->work_list);
work->queue_seq++;
- wake_up_process(dev->worker);
+ wake_up_process(vhost->worker);
}
- spin_unlock_irqrestore(&dev->work_lock, flags);
+ spin_unlock(&work->lock);
+ spin_unlock_irq(&vhost->lock);
}
void vhost_poll_queue(struct vhost_poll *poll)
@@ -188,17 +243,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
static int vhost_worker(void *data)
{
- struct vhost_dev *dev = data;
- struct vhost_work *work = NULL;
+ struct vhost *vhost = &__get_cpu_var(vhosts);
+ struct list_head *work_list;
+ struct mm_struct *prev_mm = NULL;
unsigned uninitialized_var(seq);
+ struct vhost_work *work = NULL;
- use_mm(dev->mm);
-
+ work_list = &vhost->work_list;
for (;;) {
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&vhost->lock);
if (work) {
work->done_seq = seq;
if (work->flushing)
@@ -206,18 +262,26 @@ static int vhost_worker(void *data)
}
if (kthread_should_stop()) {
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&vhost->lock);
__set_current_state(TASK_RUNNING);
break;
}
- if (!list_empty(&dev->work_list)) {
- work = list_first_entry(&dev->work_list,
+ if (!list_empty(work_list)) {
+ work = list_first_entry(work_list,
struct vhost_work, node);
+ spin_lock(&work->lock);
list_del_init(&work->node);
+ spin_unlock(&work->lock);
seq = work->queue_seq;
+ if (prev_mm != work->vq->dev->mm) {
+ if (prev_mm)
+ unuse_mm(prev_mm);
+ prev_mm = work->vq->dev->mm;
+ use_mm(prev_mm);
+ }
} else
work = NULL;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&vhost->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -226,7 +290,9 @@ static int vhost_worker(void *data)
schedule();
}
- unuse_mm(dev->mm);
+
+ if (prev_mm)
+ unuse_mm(prev_mm);
return 0;
}
@@ -298,9 +364,6 @@ long vhost_dev_init(struct vhost_dev *dev,
dev->log_file = NULL;
dev->memory = NULL;
dev->mm = NULL;
- spin_lock_init(&dev->work_lock);
- INIT_LIST_HEAD(&dev->work_list);
- dev->worker = NULL;
for (i = 0; i < dev->nvqs; ++i) {
dev->vqs[i].log = NULL;
@@ -312,7 +375,8 @@ long vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, dev->vqs + i);
if (dev->vqs[i].handle_kick)
vhost_poll_init(&dev->vqs[i].poll,
- dev->vqs[i].handle_kick, POLLIN, dev);
+ dev->vqs[i].handle_kick, POLLIN,
+ &dev->vqs[i]);
}
return 0;
@@ -325,71 +389,35 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
return dev->mm == current->mm ? 0 : -EPERM;
}
-struct vhost_attach_cgroups_struct {
- struct vhost_work work;
- struct task_struct *owner;
- int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
- struct vhost_attach_cgroups_struct *s;
-
- s = container_of(work, struct vhost_attach_cgroups_struct, work);
- s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
- struct vhost_attach_cgroups_struct attach;
-
- attach.owner = current;
- vhost_work_init(&attach.work, vhost_attach_cgroups_work);
- vhost_work_queue(dev, &attach.work);
- vhost_work_flush(dev, &attach.work);
- return attach.ret;
-}
-
/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
- struct task_struct *worker;
int err;
+ unsigned long txcpu, rxcpu;
/* Is there an owner already? */
if (dev->mm) {
err = -EBUSY;
- goto err_mm;
+ goto out;
}
- /* No owner, become one */
- dev->mm = get_task_mm(current);
- worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
- if (IS_ERR(worker)) {
- err = PTR_ERR(worker);
- goto err_worker;
- }
+ err = vhost_dev_alloc_iovecs(dev);
+ if (err)
+ goto out;
- dev->worker = worker;
- wake_up_process(worker); /* avoid contributing to loadavg */
+ /* initial txcpu, rxcpu on the same socket */
+ txcpu = sched_node_cpu(smp_processor_id());
+ rxcpu = sched_node_cpu(txcpu);
- err = vhost_attach_cgroups(dev);
- if (err)
- goto err_cgroup;
+ dev->vqs[VHOST_NET_VQ_TX].cpu = txcpu;
+ dev->vqs[VHOST_NET_VQ_RX].cpu = rxcpu;
- err = vhost_dev_alloc_iovecs(dev);
- if (err)
- goto err_cgroup;
+ /* No owner, become one */
+ dev->mm = get_task_mm(current);
return 0;
-err_cgroup:
- kthread_stop(worker);
- dev->worker = NULL;
-err_worker:
- if (dev->mm)
- mmput(dev->mm);
- dev->mm = NULL;
-err_mm:
+
+out:
return err;
}
@@ -474,11 +502,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
kfree(rcu_dereference_protected(dev->memory,
lockdep_is_held(&dev->mutex)));
RCU_INIT_POINTER(dev->memory, NULL);
- WARN_ON(!list_empty(&dev->work_list));
- if (dev->worker) {
- kthread_stop(dev->worker);
- dev->worker = NULL;
- }
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
@@ -1605,3 +1628,104 @@ void vhost_zerocopy_callback(void *arg)
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
}
+
+/* to do
+static int __cpuinit vhost_pool_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ struct vhost *vhost = per_cpu(vhosts, hcpu);
+
+ action &= ~CPU_TASKS_FROZEN;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (!create_vhost_task(vhosts, hcpu))
+ return notifier_from_errno(-ENOMEM);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ kthread_bind(vhost->worker, cpumask_any(cpu_online_mask));
+ destroy_vhost_task(vhost, hcpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ kthread_bind(vhost->worker, hcpu);
+ wake_up_process(vhost->worker);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ destroy_vhost_task(vhosts, hcpu);
+ take_over_work(vhosts, hcpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block vhost_pool_callback_nb __cpuinitdata = {
+ .notifier_call = vhost_pool_callback,
+ .priority = 0,
+}
+*/
+
+static void free_workers(void)
+{
+ unsigned long cpu;
+ struct vhost *vhost;
+
+ /* to do
+ * unregister_cpu_notifier(&vhost_pool_callback_nb);
+ */
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ vhost = &per_cpu(vhosts, cpu);
+ if (!IS_ERR(vhost->worker)) {
+ kthread_stop(vhost->worker);
+ BUG_ON(!list_empty(&vhost->work_list));
+ }
+ }
+ put_online_cpus();
+}
+
+int vhost_init(void)
+{
+ int ret = -ENOMEM;
+ unsigned long cpu;
+ struct vhost *vhost;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ vhost = &per_cpu(vhosts, cpu);
+
+ INIT_LIST_HEAD(&vhost->work_list);
+ spin_lock_init(&vhost->lock);
+ vhost->worker = kthread_create_on_node(vhost_worker, NULL,
+ cpu_to_node(cpu),
+ "vhost-%lu", cpu);
+ if (IS_ERR(vhost->worker))
+ goto err;
+
+ kthread_bind(vhost->worker, cpu);
+ wake_up_process(vhost->worker);
+ }
+ put_online_cpus();
+
+ /* to do
+ * register_cpu_notifier(&vhost_pool_callback_nb);
+ */
+ return 0;
+err:
+ free_workers();
+ return ret;
+}
+
+void vhost_cleanup(void)
+{
+ free_workers();
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a801e28..c6ecfb0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -18,6 +18,12 @@
#define VHOST_DMA_DONE_LEN 1
#define VHOST_DMA_CLEAR_LEN 0
+enum {
+ VHOST_NET_VQ_RX = 0,
+ VHOST_NET_VQ_TX = 1,
+ VHOST_NET_VQ_MAX = 2,
+};
+
struct vhost_device;
struct vhost_work;
@@ -30,6 +36,8 @@ struct vhost_work {
int flushing;
unsigned queue_seq;
unsigned done_seq;
+ struct vhost_virtqueue *vq;
+ spinlock_t lock;
};
/* Poll a file (eventfd or socket) */
@@ -44,7 +52,7 @@ struct vhost_poll {
};
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- unsigned long mask, struct vhost_dev *dev);
+ unsigned long mask, struct vhost_virtqueue *vq);
void vhost_poll_start(struct vhost_poll *poll, struct file *file);
void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
@@ -141,6 +149,7 @@ struct vhost_virtqueue {
/* Reference counting for outstanding ubufs.
* Protected by vq mutex. Writers must also take device mutex. */
struct vhost_ubuf_ref *ubufs;
+ unsigned long cpu;
};
struct vhost_dev {
@@ -155,9 +164,6 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
- spinlock_t work_lock;
- struct list_head work_list;
- struct task_struct *worker;
};
long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -190,6 +196,8 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
void vhost_zerocopy_callback(void *arg);
int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+int vhost_init(void);
+void vhost_cleanup(void);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma
@ 2012-03-23 0:16 ` Shirley Ma
2012-03-23 18:32 ` Thomas Lendacky
0 siblings, 1 reply; 8+ messages in thread
From: Shirley Ma @ 2012-03-23 0:16 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, tahm, kvm
Resubmit it with the right format.
Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Tested-by: Tom Lendacky <toml@us.ibm.com>
---
drivers/vhost/net.c | 26 ++-
drivers/vhost/vhost.c | 300 ++++++++++++++++++++++++----------
drivers/vhost/vhost.h | 16 ++-
3 files changed, 243 insertions(+), 103 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9dab1f5..4664e63 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256
-enum {
- VHOST_NET_VQ_RX = 0,
- VHOST_NET_VQ_TX = 1,
- VHOST_NET_VQ_MAX = 2,
-};
-
enum vhost_net_poll_state {
VHOST_NET_POLL_DISABLED = 0,
VHOST_NET_POLL_STARTED = 1,
@@ -510,8 +504,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
return r;
}
- vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
- vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT,
+ &n->vqs[VHOST_NET_VQ_TX]);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN,
+ &n->vqs[VHOST_NET_VQ_RX]);
n->tx_poll_state = VHOST_NET_POLL_DISABLED;
f->private_data = n;
@@ -863,15 +859,27 @@ static struct miscdevice vhost_net_misc = {
static int vhost_net_init(void)
{
+ int ret;
+
if (experimental_zcopytx)
vhost_enable_zcopy(VHOST_NET_VQ_TX);
- return misc_register(&vhost_net_misc);
+
+ ret = misc_register(&vhost_net_misc);
+ if (ret)
+ return ret;
+
+ ret = vhost_init();
+ if (ret)
+ misc_deregister(&vhost_net_misc);
+
+ return ret;
}
module_init(vhost_net_init);
static void vhost_net_exit(void)
{
misc_deregister(&vhost_net_misc);
+ vhost_cleanup();
}
module_exit(vhost_net_exit);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c14c42b..9fabc5a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -24,7 +24,7 @@
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/kthread.h>
-#include <linux/cgroup.h>
+#include <linux/cpu.h>
#include <linux/net.h>
#include <linux/if_packet.h>
@@ -42,6 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly;
#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
+/* per cpu vhost struct */
+struct vhost {
+ struct task_struct *worker;
+ spinlock_t lock;
+ struct list_head work_list;
+};
+
+static DEFINE_PER_CPU(struct vhost, vhosts);
+
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
poll_table *pt)
{
@@ -64,25 +73,28 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
return 0;
}
-static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
+static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn,
+ struct vhost_virtqueue *vq)
{
INIT_LIST_HEAD(&work->node);
work->fn = fn;
init_waitqueue_head(&work->done);
work->flushing = 0;
work->queue_seq = work->done_seq = 0;
+ work->vq = vq;
+ spin_lock_init(&work->lock);
}
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- unsigned long mask, struct vhost_dev *dev)
+ unsigned long mask, struct vhost_virtqueue *vq)
{
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
- poll->dev = dev;
+ poll->dev = vq->dev;
- vhost_work_init(&poll->work, fn);
+ vhost_work_init(&poll->work, fn, vq);
}
/* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -108,25 +120,30 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
{
int left;
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&work->lock);
left = seq - work->done_seq;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
return left <= 0;
}
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+/* only flushing this work? */
+static void vhost_work_flush(struct vhost_poll *poll)
{
unsigned seq;
int flushing;
+ struct vhost_dev *dev = poll->dev;
+ struct vhost_work *work = &poll->work;
- spin_lock_irq(&dev->work_lock);
+ if (list_empty(&work->node))
+ return;
+ spin_lock_irq(&work->lock);
seq = work->queue_seq;
work->flushing++;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
wait_event(work->done, vhost_work_seq_done(dev, work, seq));
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&work->lock);
flushing = --work->flushing;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&work->lock);
BUG_ON(flushing < 0);
}
@@ -134,21 +151,59 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
* locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
- vhost_work_flush(poll->dev, &poll->work);
+ vhost_work_flush(poll);
+}
+
+/* schedule the cpu on the same socket but different cpu with the given one */
+static unsigned long sched_node_cpu(unsigned long cpu)
+{
+ int node, ncpus_node;
+ unsigned long sched_cpu = cpu;
+
+ node = cpu_to_node(cpu);
+ ncpus_node = nr_cpus_node(node);
+ if (ncpus_node != 1) {
+ /* pick up a random cpu on the same node, exclude
+ * the input one
+ */
+ sched_cpu = node * ncpus_node + random32() % (ncpus_node - 1);
+ if (sched_cpu >= cpu)
+ ++sched_cpu;
+ /* todo hotplug cpu race */
+ if (!cpu_online(sched_cpu))
+ sched_cpu = cpu;
+ }
+ return sched_cpu;
}
static inline void vhost_work_queue(struct vhost_dev *dev,
struct vhost_work *work)
{
- unsigned long flags;
-
- spin_lock_irqsave(&dev->work_lock, flags);
+ unsigned long cpu = work->vq->cpu;
+ struct vhost *vhost;
+
+ /* Is it safe to disable vq notify here ? */
+ vhost_disable_notify(dev, work->vq);
+
+ /* schedule the work on the cpu socket as the work has been delivered
+ * but different with the cpu the work is delivered on
+ */
+ preempt_disable();
+ if (cpu_to_node(cpu) != cpu_to_node(smp_processor_id())) {
+ cpu = sched_node_cpu(smp_processor_id());
+ work->vq->cpu = cpu;
+ }
+ preempt_enable();
+ vhost = &per_cpu(vhosts, cpu);
+ spin_lock_irq(&vhost->lock);
+ spin_lock(&work->lock);
if (list_empty(&work->node)) {
- list_add_tail(&work->node, &dev->work_list);
+ list_add_tail(&work->node, &vhost->work_list);
work->queue_seq++;
- wake_up_process(dev->worker);
+ wake_up_process(vhost->worker);
}
- spin_unlock_irqrestore(&dev->work_lock, flags);
+ spin_unlock(&work->lock);
+ spin_unlock_irq(&vhost->lock);
}
void vhost_poll_queue(struct vhost_poll *poll)
@@ -188,17 +243,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
static int vhost_worker(void *data)
{
- struct vhost_dev *dev = data;
- struct vhost_work *work = NULL;
+ struct vhost *vhost = &__get_cpu_var(vhosts);
+ struct list_head *work_list;
+ struct mm_struct *prev_mm = NULL;
unsigned uninitialized_var(seq);
+ struct vhost_work *work = NULL;
- use_mm(dev->mm);
-
+ work_list = &vhost->work_list;
for (;;) {
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
- spin_lock_irq(&dev->work_lock);
+ spin_lock_irq(&vhost->lock);
if (work) {
work->done_seq = seq;
if (work->flushing)
@@ -206,18 +262,26 @@ static int vhost_worker(void *data)
}
if (kthread_should_stop()) {
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&vhost->lock);
__set_current_state(TASK_RUNNING);
break;
}
- if (!list_empty(&dev->work_list)) {
- work = list_first_entry(&dev->work_list,
+ if (!list_empty(work_list)) {
+ work = list_first_entry(work_list,
struct vhost_work, node);
+ spin_lock(&work->lock);
list_del_init(&work->node);
+ spin_unlock(&work->lock);
seq = work->queue_seq;
+ if (prev_mm != work->vq->dev->mm) {
+ if (prev_mm)
+ unuse_mm(prev_mm);
+ prev_mm = work->vq->dev->mm;
+ use_mm(prev_mm);
+ }
} else
work = NULL;
- spin_unlock_irq(&dev->work_lock);
+ spin_unlock_irq(&vhost->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -226,7 +290,9 @@ static int vhost_worker(void *data)
schedule();
}
- unuse_mm(dev->mm);
+
+ if (prev_mm)
+ unuse_mm(prev_mm);
return 0;
}
@@ -298,9 +364,6 @@ long vhost_dev_init(struct vhost_dev *dev,
dev->log_file = NULL;
dev->memory = NULL;
dev->mm = NULL;
- spin_lock_init(&dev->work_lock);
- INIT_LIST_HEAD(&dev->work_list);
- dev->worker = NULL;
for (i = 0; i < dev->nvqs; ++i) {
dev->vqs[i].log = NULL;
@@ -312,7 +375,8 @@ long vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, dev->vqs + i);
if (dev->vqs[i].handle_kick)
vhost_poll_init(&dev->vqs[i].poll,
- dev->vqs[i].handle_kick, POLLIN, dev);
+ dev->vqs[i].handle_kick, POLLIN,
+ &dev->vqs[i]);
}
return 0;
@@ -325,71 +389,35 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
return dev->mm == current->mm ? 0 : -EPERM;
}
-struct vhost_attach_cgroups_struct {
- struct vhost_work work;
- struct task_struct *owner;
- int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
- struct vhost_attach_cgroups_struct *s;
-
- s = container_of(work, struct vhost_attach_cgroups_struct, work);
- s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
- struct vhost_attach_cgroups_struct attach;
-
- attach.owner = current;
- vhost_work_init(&attach.work, vhost_attach_cgroups_work);
- vhost_work_queue(dev, &attach.work);
- vhost_work_flush(dev, &attach.work);
- return attach.ret;
-}
-
/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
- struct task_struct *worker;
int err;
+ unsigned long txcpu, rxcpu;
/* Is there an owner already? */
if (dev->mm) {
err = -EBUSY;
- goto err_mm;
+ goto out;
}
- /* No owner, become one */
- dev->mm = get_task_mm(current);
- worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
- if (IS_ERR(worker)) {
- err = PTR_ERR(worker);
- goto err_worker;
- }
+ err = vhost_dev_alloc_iovecs(dev);
+ if (err)
+ goto out;
- dev->worker = worker;
- wake_up_process(worker); /* avoid contributing to loadavg */
+ /* initial txcpu, rxcpu on the same socket */
+ txcpu = sched_node_cpu(smp_processor_id());
+ rxcpu = sched_node_cpu(txcpu);
- err = vhost_attach_cgroups(dev);
- if (err)
- goto err_cgroup;
+ dev->vqs[VHOST_NET_VQ_TX].cpu = txcpu;
+ dev->vqs[VHOST_NET_VQ_RX].cpu = rxcpu;
- err = vhost_dev_alloc_iovecs(dev);
- if (err)
- goto err_cgroup;
+ /* No owner, become one */
+ dev->mm = get_task_mm(current);
return 0;
-err_cgroup:
- kthread_stop(worker);
- dev->worker = NULL;
-err_worker:
- if (dev->mm)
- mmput(dev->mm);
- dev->mm = NULL;
-err_mm:
+
+out:
return err;
}
@@ -474,11 +502,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
kfree(rcu_dereference_protected(dev->memory,
lockdep_is_held(&dev->mutex)));
RCU_INIT_POINTER(dev->memory, NULL);
- WARN_ON(!list_empty(&dev->work_list));
- if (dev->worker) {
- kthread_stop(dev->worker);
- dev->worker = NULL;
- }
if (dev->mm)
mmput(dev->mm);
dev->mm = NULL;
@@ -1605,3 +1628,104 @@ void vhost_zerocopy_callback(void *arg)
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
}
+
+/* to do
+static int __cpuinit vhost_pool_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ struct vhost *vhost = per_cpu(vhosts, hcpu);
+
+ action &= ~CPU_TASKS_FROZEN;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (!create_vhost_task(vhosts, hcpu))
+ return notifier_from_errno(-ENOMEM);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ kthread_bind(vhost->worker, cpumask_any(cpu_online_mask));
+ destroy_vhost_task(vhost, hcpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ kthread_bind(vhost->worker, hcpu);
+ wake_up_process(vhost->worker);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ break;
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ destroy_vhost_task(vhosts, hcpu);
+ take_over_work(vhosts, hcpu);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block vhost_pool_callback_nb __cpuinitdata = {
+ .notifier_call = vhost_pool_callback,
+ .priority = 0,
+}
+*/
+
+static void free_workers(void)
+{
+ unsigned long cpu;
+ struct vhost *vhost;
+
+ /* to do
+ * unregister_cpu_notifier(&vhost_pool_callback_nb);
+ */
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ vhost = &per_cpu(vhosts, cpu);
+ if (!IS_ERR(vhost->worker)) {
+ kthread_stop(vhost->worker);
+ BUG_ON(!list_empty(&vhost->work_list));
+ }
+ }
+ put_online_cpus();
+}
+
+int vhost_init(void)
+{
+ int ret = -ENOMEM;
+ unsigned long cpu;
+ struct vhost *vhost;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ vhost = &per_cpu(vhosts, cpu);
+
+ INIT_LIST_HEAD(&vhost->work_list);
+ spin_lock_init(&vhost->lock);
+ vhost->worker = kthread_create_on_node(vhost_worker, NULL,
+ cpu_to_node(cpu),
+ "vhost-%lu", cpu);
+ if (IS_ERR(vhost->worker))
+ goto err;
+
+ kthread_bind(vhost->worker, cpu);
+ wake_up_process(vhost->worker);
+ }
+ put_online_cpus();
+
+ /* to do
+ * register_cpu_notifier(&vhost_pool_callback_nb);
+ */
+ return 0;
+err:
+ free_workers();
+ return ret;
+}
+
+void vhost_cleanup(void)
+{
+ free_workers();
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a801e28..c6ecfb0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -18,6 +18,12 @@
#define VHOST_DMA_DONE_LEN 1
#define VHOST_DMA_CLEAR_LEN 0
+enum {
+ VHOST_NET_VQ_RX = 0,
+ VHOST_NET_VQ_TX = 1,
+ VHOST_NET_VQ_MAX = 2,
+};
+
struct vhost_device;
struct vhost_work;
@@ -30,6 +36,8 @@ struct vhost_work {
int flushing;
unsigned queue_seq;
unsigned done_seq;
+ struct vhost_virtqueue *vq;
+ spinlock_t lock;
};
/* Poll a file (eventfd or socket) */
@@ -44,7 +52,7 @@ struct vhost_poll {
};
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- unsigned long mask, struct vhost_dev *dev);
+ unsigned long mask, struct vhost_virtqueue *vq);
void vhost_poll_start(struct vhost_poll *poll, struct file *file);
void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
@@ -141,6 +149,7 @@ struct vhost_virtqueue {
/* Reference counting for outstanding ubufs.
* Protected by vq mutex. Writers must also take device mutex. */
struct vhost_ubuf_ref *ubufs;
+ unsigned long cpu;
};
struct vhost_dev {
@@ -155,9 +164,6 @@ struct vhost_dev {
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
- spinlock_t work_lock;
- struct list_head work_list;
- struct task_struct *worker;
};
long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -190,6 +196,8 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
void vhost_zerocopy_callback(void *arg);
int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+int vhost_init(void);
+void vhost_cleanup(void);
#define vq_err(vq, fmt, ...) do { \
pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 0:16 ` Shirley Ma
@ 2012-03-23 18:32 ` Thomas Lendacky
2012-03-23 19:00 ` Rick Jones
2012-03-23 23:45 ` David Ahern
0 siblings, 2 replies; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-23 18:32 UTC (permalink / raw)
To: Shirley Ma; +Cc: Michael S. Tsirkin, netdev, kvm
I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
against the recent vhost patches. For simplicity, the patches
submitted by Anthony that increase the number of threads per vhost
instance I will call multi-worker and the patches submitted by Shirley
that provide a vhost thread per cpu I will call per-cpu.
Quick description of the tests:
TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
and 60 instances
TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
and 1 and 4 instances
Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
running between an external host and each VM.
Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
running between VM pairs on the same host (no TCP_MAERTS done in
this situation).
For TCP_RR and UDP_RR tests I report the transaction rate as the
score and the transaction rate / KVMhost CPU% as the efficiency.
For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
as the score and the throughput / KVMhost CPU% as the efficiency.
The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
10GbE single port network adapter.
There's a lot of data and I hope this is the clearest way to report
it. The remote host to VM results are first followed by the local
VM to VM results.
Remote Host to VM:
Host to 1 VM
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041
10 63,919 3,095 51,841 2,415 55,226 2,884
30 85,646 3,288 127,277 3,242 145,644 4,092
60 117,448 3,929 148,330 3,616 137,996 3,898
UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150
10 53,989 3,082 59,590 2,875 52,353 3,328
30 91,484 4,115 95,312 3,042 110,715 3,659
60 107,466 4,689 173,443 4,351 158,141 4,235
TCP_STREAM
256 1 2,724 140 2,450 131 2,681 150
4 5,027 137 4,147 146 3,998 117
1024 1 5,602 235 4,623 169 5,425 238
4 5,987 212 5,991 133 6,827 175
4096 1 6,202 256 6,753 211 7,247 279
4 4,996 192 5,771 159 7,124 202
16384 1 6,258 259 7,211 214 8,453 308
4 4,591 179 5,788 181 6,925 217
TCP_MAERTS
256 1 1,951 85 1,871 89 1,899 97
4 4,757 129 4,102 140 4,279 116
1024 1 7,479 381 6,970 371 7,374 427
4 8,931 385 6,612 258 8,731 417
4096 1 9,276 464 9,296 456 9,131 510
4 9,381 452 9,032 367 9,338 446
16384 1 9,153 496 8,817 589 9,238 516
4 9,358 478 9,006 367 9,350 462
Host to 1 VM (VM pinned to a socket)
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 9,992 1,019 9,899 917 8,963 899
10 60,731 3,236 60,015 2,444 55,860 3,059
30 127,375 4,042 146,571 3,922 163,806 4,389
60 173,021 4,972 149,549 4,662 161,397 4,330
UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206
10 68,128 3,804 64,335 4,067 53,343 3,233
30 92,456 3,994 112,101 4,219 111,610 3,598
60 135,741 4,590 184,441 4,422 184,527 4,546
TCP_STREAM
256 1 2,564 146 2,530 147 2,497 150
4 4,757 139 4,300 127 4,245 124
1024 1 4,700 209 6,062 323 5,627 247
4 6,828 214 7,125 153 6,561 172
4096 1 6,676 281 7,672 286 7,760 290
4 6,258 236 6,410 171 7,354 225
16384 1 6,712 289 8,217 297 8,457 322
4 5,764 235 6,285 200 7,554 245
TCP_MAERTS
256 1 1,673 82 1,444 71 1,756 88
4 6,385 175 5,671 155 5,685 153
1024 1 7,500 427 6,884 414 7,640 429
4 9,310 444 8,659 496 8,200 350
4096 1 8,427 477 9,201 515 8,825 422
4 9,372 478 9,184 394 9,391 446
16384 1 8,840 500 9,205 555 9,239 482
4 9,379 495 9,079 385 9,389 472
Host to 4 VMs
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 38,635 949 34,063 843 35,432 897
10 193,703 2,604 157,699 1,841 180,323 2,858
30 279,736 3,301 170,343 1,739 269,827 2,875
60 308,838 3,555 170,486 1,738 285,073 2,988
UDP_RR 1 42,209 1,136 36,035 904 36,974 975
10 177,286 2,616 166,999 2,043 178,470 2,466
30 296,415 3,731 221,738 2,488 260,630 2,966
60 353,784 4,179 209,489 2,152 306,792 3,440
TCP_STREAM
256 1 8,409 113 7,517 101 7,178 115
4 8,963 93 7,825 80 8,606 91
1024 1 9,382 119 10,223 192 9,314 128
4 9,233 101 9,085 110 8,585 105
4096 1 9,391 124 9,393 125 9,300 140
4 9,303 103 9,151 102 8,601 106
16384 1 9,395 121 8,715 128 9,378 135
4 9,322 105 9,135 101 8,691 121
TCP_MAERTS
256 1 8,629 125 7,045 112 7,559 109
4 9,389 145 7,091 80 9,335 156
1024 1 9,385 201 9,349 148 9,320 248
4 9,392 154 9,340 148 9,390 226
4096 1 9,387 239 9,339 151 9,379 291
4 9,392 167 9,389 124 9,390 259
16384 1 9,374 236 9,366 150 9,391 317
4 9,365 167 9,394 123 9,390 284
Host to 12 VMs
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 79,628 928 85,717 944 72,760 885
10 106,348 1,067 94,032 944 164,548 2,017
30 131,313 1,318 116,431 1,168 206,560 2,367
60 156,868 1,574 152,205 1,527 223,701 2,250
UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919
10 149,381 1,499 113,254 1,136 194,153 1,951
30 177,803 1,783 132,818 1,333 235,682 2,370
60 201,833 2,025 154,871 1,554 258,133 2,595
TCP_STREAM
256 1 8,549 86 7,173 72 8,407 85
4 8,910 89 8,693 87 8,768 88
1024 1 9,397 95 9,371 94 9,376 95
4 9,289 93 9,268 100 8,898 92
4096 1 9,399 95 9,415 95 9,401 97
4 9,336 94 9,319 94 8,938 94
16384 1 9,405 95 9,402 96 9,397 102
4 9,366 94 9,345 94 8,890 94
TCP_MAERTS
256 1 4,646 49 2,273 23 9,232 135
4 9,393 107 8,019 81 9,414 134
1024 1 9,393 115 9,403 104 9,399 178
4 9,406 110 9,383 98 9,392 157
4096 1 9,393 114 9,409 104 9,388 202
4 9,388 110 9,387 98 9,382 181
16384 1 9,396 114 9,391 104 9,394 221
4 9,411 110 9,384 98 9,391 192
Host to 24 VMs
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805
10 94,757 948 90,872 915 156,821 1,581
30 119,904 1,199 120,728 1,207 214,151 2,211
60 144,684 1,457 146,788 1,468 240,963 2,513
UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914
10 119,204 1,201 104,645 1,046 208,432 2,340
30 158,887 1,601 136,629 1,366 249,329 2,517
60 179,365 1,794 159,883 1,610 259,018 2,651
TCP_STREAM
256 1 5,899 59 4,258 44 8,071 82
4 8,739 89 8,195 83 7,934 82
1024 1 8,477 86 7,498 76 9,268 93
4 9,205 93 9,171 94 8,159 84
4096 1 9,334 96 8,992 92 9,324 97
4 9,255 95 9,221 92 8,237 85
16384 1 9,373 96 9,356 95 9,311 96
4 9,283 94 9,275 93 8,317 86
TCP_MAERTS
256 1 739 7 770 8 9,186 129
4 7,804 79 7,573 76 9,253 122
1024 1 1,763 18 1,759 18 9,287 146
4 9,204 99 9,166 93 9,389 155
4096 1 3,430 35 3,403 35 9,348 161
4 9,372 100 9,315 95 9,385 151
16384 1 9,309 102 9,306 97 9,353 175
4 9,378 100 9,392 96 9,377 159
Local VM to VM:
1 VM to 1 VM
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 7,422 506 7,698 462 6,281 450
10 49,662 1,362 47,553 1,205 43,258 1,270
30 91,657 1,538 99,319 1,471 89,478 1,499
60 106,168 1,658 106,430 1,503 99,205 1,576
UDP_RR 1 8,414 552 8,532 528 6,976 499
10 58,359 1,645 55,283 1,398 48,094 1,457
30 91,046 1,736 109,403 1,721 92,109 1,715
60 128,835 2,021 130,382 1,807 118,563 1,853
TCP_STREAM
256 1 2,029 60 1,923 54 1,998 64
4 3,861 66 3,445 53 2,914 54
1024 1 7,374 205 6,465 174 5,704 165
4 8,474 196 7,541 161 6,274 156
4096 1 12,825 295 11,921 275 10,262 262
4 12,639 253 13,395 260 11,451 264
16384 1 14,576 331 14,141 291 11,925 305
4 16,016 327 14,210 274 13,656 308
1 VM to 1 VM (each VM pinned to a socket)
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 7,145 489 7,840 477 5,965 467
10 51,016 1,406 47,881 1,223 45,232 1,288
30 92,785 1,580 103,453 1,512 91,437 1,523
60 120,160 1,817 115,058 1,595 102,734 1,611
UDP_RR 1 7,908 547 8,704 541 6,552 528
10 59,807 1,653 56,598 1,435 50,524 1,488
30 90,302 1,738 113,861 1,765 94,640 1,720
60 141,684 2,196 141,866 1,919 125,334 1,917
TCP_STREAM
256 1 2,210 64 1,291 32 2,069 64
4 3,993 64 3,441 52 2,780 50
1024 1 8,106 217 7,571 198 5,709 165
4 8,471 206 8,756 174 6,531 157
4096 1 15,360 350 13,825 303 10,717 271
4 14,671 330 12,604 263 11,266 258
16384 1 18,284 395 16,305 337 13,185 317
4 15,451 331 12,438 247 14,699 316
2 VMs to 2 VMs (4 VMs total)
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 15,498 491 16,518 460 13,008 441
10 71,425 983 79,711 1,063 85,087 1,037
30 102,132 1,436 82,191 1,145 100,504 1,076
60 127,670 1,608 96,815 1,262 104,694 1,119
UDP_RR 1 17,091 548 18,214 538 14,780 492
10 77,682 1,129 87,523 1,235 86,755 1,165
30 131,830 1,826 92,844 1,327 111,839 1,232
60 145,688 1,952 111,315 1,520 116,358 1,296
TCP_STREAM
256 1 5,085 72 3,900 50 2,430 38
4 6,622 70 4,337 48 5,032 58
1024 1 15,262 206 15,022 195 7,000 115
4 14,205 174 15,288 174 11,030 148
4096 1 15,020 197 21,694 261 13,583 198
4 16,818 205 16,076 195 17,175 238
16384 1 19,671 261 23,699 290 22,396 306
4 18,648 229 17,901 218 17,122 251
6 VMs to 6 VMs (12 VMs total)
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 30,242 400 32,281 390 27,737 401
10 73,461 783 61,856 644 93,259 1,000
30 98,638 1,034 81,799 844 107,022 1,121
60 114,238 1,200 91,772 944 110,839 1,152
UDP_RR 1 33,017 438 35,540 429 30,022 438
10 84,676 910 67,838 711 112,339 1,220
30 110,799 1,156 90,555 932 128,928 1,357
60 129,679 1,354 100,715 1,033 136,503 1,429
TCP_STREAM
256 1 6,947 72 5,380 56 6,138 72
4 8,400 85 7,660 77 8,893 89
1024 1 13,698 146 10,307 108 13,023 158
4 15,391 157 13,242 135 17,264 182
4096 1 18,928 202 14,580 154 16,970 189
4 18,826 191 17,262 175 19,558 212
16384 1 22,176 234 17,716 187 21,245 243
4 21,306 215 20,332 206 18,353 227
12 VMs to 12 VMs (24 VMs total)
- Base - -Multi-Worker- - Per-CPU -
Test Inst Score Eff Score Eff Score Eff
TCP_RR 1 72,926 731 67,338 675 32,662 387
10 62,441 625 59,277 594 87,286 891
30 72,761 728 67,760 679 102,549 1,041
60 78,087 782 74,654 748 100,687 1,016
UDP_RR 1 82,662 829 80,875 810 34,915 421
10 71,424 716 67,754 679 111,753 1,147
30 79,495 796 75,512 756 134,576 1,372
60 83,339 835 77,523 778 137,058 1,390
TCP_STREAM
256 1 2,870 29 2,631 26 7,907 80
4 8,424 84 8,026 80 8,929 90
1024 1 3,674 37 3,121 31 15,644 164
4 14,256 143 13,342 134 16,116 168
4096 1 5,068 51 4,366 44 16,179 168
4 17,015 171 16,321 164 17,940 186
16384 1 9,768 98 9,025 90 19,233 203
4 18,981 190 18,202 183 18,964 203
On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote:
> Resubmit it with the right format.
>
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> Tested-by: Tom Lendacky <toml@us.ibm.com>
> ---
>
> drivers/vhost/net.c | 26 ++-
> drivers/vhost/vhost.c | 300
> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h |
> 16 ++-
> 3 files changed, 243 insertions(+), 103 deletions(-)
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 18:32 ` Thomas Lendacky
@ 2012-03-23 19:00 ` Rick Jones
2012-03-23 21:10 ` Thomas Lendacky
2012-03-23 23:45 ` David Ahern
1 sibling, 1 reply; 8+ messages in thread
From: Rick Jones @ 2012-03-23 19:00 UTC (permalink / raw)
To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm
On 03/23/2012 11:32 AM, Thomas Lendacky wrote:
> I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
> against the recent vhost patches. For simplicity, the patches
> submitted by Anthony that increase the number of threads per vhost
> instance I will call multi-worker and the patches submitted by Shirley
> that provide a vhost thread per cpu I will call per-cpu.
Lots of nice data there - kudos.
> Quick description of the tests:
> TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> and 60 instances
There is a point, not quite sure where, when aggregate, synchronous
single-transaction netperf tests become as much a context switching test
as a networking test. That is why netperf RR has support for the "burst
mode" to have more than one transaction in flight at one time:
http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002dburst
When looking to measure packet/transaction per second scaling I've taken
to finding the peak for a single stream by running up the burst size,
(TCP_NODELAY set) and then running 1, 2, 4 etc of those streams. With
the occasional ethtool -S audit to make sure that each TCP_RR
transaction is indeed a discrete pair of TCP segments...
In addition to avoiding concerns about becoming a context switching
exercise, the reduction in netperf instances means less chance for skew
error on startup and shutdown. To address that I've somewhat recently
taken to using demo mode in netperf and then post-processing the results
through rrdtool:
http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002ddemo
I have a "one to many" script for that under:
http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh
which is then post-processed via some stone knives and bearskins:
http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh
http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk
http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk
I've also used that basic idea in some many to many tests involving 512
concurrent netperf instances but that script isn't up on netperf.org.
> TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> and 1 and 4 instances
Netperf's own documentation and output is probably not good on this
point (feel free to loose petards, though some instances may be cast in
stone) but those aren't really message sizes. They are simply the
quantity of data netperf is presenting to the transport in any one send
call. They are send sizes.
> Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> running between an external host and each VM.
I suppose it is implicit, and I'm just being pedantic/paranoid but you
are confident of the limits of the external host?
> Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> running between VM pairs on the same host (no TCP_MAERTS done in
> this situation).
>
> For TCP_RR and UDP_RR tests I report the transaction rate as the
> score and the transaction rate / KVMhost CPU% as the efficiency.
>
> For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> as the score and the throughput / KVMhost CPU% as the efficiency.
>
> The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
> system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
> 10GbE single port network adapter.
>
> There's a lot of data and I hope this is the clearest way to report
> it. The remote host to VM results are first followed by the local
> VM to VM results.
Looks reasonable as far as presentation goes. Might have included a
summary table of the various peaks:
TCP_RR Remote Host to VM:
Inst - Base - -Multi-Worker- - Per-CPU -
VMs /VM Score Eff Score Eff Score Eff
1 60 117,448 3,929 148,330 3,616 137,996 3,898
4 60 308,838 3,555 170,486 1,738 285,073 2,988
12 60 156,868 1,574 152,205 1,527 223,701 2,250
24 60 144,684 1,457 146,788 1,468 240,963 2,513
Given the KVM host machine is 8 cores with hyperthreading disabled, I
might have included a data point at 8 VMs even if they were 2 vCPU VMs,
but that is just my gut talking. Certainly looking at the summary table
I'm wondering where between 4 and 12 VMs the curve starts its downward
trend. Does 12 and 24, 2vCPU VMs force moving around more than say 16
or 32 would?
happy benchmarking,
rick jones
>
>
> Remote Host to VM:
> Host to 1 VM
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041
> 10 63,919 3,095 51,841 2,415 55,226 2,884
> 30 85,646 3,288 127,277 3,242 145,644 4,092
> 60 117,448 3,929 148,330 3,616 137,996 3,898
>
> UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150
> 10 53,989 3,082 59,590 2,875 52,353 3,328
> 30 91,484 4,115 95,312 3,042 110,715 3,659
> 60 107,466 4,689 173,443 4,351 158,141 4,235
>
> TCP_STREAM
> 256 1 2,724 140 2,450 131 2,681 150
> 4 5,027 137 4,147 146 3,998 117
>
> 1024 1 5,602 235 4,623 169 5,425 238
> 4 5,987 212 5,991 133 6,827 175
>
> 4096 1 6,202 256 6,753 211 7,247 279
> 4 4,996 192 5,771 159 7,124 202
>
> 16384 1 6,258 259 7,211 214 8,453 308
> 4 4,591 179 5,788 181 6,925 217
>
> TCP_MAERTS
> 256 1 1,951 85 1,871 89 1,899 97
> 4 4,757 129 4,102 140 4,279 116
>
> 1024 1 7,479 381 6,970 371 7,374 427
> 4 8,931 385 6,612 258 8,731 417
>
> 4096 1 9,276 464 9,296 456 9,131 510
> 4 9,381 452 9,032 367 9,338 446
>
> 16384 1 9,153 496 8,817 589 9,238 516
> 4 9,358 478 9,006 367 9,350 462
>
> Host to 1 VM (VM pinned to a socket)
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 9,992 1,019 9,899 917 8,963 899
> 10 60,731 3,236 60,015 2,444 55,860 3,059
> 30 127,375 4,042 146,571 3,922 163,806 4,389
> 60 173,021 4,972 149,549 4,662 161,397 4,330
>
> UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206
> 10 68,128 3,804 64,335 4,067 53,343 3,233
> 30 92,456 3,994 112,101 4,219 111,610 3,598
> 60 135,741 4,590 184,441 4,422 184,527 4,546
>
> TCP_STREAM
> 256 1 2,564 146 2,530 147 2,497 150
> 4 4,757 139 4,300 127 4,245 124
>
> 1024 1 4,700 209 6,062 323 5,627 247
> 4 6,828 214 7,125 153 6,561 172
>
> 4096 1 6,676 281 7,672 286 7,760 290
> 4 6,258 236 6,410 171 7,354 225
>
> 16384 1 6,712 289 8,217 297 8,457 322
> 4 5,764 235 6,285 200 7,554 245
>
> TCP_MAERTS
> 256 1 1,673 82 1,444 71 1,756 88
> 4 6,385 175 5,671 155 5,685 153
>
> 1024 1 7,500 427 6,884 414 7,640 429
> 4 9,310 444 8,659 496 8,200 350
>
> 4096 1 8,427 477 9,201 515 8,825 422
> 4 9,372 478 9,184 394 9,391 446
>
> 16384 1 8,840 500 9,205 555 9,239 482
> 4 9,379 495 9,079 385 9,389 472
>
> Host to 4 VMs
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 38,635 949 34,063 843 35,432 897
> 10 193,703 2,604 157,699 1,841 180,323 2,858
> 30 279,736 3,301 170,343 1,739 269,827 2,875
> 60 308,838 3,555 170,486 1,738 285,073 2,988
>
> UDP_RR 1 42,209 1,136 36,035 904 36,974 975
> 10 177,286 2,616 166,999 2,043 178,470 2,466
> 30 296,415 3,731 221,738 2,488 260,630 2,966
> 60 353,784 4,179 209,489 2,152 306,792 3,440
>
> TCP_STREAM
> 256 1 8,409 113 7,517 101 7,178 115
> 4 8,963 93 7,825 80 8,606 91
>
> 1024 1 9,382 119 10,223 192 9,314 128
> 4 9,233 101 9,085 110 8,585 105
>
> 4096 1 9,391 124 9,393 125 9,300 140
> 4 9,303 103 9,151 102 8,601 106
>
> 16384 1 9,395 121 8,715 128 9,378 135
> 4 9,322 105 9,135 101 8,691 121
>
> TCP_MAERTS
> 256 1 8,629 125 7,045 112 7,559 109
> 4 9,389 145 7,091 80 9,335 156
>
> 1024 1 9,385 201 9,349 148 9,320 248
> 4 9,392 154 9,340 148 9,390 226
>
> 4096 1 9,387 239 9,339 151 9,379 291
> 4 9,392 167 9,389 124 9,390 259
>
> 16384 1 9,374 236 9,366 150 9,391 317
> 4 9,365 167 9,394 123 9,390 284
>
> Host to 12 VMs
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 79,628 928 85,717 944 72,760 885
> 10 106,348 1,067 94,032 944 164,548 2,017
> 30 131,313 1,318 116,431 1,168 206,560 2,367
> 60 156,868 1,574 152,205 1,527 223,701 2,250
>
> UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919
> 10 149,381 1,499 113,254 1,136 194,153 1,951
> 30 177,803 1,783 132,818 1,333 235,682 2,370
> 60 201,833 2,025 154,871 1,554 258,133 2,595
>
> TCP_STREAM
> 256 1 8,549 86 7,173 72 8,407 85
> 4 8,910 89 8,693 87 8,768 88
>
> 1024 1 9,397 95 9,371 94 9,376 95
> 4 9,289 93 9,268 100 8,898 92
>
> 4096 1 9,399 95 9,415 95 9,401 97
> 4 9,336 94 9,319 94 8,938 94
>
> 16384 1 9,405 95 9,402 96 9,397 102
> 4 9,366 94 9,345 94 8,890 94
>
> TCP_MAERTS
> 256 1 4,646 49 2,273 23 9,232 135
> 4 9,393 107 8,019 81 9,414 134
>
> 1024 1 9,393 115 9,403 104 9,399 178
> 4 9,406 110 9,383 98 9,392 157
>
> 4096 1 9,393 114 9,409 104 9,388 202
> 4 9,388 110 9,387 98 9,382 181
>
> 16384 1 9,396 114 9,391 104 9,394 221
> 4 9,411 110 9,384 98 9,391 192
>
> Host to 24 VMs
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805
> 10 94,757 948 90,872 915 156,821 1,581
> 30 119,904 1,199 120,728 1,207 214,151 2,211
> 60 144,684 1,457 146,788 1,468 240,963 2,513
>
> UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914
> 10 119,204 1,201 104,645 1,046 208,432 2,340
> 30 158,887 1,601 136,629 1,366 249,329 2,517
> 60 179,365 1,794 159,883 1,610 259,018 2,651
>
> TCP_STREAM
> 256 1 5,899 59 4,258 44 8,071 82
> 4 8,739 89 8,195 83 7,934 82
>
> 1024 1 8,477 86 7,498 76 9,268 93
> 4 9,205 93 9,171 94 8,159 84
>
> 4096 1 9,334 96 8,992 92 9,324 97
> 4 9,255 95 9,221 92 8,237 85
>
> 16384 1 9,373 96 9,356 95 9,311 96
> 4 9,283 94 9,275 93 8,317 86
>
> TCP_MAERTS
> 256 1 739 7 770 8 9,186 129
> 4 7,804 79 7,573 76 9,253 122
>
> 1024 1 1,763 18 1,759 18 9,287 146
> 4 9,204 99 9,166 93 9,389 155
>
> 4096 1 3,430 35 3,403 35 9,348 161
> 4 9,372 100 9,315 95 9,385 151
>
> 16384 1 9,309 102 9,306 97 9,353 175
> 4 9,378 100 9,392 96 9,377 159
>
>
>
> Local VM to VM:
>
> 1 VM to 1 VM
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 7,422 506 7,698 462 6,281 450
> 10 49,662 1,362 47,553 1,205 43,258 1,270
> 30 91,657 1,538 99,319 1,471 89,478 1,499
> 60 106,168 1,658 106,430 1,503 99,205 1,576
>
> UDP_RR 1 8,414 552 8,532 528 6,976 499
> 10 58,359 1,645 55,283 1,398 48,094 1,457
> 30 91,046 1,736 109,403 1,721 92,109 1,715
> 60 128,835 2,021 130,382 1,807 118,563 1,853
>
> TCP_STREAM
> 256 1 2,029 60 1,923 54 1,998 64
> 4 3,861 66 3,445 53 2,914 54
>
> 1024 1 7,374 205 6,465 174 5,704 165
> 4 8,474 196 7,541 161 6,274 156
>
> 4096 1 12,825 295 11,921 275 10,262 262
> 4 12,639 253 13,395 260 11,451 264
>
> 16384 1 14,576 331 14,141 291 11,925 305
> 4 16,016 327 14,210 274 13,656 308
>
>
> 1 VM to 1 VM (each VM pinned to a socket)
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 7,145 489 7,840 477 5,965 467
> 10 51,016 1,406 47,881 1,223 45,232 1,288
> 30 92,785 1,580 103,453 1,512 91,437 1,523
> 60 120,160 1,817 115,058 1,595 102,734 1,611
>
> UDP_RR 1 7,908 547 8,704 541 6,552 528
> 10 59,807 1,653 56,598 1,435 50,524 1,488
> 30 90,302 1,738 113,861 1,765 94,640 1,720
> 60 141,684 2,196 141,866 1,919 125,334 1,917
>
> TCP_STREAM
> 256 1 2,210 64 1,291 32 2,069 64
> 4 3,993 64 3,441 52 2,780 50
>
> 1024 1 8,106 217 7,571 198 5,709 165
> 4 8,471 206 8,756 174 6,531 157
>
> 4096 1 15,360 350 13,825 303 10,717 271
> 4 14,671 330 12,604 263 11,266 258
>
> 16384 1 18,284 395 16,305 337 13,185 317
> 4 15,451 331 12,438 247 14,699 316
>
>
> 2 VMs to 2 VMs (4 VMs total)
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 15,498 491 16,518 460 13,008 441
> 10 71,425 983 79,711 1,063 85,087 1,037
> 30 102,132 1,436 82,191 1,145 100,504 1,076
> 60 127,670 1,608 96,815 1,262 104,694 1,119
>
> UDP_RR 1 17,091 548 18,214 538 14,780 492
> 10 77,682 1,129 87,523 1,235 86,755 1,165
> 30 131,830 1,826 92,844 1,327 111,839 1,232
> 60 145,688 1,952 111,315 1,520 116,358 1,296
>
> TCP_STREAM
> 256 1 5,085 72 3,900 50 2,430 38
> 4 6,622 70 4,337 48 5,032 58
>
> 1024 1 15,262 206 15,022 195 7,000 115
> 4 14,205 174 15,288 174 11,030 148
>
> 4096 1 15,020 197 21,694 261 13,583 198
> 4 16,818 205 16,076 195 17,175 238
>
> 16384 1 19,671 261 23,699 290 22,396 306
> 4 18,648 229 17,901 218 17,122 251
>
> 6 VMs to 6 VMs (12 VMs total)
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 30,242 400 32,281 390 27,737 401
> 10 73,461 783 61,856 644 93,259 1,000
> 30 98,638 1,034 81,799 844 107,022 1,121
> 60 114,238 1,200 91,772 944 110,839 1,152
>
> UDP_RR 1 33,017 438 35,540 429 30,022 438
> 10 84,676 910 67,838 711 112,339 1,220
> 30 110,799 1,156 90,555 932 128,928 1,357
> 60 129,679 1,354 100,715 1,033 136,503 1,429
>
> TCP_STREAM
> 256 1 6,947 72 5,380 56 6,138 72
> 4 8,400 85 7,660 77 8,893 89
>
> 1024 1 13,698 146 10,307 108 13,023 158
> 4 15,391 157 13,242 135 17,264 182
>
> 4096 1 18,928 202 14,580 154 16,970 189
> 4 18,826 191 17,262 175 19,558 212
>
> 16384 1 22,176 234 17,716 187 21,245 243
> 4 21,306 215 20,332 206 18,353 227
>
> 12 VMs to 12 VMs (24 VMs total)
> - Base - -Multi-Worker- - Per-CPU -
> Test Inst Score Eff Score Eff Score Eff
> TCP_RR 1 72,926 731 67,338 675 32,662 387
> 10 62,441 625 59,277 594 87,286 891
> 30 72,761 728 67,760 679 102,549 1,041
> 60 78,087 782 74,654 748 100,687 1,016
>
> UDP_RR 1 82,662 829 80,875 810 34,915 421
> 10 71,424 716 67,754 679 111,753 1,147
> 30 79,495 796 75,512 756 134,576 1,372
> 60 83,339 835 77,523 778 137,058 1,390
>
> TCP_STREAM
> 256 1 2,870 29 2,631 26 7,907 80
> 4 8,424 84 8,026 80 8,929 90
>
> 1024 1 3,674 37 3,121 31 15,644 164
> 4 14,256 143 13,342 134 16,116 168
>
> 4096 1 5,068 51 4,366 44 16,179 168
> 4 17,015 171 16,321 164 17,940 186
>
> 16384 1 9,768 98 9,025 90 19,233 203
> 4 18,981 190 18,202 183 18,964 203
>
>
> On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote:
>> Resubmit it with the right format.
>>
>> Signed-off-by: Shirley Ma<xma@us.ibm.com>
>> Signed-off-by: Krishna Kumar<krkumar2@in.ibm.com>
>> Tested-by: Tom Lendacky<toml@us.ibm.com>
>> ---
>>
>> drivers/vhost/net.c | 26 ++-
>> drivers/vhost/vhost.c | 300
>> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h |
>> 16 ++-
>> 3 files changed, 243 insertions(+), 103 deletions(-)
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 19:00 ` Rick Jones
@ 2012-03-23 21:10 ` Thomas Lendacky
2012-03-23 21:21 ` Rick Jones
0 siblings, 1 reply; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-23 21:10 UTC (permalink / raw)
To: Rick Jones; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm
On Friday, March 23, 2012 12:00:54 PM Rick Jones wrote:
> On 03/23/2012 11:32 AM, Thomas Lendacky wrote:
> > I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
> > against the recent vhost patches. For simplicity, the patches
> > submitted by Anthony that increase the number of threads per vhost
> > instance I will call multi-worker and the patches submitted by Shirley
> > that provide a vhost thread per cpu I will call per-cpu.
>
> Lots of nice data there - kudos.
>
> > Quick description of the tests:
> > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> > and 60 instances
>
> There is a point, not quite sure where, when aggregate, synchronous
> single-transaction netperf tests become as much a context switching test
> as a networking test. That is why netperf RR has support for the "burst
> mode" to have more than one transaction in flight at one time:
>
> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002de
> nable_002dburst
>
> When looking to measure packet/transaction per second scaling I've taken
> to finding the peak for a single stream by running up the burst size,
> (TCP_NODELAY set) and then running 1, 2, 4 etc of those streams. With
> the occasional ethtool -S audit to make sure that each TCP_RR
> transaction is indeed a discrete pair of TCP segments...
>
> In addition to avoiding concerns about becoming a context switching
> exercise, the reduction in netperf instances means less chance for skew
> error on startup and shutdown. To address that I've somewhat recently
> taken to using demo mode in netperf and then post-processing the results
> through rrdtool:
>
> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002de
> nable_002ddemo
>
> I have a "one to many" script for that under:
>
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh
>
> which is then post-processed via some stone knives and bearskins:
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk
>
> I've also used that basic idea in some many to many tests involving 512
> concurrent netperf instances but that script isn't up on netperf.org.
>
> > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> > and 1 and 4 instances
>
> Netperf's own documentation and output is probably not good on this
> point (feel free to loose petards, though some instances may be cast in
> stone) but those aren't really message sizes. They are simply the
> quantity of data netperf is presenting to the transport in any one send
> call. They are send sizes.
>
> > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> > running between an external host and each VM.
>
> I suppose it is implicit, and I'm just being pedantic/paranoid but you
> are confident of the limits of the external host?
Yes I am. It's pretty much an identical system to the KVM host and has
demonstrated much greater performance when running bare-metal scenarios.
Plenty of CPU left on all cores, etc.
>
> > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> > running between VM pairs on the same host (no TCP_MAERTS done in
> > this situation).
> >
> > For TCP_RR and UDP_RR tests I report the transaction rate as the
> > score and the transaction rate / KVMhost CPU% as the efficiency.
> >
> > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> > as the score and the throughput / KVMhost CPU% as the efficiency.
> >
> > The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
> > system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
> > 10GbE single port network adapter.
> >
> > There's a lot of data and I hope this is the clearest way to report
> > it. The remote host to VM results are first followed by the local
> > VM to VM results.
>
> Looks reasonable as far as presentation goes. Might have included a
> summary table of the various peaks:
>
> TCP_RR Remote Host to VM:
> Inst - Base - -Multi-Worker- - Per-CPU -
> VMs /VM Score Eff Score Eff Score Eff
> 1 60 117,448 3,929 148,330 3,616 137,996 3,898
> 4 60 308,838 3,555 170,486 1,738 285,073 2,988
> 12 60 156,868 1,574 152,205 1,527 223,701 2,250
> 24 60 144,684 1,457 146,788 1,468 240,963 2,513
>
That's a good suggestion.
I also have geometric mean comparisons to the baseline (with greater
than 100% indicating an improvement and less than 100% indicating
regression).
Remote:
-Multi-Worker- - Per-CPU -
VMs Score Eff Score Eff
1 105% 91% 109% 103%
1 (pinned) 102% 94% 103% 95%
4 84% 76% 95% 103%
12 91% 88% 113% 129%
24 95% 94% 135% 149%
Overall 95% 88% 110% 114%
Local:
-Multi-Worker- - Per-CPU -
VMs Score Eff Score Eff
1 98% 90% 86% 93%
1 (pinned) 94% 85% 82% 87%
4 94% 91% 86% 86%
12 85% 84% 103% 109%
24 93% 93% 141% 148%
Overall 93% 89% 97% 102%
Combined: 94% 88% 104% 108%
> Given the KVM host machine is 8 cores with hyperthreading disabled, I
> might have included a data point at 8 VMs even if they were 2 vCPU VMs,
> but that is just my gut talking. Certainly looking at the summary table
> I'm wondering where between 4 and 12 VMs the curve starts its downward
> trend. Does 12 and 24, 2vCPU VMs force moving around more than say 16
> or 32 would?
Yeah, it becomes a question of time. I run each test 3 times and
average the results, so to run the full suite takes a long time.
Thanks,
Tom
>
> happy benchmarking,
>
> rick jones
>
> > Remote Host to VM:
> > Host to 1 VM
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041
> >
> > 10 63,919 3,095 51,841 2,415 55,226 2,884
> > 30 85,646 3,288 127,277 3,242 145,644 4,092
> > 60 117,448 3,929 148,330 3,616 137,996 3,898
> >
> > UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150
> >
> > 10 53,989 3,082 59,590 2,875 52,353 3,328
> > 30 91,484 4,115 95,312 3,042 110,715 3,659
> > 60 107,466 4,689 173,443 4,351 158,141 4,235
> >
> > TCP_STREAM
> >
> > 256 1 2,724 140 2,450 131 2,681 150
> >
> > 4 5,027 137 4,147 146 3,998 117
> >
> > 1024 1 5,602 235 4,623 169 5,425 238
> >
> > 4 5,987 212 5,991 133 6,827 175
> >
> > 4096 1 6,202 256 6,753 211 7,247 279
> >
> > 4 4,996 192 5,771 159 7,124 202
> >
> > 16384 1 6,258 259 7,211 214 8,453 308
> >
> > 4 4,591 179 5,788 181 6,925 217
> >
> > TCP_MAERTS
> >
> > 256 1 1,951 85 1,871 89 1,899 97
> >
> > 4 4,757 129 4,102 140 4,279 116
> >
> > 1024 1 7,479 381 6,970 371 7,374 427
> >
> > 4 8,931 385 6,612 258 8,731 417
> >
> > 4096 1 9,276 464 9,296 456 9,131 510
> >
> > 4 9,381 452 9,032 367 9,338 446
> >
> > 16384 1 9,153 496 8,817 589 9,238 516
> >
> > 4 9,358 478 9,006 367 9,350 462
> >
> > Host to 1 VM (VM pinned to a socket)
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 9,992 1,019 9,899 917 8,963 899
> >
> > 10 60,731 3,236 60,015 2,444 55,860 3,059
> > 30 127,375 4,042 146,571 3,922 163,806 4,389
> > 60 173,021 4,972 149,549 4,662 161,397 4,330
> >
> > UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206
> >
> > 10 68,128 3,804 64,335 4,067 53,343 3,233
> > 30 92,456 3,994 112,101 4,219 111,610 3,598
> > 60 135,741 4,590 184,441 4,422 184,527 4,546
> >
> > TCP_STREAM
> >
> > 256 1 2,564 146 2,530 147 2,497 150
> >
> > 4 4,757 139 4,300 127 4,245 124
> >
> > 1024 1 4,700 209 6,062 323 5,627 247
> >
> > 4 6,828 214 7,125 153 6,561 172
> >
> > 4096 1 6,676 281 7,672 286 7,760 290
> >
> > 4 6,258 236 6,410 171 7,354 225
> >
> > 16384 1 6,712 289 8,217 297 8,457 322
> >
> > 4 5,764 235 6,285 200 7,554 245
> >
> > TCP_MAERTS
> >
> > 256 1 1,673 82 1,444 71 1,756 88
> >
> > 4 6,385 175 5,671 155 5,685 153
> >
> > 1024 1 7,500 427 6,884 414 7,640 429
> >
> > 4 9,310 444 8,659 496 8,200 350
> >
> > 4096 1 8,427 477 9,201 515 8,825 422
> >
> > 4 9,372 478 9,184 394 9,391 446
> >
> > 16384 1 8,840 500 9,205 555 9,239 482
> >
> > 4 9,379 495 9,079 385 9,389 472
> >
> > Host to 4 VMs
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 38,635 949 34,063 843 35,432 897
> >
> > 10 193,703 2,604 157,699 1,841 180,323 2,858
> > 30 279,736 3,301 170,343 1,739 269,827 2,875
> > 60 308,838 3,555 170,486 1,738 285,073 2,988
> >
> > UDP_RR 1 42,209 1,136 36,035 904 36,974 975
> >
> > 10 177,286 2,616 166,999 2,043 178,470 2,466
> > 30 296,415 3,731 221,738 2,488 260,630 2,966
> > 60 353,784 4,179 209,489 2,152 306,792 3,440
> >
> > TCP_STREAM
> >
> > 256 1 8,409 113 7,517 101 7,178 115
> >
> > 4 8,963 93 7,825 80 8,606 91
> >
> > 1024 1 9,382 119 10,223 192 9,314 128
> >
> > 4 9,233 101 9,085 110 8,585 105
> >
> > 4096 1 9,391 124 9,393 125 9,300 140
> >
> > 4 9,303 103 9,151 102 8,601 106
> >
> > 16384 1 9,395 121 8,715 128 9,378 135
> >
> > 4 9,322 105 9,135 101 8,691 121
> >
> > TCP_MAERTS
> >
> > 256 1 8,629 125 7,045 112 7,559 109
> >
> > 4 9,389 145 7,091 80 9,335 156
> >
> > 1024 1 9,385 201 9,349 148 9,320 248
> >
> > 4 9,392 154 9,340 148 9,390 226
> >
> > 4096 1 9,387 239 9,339 151 9,379 291
> >
> > 4 9,392 167 9,389 124 9,390 259
> >
> > 16384 1 9,374 236 9,366 150 9,391 317
> >
> > 4 9,365 167 9,394 123 9,390 284
> >
> > Host to 12 VMs
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 79,628 928 85,717 944 72,760 885
> >
> > 10 106,348 1,067 94,032 944 164,548 2,017
> > 30 131,313 1,318 116,431 1,168 206,560 2,367
> > 60 156,868 1,574 152,205 1,527 223,701 2,250
> >
> > UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919
> >
> > 10 149,381 1,499 113,254 1,136 194,153 1,951
> > 30 177,803 1,783 132,818 1,333 235,682 2,370
> > 60 201,833 2,025 154,871 1,554 258,133 2,595
> >
> > TCP_STREAM
> >
> > 256 1 8,549 86 7,173 72 8,407 85
> >
> > 4 8,910 89 8,693 87 8,768 88
> >
> > 1024 1 9,397 95 9,371 94 9,376 95
> >
> > 4 9,289 93 9,268 100 8,898 92
> >
> > 4096 1 9,399 95 9,415 95 9,401 97
> >
> > 4 9,336 94 9,319 94 8,938 94
> >
> > 16384 1 9,405 95 9,402 96 9,397 102
> >
> > 4 9,366 94 9,345 94 8,890 94
> >
> > TCP_MAERTS
> >
> > 256 1 4,646 49 2,273 23 9,232 135
> >
> > 4 9,393 107 8,019 81 9,414 134
> >
> > 1024 1 9,393 115 9,403 104 9,399 178
> >
> > 4 9,406 110 9,383 98 9,392 157
> >
> > 4096 1 9,393 114 9,409 104 9,388 202
> >
> > 4 9,388 110 9,387 98 9,382 181
> >
> > 16384 1 9,396 114 9,391 104 9,394 221
> >
> > 4 9,411 110 9,384 98 9,391 192
> >
> > Host to 24 VMs
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805
> >
> > 10 94,757 948 90,872 915 156,821 1,581
> > 30 119,904 1,199 120,728 1,207 214,151 2,211
> > 60 144,684 1,457 146,788 1,468 240,963 2,513
> >
> > UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914
> >
> > 10 119,204 1,201 104,645 1,046 208,432 2,340
> > 30 158,887 1,601 136,629 1,366 249,329 2,517
> > 60 179,365 1,794 159,883 1,610 259,018 2,651
> >
> > TCP_STREAM
> >
> > 256 1 5,899 59 4,258 44 8,071 82
> >
> > 4 8,739 89 8,195 83 7,934 82
> >
> > 1024 1 8,477 86 7,498 76 9,268 93
> >
> > 4 9,205 93 9,171 94 8,159 84
> >
> > 4096 1 9,334 96 8,992 92 9,324 97
> >
> > 4 9,255 95 9,221 92 8,237 85
> >
> > 16384 1 9,373 96 9,356 95 9,311 96
> >
> > 4 9,283 94 9,275 93 8,317 86
> >
> > TCP_MAERTS
> >
> > 256 1 739 7 770 8 9,186 129
> >
> > 4 7,804 79 7,573 76 9,253 122
> >
> > 1024 1 1,763 18 1,759 18 9,287 146
> >
> > 4 9,204 99 9,166 93 9,389 155
> >
> > 4096 1 3,430 35 3,403 35 9,348 161
> >
> > 4 9,372 100 9,315 95 9,385 151
> >
> > 16384 1 9,309 102 9,306 97 9,353 175
> >
> > 4 9,378 100 9,392 96 9,377 159
> >
> > Local VM to VM:
> > 1 VM to 1 VM
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 7,422 506 7,698 462 6,281 450
> >
> > 10 49,662 1,362 47,553 1,205 43,258 1,270
> > 30 91,657 1,538 99,319 1,471 89,478 1,499
> > 60 106,168 1,658 106,430 1,503 99,205 1,576
> >
> > UDP_RR 1 8,414 552 8,532 528 6,976 499
> >
> > 10 58,359 1,645 55,283 1,398 48,094 1,457
> > 30 91,046 1,736 109,403 1,721 92,109 1,715
> > 60 128,835 2,021 130,382 1,807 118,563 1,853
> >
> > TCP_STREAM
> >
> > 256 1 2,029 60 1,923 54 1,998 64
> >
> > 4 3,861 66 3,445 53 2,914 54
> >
> > 1024 1 7,374 205 6,465 174 5,704 165
> >
> > 4 8,474 196 7,541 161 6,274 156
> >
> > 4096 1 12,825 295 11,921 275 10,262 262
> >
> > 4 12,639 253 13,395 260 11,451 264
> >
> > 16384 1 14,576 331 14,141 291 11,925 305
> >
> > 4 16,016 327 14,210 274 13,656 308
> >
> > 1 VM to 1 VM (each VM pinned to a socket)
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 7,145 489 7,840 477 5,965 467
> >
> > 10 51,016 1,406 47,881 1,223 45,232 1,288
> > 30 92,785 1,580 103,453 1,512 91,437 1,523
> > 60 120,160 1,817 115,058 1,595 102,734 1,611
> >
> > UDP_RR 1 7,908 547 8,704 541 6,552 528
> >
> > 10 59,807 1,653 56,598 1,435 50,524 1,488
> > 30 90,302 1,738 113,861 1,765 94,640 1,720
> > 60 141,684 2,196 141,866 1,919 125,334 1,917
> >
> > TCP_STREAM
> >
> > 256 1 2,210 64 1,291 32 2,069 64
> >
> > 4 3,993 64 3,441 52 2,780 50
> >
> > 1024 1 8,106 217 7,571 198 5,709 165
> >
> > 4 8,471 206 8,756 174 6,531 157
> >
> > 4096 1 15,360 350 13,825 303 10,717 271
> >
> > 4 14,671 330 12,604 263 11,266 258
> >
> > 16384 1 18,284 395 16,305 337 13,185 317
> >
> > 4 15,451 331 12,438 247 14,699 316
> >
> > 2 VMs to 2 VMs (4 VMs total)
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 15,498 491 16,518 460 13,008 441
> >
> > 10 71,425 983 79,711 1,063 85,087 1,037
> > 30 102,132 1,436 82,191 1,145 100,504 1,076
> > 60 127,670 1,608 96,815 1,262 104,694 1,119
> >
> > UDP_RR 1 17,091 548 18,214 538 14,780 492
> >
> > 10 77,682 1,129 87,523 1,235 86,755 1,165
> > 30 131,830 1,826 92,844 1,327 111,839 1,232
> > 60 145,688 1,952 111,315 1,520 116,358 1,296
> >
> > TCP_STREAM
> >
> > 256 1 5,085 72 3,900 50 2,430 38
> >
> > 4 6,622 70 4,337 48 5,032 58
> >
> > 1024 1 15,262 206 15,022 195 7,000 115
> >
> > 4 14,205 174 15,288 174 11,030 148
> >
> > 4096 1 15,020 197 21,694 261 13,583 198
> >
> > 4 16,818 205 16,076 195 17,175 238
> >
> > 16384 1 19,671 261 23,699 290 22,396 306
> >
> > 4 18,648 229 17,901 218 17,122 251
> >
> > 6 VMs to 6 VMs (12 VMs total)
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 30,242 400 32,281 390 27,737 401
> >
> > 10 73,461 783 61,856 644 93,259 1,000
> > 30 98,638 1,034 81,799 844 107,022 1,121
> > 60 114,238 1,200 91,772 944 110,839 1,152
> >
> > UDP_RR 1 33,017 438 35,540 429 30,022 438
> >
> > 10 84,676 910 67,838 711 112,339 1,220
> > 30 110,799 1,156 90,555 932 128,928 1,357
> > 60 129,679 1,354 100,715 1,033 136,503 1,429
> >
> > TCP_STREAM
> >
> > 256 1 6,947 72 5,380 56 6,138 72
> >
> > 4 8,400 85 7,660 77 8,893 89
> >
> > 1024 1 13,698 146 10,307 108 13,023 158
> >
> > 4 15,391 157 13,242 135 17,264 182
> >
> > 4096 1 18,928 202 14,580 154 16,970 189
> >
> > 4 18,826 191 17,262 175 19,558 212
> >
> > 16384 1 22,176 234 17,716 187 21,245 243
> >
> > 4 21,306 215 20,332 206 18,353 227
> >
> > 12 VMs to 12 VMs (24 VMs total)
> >
> > - Base - -Multi-Worker- - Per-CPU -
> >
> > Test Inst Score Eff Score Eff Score Eff
> > TCP_RR 1 72,926 731 67,338 675 32,662 387
> >
> > 10 62,441 625 59,277 594 87,286 891
> > 30 72,761 728 67,760 679 102,549 1,041
> > 60 78,087 782 74,654 748 100,687 1,016
> >
> > UDP_RR 1 82,662 829 80,875 810 34,915 421
> >
> > 10 71,424 716 67,754 679 111,753 1,147
> > 30 79,495 796 75,512 756 134,576 1,372
> > 60 83,339 835 77,523 778 137,058 1,390
> >
> > TCP_STREAM
> >
> > 256 1 2,870 29 2,631 26 7,907 80
> >
> > 4 8,424 84 8,026 80 8,929 90
> >
> > 1024 1 3,674 37 3,121 31 15,644 164
> >
> > 4 14,256 143 13,342 134 16,116 168
> >
> > 4096 1 5,068 51 4,366 44 16,179 168
> >
> > 4 17,015 171 16,321 164 17,940 186
> >
> > 16384 1 9,768 98 9,025 90 19,233 203
> >
> > 4 18,981 190 18,202 183 18,964 203
> >
> > On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote:
> >> Resubmit it with the right format.
> >>
> >> Signed-off-by: Shirley Ma<xma@us.ibm.com>
> >> Signed-off-by: Krishna Kumar<krkumar2@in.ibm.com>
> >> Tested-by: Tom Lendacky<toml@us.ibm.com>
> >> ---
> >>
> >> drivers/vhost/net.c | 26 ++-
> >> drivers/vhost/vhost.c | 300
> >>
> >> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h |
> >> 16 ++-
> >>
> >> 3 files changed, 243 insertions(+), 103 deletions(-)
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
Tom
Thomas Lendacky
Linux Technology Center - Performance
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 21:10 ` Thomas Lendacky
@ 2012-03-23 21:21 ` Rick Jones
0 siblings, 0 replies; 8+ messages in thread
From: Rick Jones @ 2012-03-23 21:21 UTC (permalink / raw)
To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm
>
> Yeah, it becomes a question of time. I run each test 3 times and
> average the results, so to run the full suite takes a long time.
I've found the "walk up the instance count with the interim results
emitted" allows me quicker overall run time than launching all the
netperfs at once with a long run time to kludge around skew. Well
modulo the time it takes to get them all launched. But for the smallish
stuff it is rather faster than the 15 minutes a data point I'd get with
the (ab)use of the confidence intervals mechanism in runemomniagg2.sh .
It also avoids the "run one wait for it to finish, run two, wait for
them to finish, run four, wait for them to finish" bit. Walking-up the
instance count leaving the previous instances going does mean that the
"end of test" information is full of skew, but a great deal of that
end-of-test information is invariant anyway.
happy benchmarking,
rick jones
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 18:32 ` Thomas Lendacky
2012-03-23 19:00 ` Rick Jones
@ 2012-03-23 23:45 ` David Ahern
2012-03-27 14:34 ` Thomas Lendacky
1 sibling, 1 reply; 8+ messages in thread
From: David Ahern @ 2012-03-23 23:45 UTC (permalink / raw)
To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm
On 3/23/12 12:32 PM, Thomas Lendacky wrote:
> Quick description of the tests:
> TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> and 60 instances
> TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> and 1 and 4 instances
>
> Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> running between an external host and each VM.
>
> Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> running between VM pairs on the same host (no TCP_MAERTS done in
> this situation).
>
> For TCP_RR and UDP_RR tests I report the transaction rate as the
> score and the transaction rate / KVMhost CPU% as the efficiency.
>
> For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> as the score and the throughput / KVMhost CPU% as the efficiency.
Would you mind sharing the netperf commands you are running and an
example of the math done to arrive at the summaries presented?
David
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
2012-03-23 23:45 ` David Ahern
@ 2012-03-27 14:34 ` Thomas Lendacky
0 siblings, 0 replies; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-27 14:34 UTC (permalink / raw)
To: David Ahern; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm
On Friday, March 23, 2012 05:45:40 PM David Ahern wrote:
> On 3/23/12 12:32 PM, Thomas Lendacky wrote:
> > Quick description of the tests:
> > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> > and 60 instances
> > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> > and 1 and 4 instances
> >
> > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> > running between an external host and each VM.
> >
> > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> > running between VM pairs on the same host (no TCP_MAERTS done in
> > this situation).
> >
> > For TCP_RR and UDP_RR tests I report the transaction rate as the
> > score and the transaction rate / KVMhost CPU% as the efficiency.
> >
> > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> > as the score and the throughput / KVMhost CPU% as the efficiency.
>
> Would you mind sharing the netperf commands you are running and an
> example of the math done to arrive at the summaries presented?
I'm actually using uperf not netperf. Uperf allows me to launch
multiple instances of a test with one executable. I've provided the
XML profiles for the tests below.
The math is simply taking the score (for TCP_RR it is the transaction
rate and for TCP_STREAM/TCP_MAERTS it is the throughput) and dividing
by the CPU utilization of the KVM host (obtained from running sar
during the test).
Here are the uperf profiles that were used. The destination,
instances and message sizes are set using environment variables.
TCP_RR
<?xml version="1.0"?>
<!--
Note: uperf reports operations/second. A transaction is made up of
two operations, so to get transactions/second (like netperf)
you must divide the operations/second by 2.
-->
<profile name="TCP_RR">
<group nprocs="$uperf_instances">
<transaction iterations="1">
<flowop type="connect" options="remotehost=$uperf_dest
protocol=tcp"/>
</transaction>
<transaction duration="$uperf_duration">
<flowop type="write" options="size=$uperf_tx_msgsize"/>
<flowop type="read" options="size=$uperf_rx_msgsize"/>
</transaction>
<transaction iterations="1">
<flowop type="disconnect" />
</transaction>
</group>
</profile>
UDP_RR:
<?xml version="1.0"?>
<!--
Note: uperf reports operations/second. A transaction is made up of
two operations, so to get transactions/second (like netperf)
you must divide the operations/second by 2.
-->
<profile name="UDP_RR">
<group nprocs="$uperf_instances">
<transaction iterations="1">
<flowop type="connect" options="remotehost=$uperf_dest
protocol=udp"/>
</transaction>
<transaction duration="$uperf_duration">
<flowop type="write" options="size=$uperf_tx_msgsize"/>
<flowop type="read" options="size=$uperf_rx_msgsize"/>
</transaction>
<transaction iterations="1">
<flowop type="disconnect" />
</transaction>
</group>
</profile>
TCP_STREAM:
<?xml version="1.0"?>
<profile name="TCP_STREAM">
<group nprocs="$uperf_instances">
<transaction iterations="1">
<flowop type="connect" options="remotehost=$uperf_dest
protocol=tcp"/>
</transaction>
<transaction duration="$uperf_duration">
<flowop type="write" options="count=16 size=$uperf_tx_msgsize"/>
</transaction>
<transaction iterations="1">
<flowop type="disconnect" />
</transaction>
</group>
</profile>
TCP_MAERTS:
<?xml version="1.0"?>
<profile name="TCP_MAERTS">
<group nprocs="$uperf_instances">
<transaction iterations="1">
<flowop type="accept" options="remotehost=$uperf_dest
protocol=tcp"/>
</transaction>
<transaction duration="$uperf_duration">
<flowop type="read" options="count=16 size=$uperf_rx_msgsize"/>
</transaction>
<transaction iterations="1">
<flowop type="disconnect" />
</transaction>
</group>
</profile>
Tom
>
> David
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2012-03-27 14:34 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma
2012-03-23 0:16 ` Shirley Ma
2012-03-23 18:32 ` Thomas Lendacky
2012-03-23 19:00 ` Rick Jones
2012-03-23 21:10 ` Thomas Lendacky
2012-03-23 21:21 ` Rick Jones
2012-03-23 23:45 ` David Ahern
2012-03-27 14:34 ` Thomas Lendacky
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).