* [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread @ 2012-03-22 23:48 Shirley Ma 2012-03-23 0:16 ` Shirley Ma 0 siblings, 1 reply; 8+ messages in thread From: Shirley Ma @ 2012-03-22 23:48 UTC (permalink / raw) To: Michael S. Tsirkin, netdev, tahm, kvm Signed-off-by: Shirley Ma <xma@us.ibm.com> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com> Tested-by: Tom Lendacky <toml@us.ibm.com> --- drivers/vhost/net.c | 26 ++- drivers/vhost/vhost.c | 300 ++++++++++++++++++++++++---------- drivers/vhost/vhost.h | 16 ++- 3 files changed, 243 insertions(+), 103 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9dab1f5..4664e63 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX"); #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 -enum { - VHOST_NET_VQ_RX = 0, - VHOST_NET_VQ_TX = 1, - VHOST_NET_VQ_MAX = 2, -}; - enum vhost_net_poll_state { VHOST_NET_POLL_DISABLED = 0, VHOST_NET_POLL_STARTED = 1, @@ -510,8 +504,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) return r; } - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, + &n->vqs[VHOST_NET_VQ_TX]); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, + &n->vqs[VHOST_NET_VQ_RX]); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -863,15 +859,27 @@ static struct miscdevice vhost_net_misc = { static int vhost_net_init(void) { + int ret; + if (experimental_zcopytx) vhost_enable_zcopy(VHOST_NET_VQ_TX); - return misc_register(&vhost_net_misc); + + ret = misc_register(&vhost_net_misc); + if (ret) + return ret; + + ret = vhost_init(); + if (ret) + misc_deregister(&vhost_net_misc); + + return ret; } module_init(vhost_net_init); static void vhost_net_exit(void) { 
misc_deregister(&vhost_net_misc); + vhost_cleanup(); } module_exit(vhost_net_exit); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c14c42b..9fabc5a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,7 +24,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/kthread.h> -#include <linux/cgroup.h> +#include <linux/cpu.h> #include <linux/net.h> #include <linux/if_packet.h> @@ -42,6 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly; #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num]) +/* per cpu vhost struct */ +struct vhost { + struct task_struct *worker; + spinlock_t lock; + struct list_head work_list; +}; + +static DEFINE_PER_CPU(struct vhost, vhosts); + static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -64,25 +73,28 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, return 0; } -static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) +static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn, + struct vhost_virtqueue *vq) { INIT_LIST_HEAD(&work->node); work->fn = fn; init_waitqueue_head(&work->done); work->flushing = 0; work->queue_seq = work->done_seq = 0; + work->vq = vq; + spin_lock_init(&work->lock); } /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, - unsigned long mask, struct vhost_dev *dev) + unsigned long mask, struct vhost_virtqueue *vq) { init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; - poll->dev = dev; + poll->dev = vq->dev; - vhost_work_init(&poll->work, fn); + vhost_work_init(&poll->work, fn, vq); } /* Start polling a file. We add ourselves to file's wait queue. 
The caller must @@ -108,25 +120,30 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, { int left; - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&work->lock); left = seq - work->done_seq; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); return left <= 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +/* only flushing this work? */ +static void vhost_work_flush(struct vhost_poll *poll) { unsigned seq; int flushing; + struct vhost_dev *dev = poll->dev; + struct vhost_work *work = &poll->work; - spin_lock_irq(&dev->work_lock); + if (list_empty(&work->node)) + return; + spin_lock_irq(&work->lock); seq = work->queue_seq; work->flushing++; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); wait_event(work->done, vhost_work_seq_done(dev, work, seq)); - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&work->lock); flushing = --work->flushing; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); BUG_ON(flushing < 0); } @@ -134,21 +151,59 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) * locks that are also used by the callback. 
*/ void vhost_poll_flush(struct vhost_poll *poll) { - vhost_work_flush(poll->dev, &poll->work); + vhost_work_flush(poll); +} + +/* schedule the cpu on the same socket but different cpu with the given one */ +static unsigned long sched_node_cpu(unsigned long cpu) +{ + int node, ncpus_node; + unsigned long sched_cpu = cpu; + + node = cpu_to_node(cpu); + ncpus_node = nr_cpus_node(node); + if (ncpus_node != 1) { + /* pick up a random cpu on the same node, exclude + * the input one + */ + sched_cpu = node * ncpus_node + random32() % (ncpus_node - 1); + if (sched_cpu >= cpu) + ++sched_cpu; + /* todo hotplug cpu race */ + if (!cpu_online(sched_cpu)) + sched_cpu = cpu; + } + return sched_cpu; } static inline void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { - unsigned long flags; - - spin_lock_irqsave(&dev->work_lock, flags); + unsigned long cpu = work->vq->cpu; + struct vhost *vhost; + + /* Is it safe to disable vq notify here ? */ + vhost_disable_notify(dev, work->vq); + + /* schedule the work on the cpu socket as the work has been delivered + * but different with the cpu the work is delivered on + */ + preempt_disable(); + if (cpu_to_node(cpu) != cpu_to_node(smp_processor_id())) { + cpu = sched_node_cpu(smp_processor_id()); + work->vq->cpu = cpu; + } + preempt_enable(); + vhost = &per_cpu(vhosts, cpu); + spin_lock_irq(&vhost->lock); + spin_lock(&work->lock); if (list_empty(&work->node)) { - list_add_tail(&work->node, &dev->work_list); + list_add_tail(&work->node, &vhost->work_list); work->queue_seq++; - wake_up_process(dev->worker); + wake_up_process(vhost->worker); } - spin_unlock_irqrestore(&dev->work_lock, flags); + spin_unlock(&work->lock); + spin_unlock_irq(&vhost->lock); } void vhost_poll_queue(struct vhost_poll *poll) @@ -188,17 +243,18 @@ static void vhost_vq_reset(struct vhost_dev *dev, static int vhost_worker(void *data) { - struct vhost_dev *dev = data; - struct vhost_work *work = NULL; + struct vhost *vhost = &__get_cpu_var(vhosts); + 
struct list_head *work_list; + struct mm_struct *prev_mm = NULL; unsigned uninitialized_var(seq); + struct vhost_work *work = NULL; - use_mm(dev->mm); - + work_list = &vhost->work_list; for (;;) { /* mb paired w/ kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&vhost->lock); if (work) { work->done_seq = seq; if (work->flushing) @@ -206,18 +262,26 @@ static int vhost_worker(void *data) } if (kthread_should_stop()) { - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&vhost->lock); __set_current_state(TASK_RUNNING); break; } - if (!list_empty(&dev->work_list)) { - work = list_first_entry(&dev->work_list, + if (!list_empty(work_list)) { + work = list_first_entry(work_list, struct vhost_work, node); + spin_lock(&work->lock); list_del_init(&work->node); + spin_unlock(&work->lock); seq = work->queue_seq; + if (prev_mm != work->vq->dev->mm) { + if (prev_mm) + unuse_mm(prev_mm); + prev_mm = work->vq->dev->mm; + use_mm(prev_mm); + } } else work = NULL; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&vhost->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -226,7 +290,9 @@ static int vhost_worker(void *data) schedule(); } - unuse_mm(dev->mm); + + if (prev_mm) + unuse_mm(prev_mm); return 0; } @@ -298,9 +364,6 @@ long vhost_dev_init(struct vhost_dev *dev, dev->log_file = NULL; dev->memory = NULL; dev->mm = NULL; - spin_lock_init(&dev->work_lock); - INIT_LIST_HEAD(&dev->work_list); - dev->worker = NULL; for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].log = NULL; @@ -312,7 +375,8 @@ long vhost_dev_init(struct vhost_dev *dev, vhost_vq_reset(dev, dev->vqs + i); if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, - dev->vqs[i].handle_kick, POLLIN, dev); + dev->vqs[i].handle_kick, POLLIN, + &dev->vqs[i]); } return 0; @@ -325,71 +389,35 @@ long vhost_dev_check_owner(struct vhost_dev *dev) return dev->mm == current->mm ? 
0 : -EPERM; } -struct vhost_attach_cgroups_struct { - struct vhost_work work; - struct task_struct *owner; - int ret; -}; - -static void vhost_attach_cgroups_work(struct vhost_work *work) -{ - struct vhost_attach_cgroups_struct *s; - - s = container_of(work, struct vhost_attach_cgroups_struct, work); - s->ret = cgroup_attach_task_all(s->owner, current); -} - -static int vhost_attach_cgroups(struct vhost_dev *dev) -{ - struct vhost_attach_cgroups_struct attach; - - attach.owner = current; - vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); - return attach.ret; -} - /* Caller should have device mutex */ static long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; + unsigned long txcpu, rxcpu; /* Is there an owner already? */ if (dev->mm) { err = -EBUSY; - goto err_mm; + goto out; } - /* No owner, become one */ - dev->mm = get_task_mm(current); - worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } + err = vhost_dev_alloc_iovecs(dev); + if (err) + goto out; - dev->worker = worker; - wake_up_process(worker); /* avoid contributing to loadavg */ + /* initial txcpu, rxcpu on the same socket */ + txcpu = sched_node_cpu(smp_processor_id()); + rxcpu = sched_node_cpu(txcpu); - err = vhost_attach_cgroups(dev); - if (err) - goto err_cgroup; + dev->vqs[VHOST_NET_VQ_TX].cpu = txcpu; + dev->vqs[VHOST_NET_VQ_RX].cpu = rxcpu; - err = vhost_dev_alloc_iovecs(dev); - if (err) - goto err_cgroup; + /* No owner, become one */ + dev->mm = get_task_mm(current); return 0; -err_cgroup: - kthread_stop(worker); - dev->worker = NULL; -err_worker: - if (dev->mm) - mmput(dev->mm); - dev->mm = NULL; -err_mm: + +out: return err; } @@ -474,11 +502,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev) kfree(rcu_dereference_protected(dev->memory, lockdep_is_held(&dev->mutex))); 
RCU_INIT_POINTER(dev->memory, NULL); - WARN_ON(!list_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - } if (dev->mm) mmput(dev->mm); dev->mm = NULL; @@ -1605,3 +1628,104 @@ void vhost_zerocopy_callback(void *arg) vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; kref_put(&ubufs->kref, vhost_zerocopy_done_signal); } + +/* to do +static int __cpuinit vhost_pool_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct vhost *vhost = per_cpu(vhosts, hcpu); + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!create_vhost_task(vhosts, hcpu)) + return notifier_from_errno(-ENOMEM); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kthread_bind(vhost->worker, cpumask_any(cpu_online_mask)); + destory_vhost_task(vhost, hcpu); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + kthread_bind(vhost->worker, hcpu); + wake_up_process(vhost->worker); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + destrory_vhost_task(vhosts, hcpu); + take_over_work(vhosts, hcpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block vhost_pool_callback_nb __cpuinitdata = { + .notifier_call = vhost_pool_callcack, + .priority = 0, +} +*/ + +static void free_workers(void) +{ + unsigned long cpu; + struct vhost *vhost; + + /* to do + * unregister_cpu_notifier(&vhost_pool_callback_nb); + */ + get_online_cpus(); + for_each_online_cpu(cpu) { + vhost = &per_cpu(vhosts, cpu); + if (!IS_ERR(vhost->worker)) { + kthread_stop(vhost->worker); + BUG_ON(!list_empty(&vhost->work_list)); + } + } + put_online_cpus(); +} + +int vhost_init(void) +{ + int ret = -ENOMEM; + unsigned long cpu; + struct vhost *vhost; + + get_online_cpus(); + for_each_online_cpu(cpu) { + vhost = &per_cpu(vhosts, cpu); + + 
INIT_LIST_HEAD(&vhost->work_list); + spin_lock_init(&vhost->lock); + vhost->worker = kthread_create_on_node(vhost_worker, NULL, + cpu_to_node(cpu), + "vhost-%lu", cpu); + if (IS_ERR(vhost->worker)) + goto err; + + kthread_bind(vhost->worker, cpu); + wake_up_process(vhost->worker); + } + put_online_cpus(); + + /* to do + * register_cpu_notifier(&vhost_pool_callback_nb); + */ + return 0; +err: + free_workers(); + return ret; +} + +void vhost_cleanup(void) +{ + free_workers(); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index a801e28..c6ecfb0 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -18,6 +18,12 @@ #define VHOST_DMA_DONE_LEN 1 #define VHOST_DMA_CLEAR_LEN 0 +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + struct vhost_device; struct vhost_work; @@ -30,6 +36,8 @@ struct vhost_work { int flushing; unsigned queue_seq; unsigned done_seq; + struct vhost_virtqueue *vq; + spinlock_t lock; }; /* Poll a file (eventfd or socket) */ @@ -44,7 +52,7 @@ struct vhost_poll { }; void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, - unsigned long mask, struct vhost_dev *dev); + unsigned long mask, struct vhost_virtqueue *vq); void vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); @@ -141,6 +149,7 @@ struct vhost_virtqueue { /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. 
*/ struct vhost_ubuf_ref *ubufs; + unsigned long cpu; }; struct vhost_dev { @@ -155,9 +164,6 @@ struct vhost_dev { int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; - spinlock_t work_lock; - struct list_head work_list; - struct task_struct *worker; }; long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); @@ -190,6 +196,8 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); void vhost_zerocopy_callback(void *arg); int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq); +int vhost_init(void); +void vhost_cleanup(void); #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma @ 2012-03-23 0:16 ` Shirley Ma 2012-03-23 18:32 ` Thomas Lendacky 0 siblings, 1 reply; 8+ messages in thread From: Shirley Ma @ 2012-03-23 0:16 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: netdev, tahm, kvm Resubmit it with the right format. Signed-off-by: Shirley Ma <xma@us.ibm.com> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com> Tested-by: Tom Lendacky <toml@us.ibm.com> --- drivers/vhost/net.c | 26 ++- drivers/vhost/vhost.c | 300 ++++++++++++++++++++++++---------- drivers/vhost/vhost.h | 16 ++- 3 files changed, 243 insertions(+), 103 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9dab1f5..4664e63 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX"); #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 -enum { - VHOST_NET_VQ_RX = 0, - VHOST_NET_VQ_TX = 1, - VHOST_NET_VQ_MAX = 2, -}; - enum vhost_net_poll_state { VHOST_NET_POLL_DISABLED = 0, VHOST_NET_POLL_STARTED = 1, @@ -510,8 +504,10 @@ static int vhost_net_open(struct inode *inode, struct file *f) return r; } - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, + &n->vqs[VHOST_NET_VQ_TX]); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, + &n->vqs[VHOST_NET_VQ_RX]); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -863,15 +859,27 @@ static struct miscdevice vhost_net_misc = { static int vhost_net_init(void) { + int ret; + if (experimental_zcopytx) vhost_enable_zcopy(VHOST_NET_VQ_TX); - return misc_register(&vhost_net_misc); + + ret = misc_register(&vhost_net_misc); + if (ret) + return ret; + + ret = vhost_init(); + if (ret) 
+ misc_deregister(&vhost_net_misc); + + return ret; } module_init(vhost_net_init); static void vhost_net_exit(void) { misc_deregister(&vhost_net_misc); + vhost_cleanup(); } module_exit(vhost_net_exit); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c14c42b..9fabc5a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -24,7 +24,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <linux/kthread.h> -#include <linux/cgroup.h> +#include <linux/cpu.h> #include <linux/net.h> #include <linux/if_packet.h> @@ -42,6 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly; #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num]) #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num]) +/* per cpu vhost struct */ +struct vhost { + struct task_struct *worker; + spinlock_t lock; + struct list_head work_list; +}; + +static DEFINE_PER_CPU(struct vhost, vhosts); + static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -64,25 +73,28 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, return 0; } -static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn) +static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn, + struct vhost_virtqueue *vq) { INIT_LIST_HEAD(&work->node); work->fn = fn; init_waitqueue_head(&work->done); work->flushing = 0; work->queue_seq = work->done_seq = 0; + work->vq = vq; + spin_lock_init(&work->lock); } /* Init poll structure */ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, - unsigned long mask, struct vhost_dev *dev) + unsigned long mask, struct vhost_virtqueue *vq) { init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; - poll->dev = dev; + poll->dev = vq->dev; - vhost_work_init(&poll->work, fn); + vhost_work_init(&poll->work, fn, vq); } /* Start polling a file. We add ourselves to file's wait queue. 
The caller must @@ -108,25 +120,30 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work, { int left; - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&work->lock); left = seq - work->done_seq; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); return left <= 0; } -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) +/* only flushing this work? */ +static void vhost_work_flush(struct vhost_poll *poll) { unsigned seq; int flushing; + struct vhost_dev *dev = poll->dev; + struct vhost_work *work = &poll->work; - spin_lock_irq(&dev->work_lock); + if (list_empty(&work->node)) + return; + spin_lock_irq(&work->lock); seq = work->queue_seq; work->flushing++; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); wait_event(work->done, vhost_work_seq_done(dev, work, seq)); - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&work->lock); flushing = --work->flushing; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&work->lock); BUG_ON(flushing < 0); } @@ -134,21 +151,59 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work) * locks that are also used by the callback. 
*/ void vhost_poll_flush(struct vhost_poll *poll) { - vhost_work_flush(poll->dev, &poll->work); + vhost_work_flush(poll); +} + +/* schedule the cpu on the same socket but different cpu with the given one */ +static unsigned long sched_node_cpu(unsigned long cpu) +{ + int node, ncpus_node; + unsigned long sched_cpu = cpu; + + node = cpu_to_node(cpu); + ncpus_node = nr_cpus_node(node); + if (ncpus_node != 1) { + /* pick up a random cpu on the same node, exclude + * the input one + */ + sched_cpu = node * ncpus_node + random32() % (ncpus_node - 1); + if (sched_cpu >= cpu) + ++sched_cpu; + /* todo hotplug cpu race */ + if (!cpu_online(sched_cpu)) + sched_cpu = cpu; + } + return sched_cpu; } static inline void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) { - unsigned long flags; - - spin_lock_irqsave(&dev->work_lock, flags); + unsigned long cpu = work->vq->cpu; + struct vhost *vhost; + + /* Is it safe to disable vq notify here ? */ + vhost_disable_notify(dev, work->vq); + + /* schedule the work on the cpu socket as the work has been delivered + * but different with the cpu the work is delivered on + */ + preempt_disable(); + if (cpu_to_node(cpu) != cpu_to_node(smp_processor_id())) { + cpu = sched_node_cpu(smp_processor_id()); + work->vq->cpu = cpu; + } + preempt_enable(); + vhost = &per_cpu(vhosts, cpu); + spin_lock_irq(&vhost->lock); + spin_lock(&work->lock); if (list_empty(&work->node)) { - list_add_tail(&work->node, &dev->work_list); + list_add_tail(&work->node, &vhost->work_list); work->queue_seq++; - wake_up_process(dev->worker); + wake_up_process(vhost->worker); } - spin_unlock_irqrestore(&dev->work_lock, flags); + spin_unlock(&work->lock); + spin_unlock_irq(&vhost->lock); } void vhost_poll_queue(struct vhost_poll *poll) @@ -188,17 +243,18 @@ static void vhost_vq_reset(struct vhost_dev *dev, static int vhost_worker(void *data) { - struct vhost_dev *dev = data; - struct vhost_work *work = NULL; + struct vhost *vhost = &__get_cpu_var(vhosts); + 
struct list_head *work_list; + struct mm_struct *prev_mm = NULL; unsigned uninitialized_var(seq); + struct vhost_work *work = NULL; - use_mm(dev->mm); - + work_list = &vhost->work_list; for (;;) { /* mb paired w/ kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); - spin_lock_irq(&dev->work_lock); + spin_lock_irq(&vhost->lock); if (work) { work->done_seq = seq; if (work->flushing) @@ -206,18 +262,26 @@ static int vhost_worker(void *data) } if (kthread_should_stop()) { - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&vhost->lock); __set_current_state(TASK_RUNNING); break; } - if (!list_empty(&dev->work_list)) { - work = list_first_entry(&dev->work_list, + if (!list_empty(work_list)) { + work = list_first_entry(work_list, struct vhost_work, node); + spin_lock(&work->lock); list_del_init(&work->node); + spin_unlock(&work->lock); seq = work->queue_seq; + if (prev_mm != work->vq->dev->mm) { + if (prev_mm) + unuse_mm(prev_mm); + prev_mm = work->vq->dev->mm; + use_mm(prev_mm); + } } else work = NULL; - spin_unlock_irq(&dev->work_lock); + spin_unlock_irq(&vhost->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -226,7 +290,9 @@ static int vhost_worker(void *data) schedule(); } - unuse_mm(dev->mm); + + if (prev_mm) + unuse_mm(prev_mm); return 0; } @@ -298,9 +364,6 @@ long vhost_dev_init(struct vhost_dev *dev, dev->log_file = NULL; dev->memory = NULL; dev->mm = NULL; - spin_lock_init(&dev->work_lock); - INIT_LIST_HEAD(&dev->work_list); - dev->worker = NULL; for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].log = NULL; @@ -312,7 +375,8 @@ long vhost_dev_init(struct vhost_dev *dev, vhost_vq_reset(dev, dev->vqs + i); if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, - dev->vqs[i].handle_kick, POLLIN, dev); + dev->vqs[i].handle_kick, POLLIN, + &dev->vqs[i]); } return 0; @@ -325,71 +389,35 @@ long vhost_dev_check_owner(struct vhost_dev *dev) return dev->mm == current->mm ? 
0 : -EPERM; } -struct vhost_attach_cgroups_struct { - struct vhost_work work; - struct task_struct *owner; - int ret; -}; - -static void vhost_attach_cgroups_work(struct vhost_work *work) -{ - struct vhost_attach_cgroups_struct *s; - - s = container_of(work, struct vhost_attach_cgroups_struct, work); - s->ret = cgroup_attach_task_all(s->owner, current); -} - -static int vhost_attach_cgroups(struct vhost_dev *dev) -{ - struct vhost_attach_cgroups_struct attach; - - attach.owner = current; - vhost_work_init(&attach.work, vhost_attach_cgroups_work); - vhost_work_queue(dev, &attach.work); - vhost_work_flush(dev, &attach.work); - return attach.ret; -} - /* Caller should have device mutex */ static long vhost_dev_set_owner(struct vhost_dev *dev) { - struct task_struct *worker; int err; + unsigned long txcpu, rxcpu; /* Is there an owner already? */ if (dev->mm) { err = -EBUSY; - goto err_mm; + goto out; } - /* No owner, become one */ - dev->mm = get_task_mm(current); - worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); - if (IS_ERR(worker)) { - err = PTR_ERR(worker); - goto err_worker; - } + err = vhost_dev_alloc_iovecs(dev); + if (err) + goto out; - dev->worker = worker; - wake_up_process(worker); /* avoid contributing to loadavg */ + /* initial txcpu, rxcpu on the same socket */ + txcpu = sched_node_cpu(smp_processor_id()); + rxcpu = sched_node_cpu(txcpu); - err = vhost_attach_cgroups(dev); - if (err) - goto err_cgroup; + dev->vqs[VHOST_NET_VQ_TX].cpu = txcpu; + dev->vqs[VHOST_NET_VQ_RX].cpu = rxcpu; - err = vhost_dev_alloc_iovecs(dev); - if (err) - goto err_cgroup; + /* No owner, become one */ + dev->mm = get_task_mm(current); return 0; -err_cgroup: - kthread_stop(worker); - dev->worker = NULL; -err_worker: - if (dev->mm) - mmput(dev->mm); - dev->mm = NULL; -err_mm: + +out: return err; } @@ -474,11 +502,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev) kfree(rcu_dereference_protected(dev->memory, lockdep_is_held(&dev->mutex))); 
RCU_INIT_POINTER(dev->memory, NULL); - WARN_ON(!list_empty(&dev->work_list)); - if (dev->worker) { - kthread_stop(dev->worker); - dev->worker = NULL; - } if (dev->mm) mmput(dev->mm); dev->mm = NULL; @@ -1605,3 +1628,104 @@ void vhost_zerocopy_callback(void *arg) vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN; kref_put(&ubufs->kref, vhost_zerocopy_done_signal); } + +/* to do +static int __cpuinit vhost_pool_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct vhost *vhost = per_cpu(vhosts, hcpu); + + action &= ~CPU_TASKS_FROZEN; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!create_vhost_task(vhosts, hcpu)) + return notifier_from_errno(-ENOMEM); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kthread_bind(vhost->worker, cpumask_any(cpu_online_mask)); + destory_vhost_task(vhost, hcpu); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + kthread_bind(vhost->worker, hcpu); + wake_up_process(vhost->worker); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + destrory_vhost_task(vhosts, hcpu); + take_over_work(vhosts, hcpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block vhost_pool_callback_nb __cpuinitdata = { + .notifier_call = vhost_pool_callcack, + .priority = 0, +} +*/ + +static void free_workers(void) +{ + unsigned long cpu; + struct vhost *vhost; + + /* to do + * unregister_cpu_notifier(&vhost_pool_callback_nb); + */ + get_online_cpus(); + for_each_online_cpu(cpu) { + vhost = &per_cpu(vhosts, cpu); + if (!IS_ERR(vhost->worker)) { + kthread_stop(vhost->worker); + BUG_ON(!list_empty(&vhost->work_list)); + } + } + put_online_cpus(); +} + +int vhost_init(void) +{ + int ret = -ENOMEM; + unsigned long cpu; + struct vhost *vhost; + + get_online_cpus(); + for_each_online_cpu(cpu) { + vhost = &per_cpu(vhosts, cpu); + + 
INIT_LIST_HEAD(&vhost->work_list); + spin_lock_init(&vhost->lock); + vhost->worker = kthread_create_on_node(vhost_worker, NULL, + cpu_to_node(cpu), + "vhost-%lu", cpu); + if (IS_ERR(vhost->worker)) + goto err; + + kthread_bind(vhost->worker, cpu); + wake_up_process(vhost->worker); + } + put_online_cpus(); + + /* to do + * register_cpu_notifier(&vhost_pool_callback_nb); + */ + return 0; +err: + free_workers(); + return ret; +} + +void vhost_cleanup(void) +{ + free_workers(); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index a801e28..c6ecfb0 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -18,6 +18,12 @@ #define VHOST_DMA_DONE_LEN 1 #define VHOST_DMA_CLEAR_LEN 0 +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + struct vhost_device; struct vhost_work; @@ -30,6 +36,8 @@ struct vhost_work { int flushing; unsigned queue_seq; unsigned done_seq; + struct vhost_virtqueue *vq; + spinlock_t lock; }; /* Poll a file (eventfd or socket) */ @@ -44,7 +52,7 @@ struct vhost_poll { }; void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, - unsigned long mask, struct vhost_dev *dev); + unsigned long mask, struct vhost_virtqueue *vq); void vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); @@ -141,6 +149,7 @@ struct vhost_virtqueue { /* Reference counting for outstanding ubufs. * Protected by vq mutex. Writers must also take device mutex. 
*/ struct vhost_ubuf_ref *ubufs; + unsigned long cpu; }; struct vhost_dev { @@ -155,9 +164,6 @@ struct vhost_dev { int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; - spinlock_t work_lock; - struct list_head work_list; - struct task_struct *worker; }; long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); @@ -190,6 +196,8 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); void vhost_zerocopy_callback(void *arg); int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq); +int vhost_init(void); +void vhost_cleanup(void); #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 0:16 ` Shirley Ma @ 2012-03-23 18:32 ` Thomas Lendacky 2012-03-23 19:00 ` Rick Jones 2012-03-23 23:45 ` David Ahern 0 siblings, 2 replies; 8+ messages in thread From: Thomas Lendacky @ 2012-03-23 18:32 UTC (permalink / raw) To: Shirley Ma; +Cc: Michael S. Tsirkin, netdev, kvm I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests against the recent vhost patches. For simplicity, the patches submitted by Anthony that increase the number of threads per vhost instance I will call multi-worker and the patches submitted by Shirley that provide a vhost thread per cpu I will call per-cpu. Quick description of the tests: TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30 and 60 instances TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes and 1 and 4 instances Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests running between an external host and each VM. Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests running between VM pairs on the same host (no TCP_MAERTS done in this situation). For TCP_RR and UDP_RR tests I report the transaction rate as the score and the transaction rate / KVMhost CPU% as the efficiency. For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps as the score and the throughput / KVMhost CPU% as the efficiency. The KVM host machine is a nehalem-based 2-socket, 4-cores/socket system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel 10GbE single port network adapter. There's a lot of data and I hope this is the clearest way to report it. The remote host to VM results are first followed by the local VM to VM results. 
Remote Host to VM: Host to 1 VM - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041 10 63,919 3,095 51,841 2,415 55,226 2,884 30 85,646 3,288 127,277 3,242 145,644 4,092 60 117,448 3,929 148,330 3,616 137,996 3,898 UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150 10 53,989 3,082 59,590 2,875 52,353 3,328 30 91,484 4,115 95,312 3,042 110,715 3,659 60 107,466 4,689 173,443 4,351 158,141 4,235 TCP_STREAM 256 1 2,724 140 2,450 131 2,681 150 4 5,027 137 4,147 146 3,998 117 1024 1 5,602 235 4,623 169 5,425 238 4 5,987 212 5,991 133 6,827 175 4096 1 6,202 256 6,753 211 7,247 279 4 4,996 192 5,771 159 7,124 202 16384 1 6,258 259 7,211 214 8,453 308 4 4,591 179 5,788 181 6,925 217 TCP_MAERTS 256 1 1,951 85 1,871 89 1,899 97 4 4,757 129 4,102 140 4,279 116 1024 1 7,479 381 6,970 371 7,374 427 4 8,931 385 6,612 258 8,731 417 4096 1 9,276 464 9,296 456 9,131 510 4 9,381 452 9,032 367 9,338 446 16384 1 9,153 496 8,817 589 9,238 516 4 9,358 478 9,006 367 9,350 462 Host to 1 VM (VM pinned to a socket) - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 9,992 1,019 9,899 917 8,963 899 10 60,731 3,236 60,015 2,444 55,860 3,059 30 127,375 4,042 146,571 3,922 163,806 4,389 60 173,021 4,972 149,549 4,662 161,397 4,330 UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206 10 68,128 3,804 64,335 4,067 53,343 3,233 30 92,456 3,994 112,101 4,219 111,610 3,598 60 135,741 4,590 184,441 4,422 184,527 4,546 TCP_STREAM 256 1 2,564 146 2,530 147 2,497 150 4 4,757 139 4,300 127 4,245 124 1024 1 4,700 209 6,062 323 5,627 247 4 6,828 214 7,125 153 6,561 172 4096 1 6,676 281 7,672 286 7,760 290 4 6,258 236 6,410 171 7,354 225 16384 1 6,712 289 8,217 297 8,457 322 4 5,764 235 6,285 200 7,554 245 TCP_MAERTS 256 1 1,673 82 1,444 71 1,756 88 4 6,385 175 5,671 155 5,685 153 1024 1 7,500 427 6,884 414 7,640 429 4 9,310 444 8,659 496 8,200 350 4096 1 8,427 477 9,201 515 8,825 422 4 9,372 478 9,184 394 9,391 446 16384 
1 8,840 500 9,205 555 9,239 482 4 9,379 495 9,079 385 9,389 472 Host to 4 VMs - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 38,635 949 34,063 843 35,432 897 10 193,703 2,604 157,699 1,841 180,323 2,858 30 279,736 3,301 170,343 1,739 269,827 2,875 60 308,838 3,555 170,486 1,738 285,073 2,988 UDP_RR 1 42,209 1,136 36,035 904 36,974 975 10 177,286 2,616 166,999 2,043 178,470 2,466 30 296,415 3,731 221,738 2,488 260,630 2,966 60 353,784 4,179 209,489 2,152 306,792 3,440 TCP_STREAM 256 1 8,409 113 7,517 101 7,178 115 4 8,963 93 7,825 80 8,606 91 1024 1 9,382 119 10,223 192 9,314 128 4 9,233 101 9,085 110 8,585 105 4096 1 9,391 124 9,393 125 9,300 140 4 9,303 103 9,151 102 8,601 106 16384 1 9,395 121 8,715 128 9,378 135 4 9,322 105 9,135 101 8,691 121 TCP_MAERTS 256 1 8,629 125 7,045 112 7,559 109 4 9,389 145 7,091 80 9,335 156 1024 1 9,385 201 9,349 148 9,320 248 4 9,392 154 9,340 148 9,390 226 4096 1 9,387 239 9,339 151 9,379 291 4 9,392 167 9,389 124 9,390 259 16384 1 9,374 236 9,366 150 9,391 317 4 9,365 167 9,394 123 9,390 284 Host to 12 VMs - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 79,628 928 85,717 944 72,760 885 10 106,348 1,067 94,032 944 164,548 2,017 30 131,313 1,318 116,431 1,168 206,560 2,367 60 156,868 1,574 152,205 1,527 223,701 2,250 UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919 10 149,381 1,499 113,254 1,136 194,153 1,951 30 177,803 1,783 132,818 1,333 235,682 2,370 60 201,833 2,025 154,871 1,554 258,133 2,595 TCP_STREAM 256 1 8,549 86 7,173 72 8,407 85 4 8,910 89 8,693 87 8,768 88 1024 1 9,397 95 9,371 94 9,376 95 4 9,289 93 9,268 100 8,898 92 4096 1 9,399 95 9,415 95 9,401 97 4 9,336 94 9,319 94 8,938 94 16384 1 9,405 95 9,402 96 9,397 102 4 9,366 94 9,345 94 8,890 94 TCP_MAERTS 256 1 4,646 49 2,273 23 9,232 135 4 9,393 107 8,019 81 9,414 134 1024 1 9,393 115 9,403 104 9,399 178 4 9,406 110 9,383 98 9,392 157 4096 1 9,393 114 9,409 104 9,388 202 4 9,388 110 9,387 98 9,382 
181 16384 1 9,396 114 9,391 104 9,394 221 4 9,411 110 9,384 98 9,391 192 Host to 24 VMs - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805 10 94,757 948 90,872 915 156,821 1,581 30 119,904 1,199 120,728 1,207 214,151 2,211 60 144,684 1,457 146,788 1,468 240,963 2,513 UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914 10 119,204 1,201 104,645 1,046 208,432 2,340 30 158,887 1,601 136,629 1,366 249,329 2,517 60 179,365 1,794 159,883 1,610 259,018 2,651 TCP_STREAM 256 1 5,899 59 4,258 44 8,071 82 4 8,739 89 8,195 83 7,934 82 1024 1 8,477 86 7,498 76 9,268 93 4 9,205 93 9,171 94 8,159 84 4096 1 9,334 96 8,992 92 9,324 97 4 9,255 95 9,221 92 8,237 85 16384 1 9,373 96 9,356 95 9,311 96 4 9,283 94 9,275 93 8,317 86 TCP_MAERTS 256 1 739 7 770 8 9,186 129 4 7,804 79 7,573 76 9,253 122 1024 1 1,763 18 1,759 18 9,287 146 4 9,204 99 9,166 93 9,389 155 4096 1 3,430 35 3,403 35 9,348 161 4 9,372 100 9,315 95 9,385 151 16384 1 9,309 102 9,306 97 9,353 175 4 9,378 100 9,392 96 9,377 159 Local VM to VM: 1 VM to 1 VM - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 7,422 506 7,698 462 6,281 450 10 49,662 1,362 47,553 1,205 43,258 1,270 30 91,657 1,538 99,319 1,471 89,478 1,499 60 106,168 1,658 106,430 1,503 99,205 1,576 UDP_RR 1 8,414 552 8,532 528 6,976 499 10 58,359 1,645 55,283 1,398 48,094 1,457 30 91,046 1,736 109,403 1,721 92,109 1,715 60 128,835 2,021 130,382 1,807 118,563 1,853 TCP_STREAM 256 1 2,029 60 1,923 54 1,998 64 4 3,861 66 3,445 53 2,914 54 1024 1 7,374 205 6,465 174 5,704 165 4 8,474 196 7,541 161 6,274 156 4096 1 12,825 295 11,921 275 10,262 262 4 12,639 253 13,395 260 11,451 264 16384 1 14,576 331 14,141 291 11,925 305 4 16,016 327 14,210 274 13,656 308 1 VM to 1 VM (each VM pinned to a socket) - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 7,145 489 7,840 477 5,965 467 10 51,016 1,406 47,881 1,223 45,232 1,288 30 92,785 
1,580 103,453 1,512 91,437 1,523 60 120,160 1,817 115,058 1,595 102,734 1,611 UDP_RR 1 7,908 547 8,704 541 6,552 528 10 59,807 1,653 56,598 1,435 50,524 1,488 30 90,302 1,738 113,861 1,765 94,640 1,720 60 141,684 2,196 141,866 1,919 125,334 1,917 TCP_STREAM 256 1 2,210 64 1,291 32 2,069 64 4 3,993 64 3,441 52 2,780 50 1024 1 8,106 217 7,571 198 5,709 165 4 8,471 206 8,756 174 6,531 157 4096 1 15,360 350 13,825 303 10,717 271 4 14,671 330 12,604 263 11,266 258 16384 1 18,284 395 16,305 337 13,185 317 4 15,451 331 12,438 247 14,699 316 2 VMs to 2 VMs (4 VMs total) - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 15,498 491 16,518 460 13,008 441 10 71,425 983 79,711 1,063 85,087 1,037 30 102,132 1,436 82,191 1,145 100,504 1,076 60 127,670 1,608 96,815 1,262 104,694 1,119 UDP_RR 1 17,091 548 18,214 538 14,780 492 10 77,682 1,129 87,523 1,235 86,755 1,165 30 131,830 1,826 92,844 1,327 111,839 1,232 60 145,688 1,952 111,315 1,520 116,358 1,296 TCP_STREAM 256 1 5,085 72 3,900 50 2,430 38 4 6,622 70 4,337 48 5,032 58 1024 1 15,262 206 15,022 195 7,000 115 4 14,205 174 15,288 174 11,030 148 4096 1 15,020 197 21,694 261 13,583 198 4 16,818 205 16,076 195 17,175 238 16384 1 19,671 261 23,699 290 22,396 306 4 18,648 229 17,901 218 17,122 251 6 VMs to 6 VMs (12 VMs total) - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 30,242 400 32,281 390 27,737 401 10 73,461 783 61,856 644 93,259 1,000 30 98,638 1,034 81,799 844 107,022 1,121 60 114,238 1,200 91,772 944 110,839 1,152 UDP_RR 1 33,017 438 35,540 429 30,022 438 10 84,676 910 67,838 711 112,339 1,220 30 110,799 1,156 90,555 932 128,928 1,357 60 129,679 1,354 100,715 1,033 136,503 1,429 TCP_STREAM 256 1 6,947 72 5,380 56 6,138 72 4 8,400 85 7,660 77 8,893 89 1024 1 13,698 146 10,307 108 13,023 158 4 15,391 157 13,242 135 17,264 182 4096 1 18,928 202 14,580 154 16,970 189 4 18,826 191 17,262 175 19,558 212 16384 1 22,176 234 17,716 187 21,245 243 4 21,306 215 
20,332 206 18,353 227 12 VMs to 12 VMs (24 VMs total) - Base - -Multi-Worker- - Per-CPU - Test Inst Score Eff Score Eff Score Eff TCP_RR 1 72,926 731 67,338 675 32,662 387 10 62,441 625 59,277 594 87,286 891 30 72,761 728 67,760 679 102,549 1,041 60 78,087 782 74,654 748 100,687 1,016 UDP_RR 1 82,662 829 80,875 810 34,915 421 10 71,424 716 67,754 679 111,753 1,147 30 79,495 796 75,512 756 134,576 1,372 60 83,339 835 77,523 778 137,058 1,390 TCP_STREAM 256 1 2,870 29 2,631 26 7,907 80 4 8,424 84 8,026 80 8,929 90 1024 1 3,674 37 3,121 31 15,644 164 4 14,256 143 13,342 134 16,116 168 4096 1 5,068 51 4,366 44 16,179 168 4 17,015 171 16,321 164 17,940 186 16384 1 9,768 98 9,025 90 19,233 203 4 18,981 190 18,202 183 18,964 203 On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote: > Resubmit it with the right format. > > Signed-off-by: Shirley Ma <xma@us.ibm.com> > Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com> > Tested-by: Tom Lendacky <toml@us.ibm.com> > --- > > drivers/vhost/net.c | 26 ++- > drivers/vhost/vhost.c | 300 > ++++++++++++++++++++++++---------- drivers/vhost/vhost.h | > 16 ++- > 3 files changed, 243 insertions(+), 103 deletions(-) > ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 18:32 ` Thomas Lendacky @ 2012-03-23 19:00 ` Rick Jones 2012-03-23 21:10 ` Thomas Lendacky 2012-03-23 23:45 ` David Ahern 1 sibling, 1 reply; 8+ messages in thread From: Rick Jones @ 2012-03-23 19:00 UTC (permalink / raw) To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm On 03/23/2012 11:32 AM, Thomas Lendacky wrote: > I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests > against the recent vhost patches. For simplicity, the patches > submitted by Anthony that increase the number of threads per vhost > instance I will call multi-worker and the patches submitted by Shirley > that provide a vhost thread per cpu I will call per-cpu. Lots of nice data there - kudos. > Quick description of the tests: > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30 > and 60 instances There is a point, not quite sure where, when aggregate, synchronous single-transaction netperf tests become as much a context switching test as a networking test. That is why netperf RR has support for the "burst mode" to have more than one transaction in flight at one time: http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002dburst When looking to measure packet/transaction per second scaling I've taken to finding the peak for a single stream by running up the burst size, (TCP_NODELAY set) and then running 1, 2, 4 etc of those streams. With the occasional ethtool -S audit to make sure that each TCP_RR transaction is indeed a discrete pair of TCP segments... In addition to avoiding concerns about becoming a context switching exercise, the reduction in netperf instances means less chance for skew error on startup and shutdown. 
To address that I've somewhat recently taken to using demo mode in netperf and then post-processing the results through rrdtool: http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002ddemo I have a "one to many" script for that under: http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh which is then post-processed via some stone knives and bearskins: http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk I've also used that basic idea in some many to many tests involving 512 concurrent netperf instances but that script isn't up on netperf.org. > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes > and 1 and 4 instances Netperf's own documentation and output is probably not good on this point (feel free to loose petards, though some instances may be cast in stone) but those aren't really message sizes. They are simply the quantity of data netperf is presenting to the transport in any one send call. They are send sizes. > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests > running between an external host and each VM. I suppose it is implicit, and I'm just being pedantic/paranoid but you are confident of the limits of the external host? > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests > running between VM pairs on the same host (no TCP_MAERTS done in > this situation). > > For TCP_RR and UDP_RR tests I report the transaction rate as the > score and the transaction rate / KVMhost CPU% as the efficiency. > > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps > as the score and the throughput / KVMhost CPU% as the efficiency. > > The KVM host machine is a nehalem-based 2-socket, 4-cores/socket > system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel > 10GbE single port network adapter. 
> > There's a lot of data and I hope this is the clearest way to report > it. The remote host to VM results are first followed by the local > VM to VM results. Looks reasonable as far as presentation goes. Might have included a summary table of the various peaks: TCP_RR Remote Host to VM: Inst - Base - -Multi-Worker- - Per-CPU - VMs /VM Score Eff Score Eff Score Eff 1 60 117,448 3,929 148,330 3,616 137,996 3,898 4 60 308,838 3,555 170,486 1,738 285,073 2,988 12 60 156,868 1,574 152,205 1,527 223,701 2,250 24 60 144,684 1,457 146,788 1,468 240,963 2,513 Given the KVM host machine is 8 cores with hyperthreading disabled, I might have included a data point at 8 VMs even if they were 2 vCPU VMs, but that is just my gut talking. Certainly looking at the summary table I'm wondering where between 4 and 12 VMs the curve starts its downward trend. Does 12 and 24, 2vCPU VMs force moving around more than say 16 or 32 would? happy benchmarking, rick jones > > > Remote Host to VM: > Host to 1 VM > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041 > 10 63,919 3,095 51,841 2,415 55,226 2,884 > 30 85,646 3,288 127,277 3,242 145,644 4,092 > 60 117,448 3,929 148,330 3,616 137,996 3,898 > > UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150 > 10 53,989 3,082 59,590 2,875 52,353 3,328 > 30 91,484 4,115 95,312 3,042 110,715 3,659 > 60 107,466 4,689 173,443 4,351 158,141 4,235 > > TCP_STREAM > 256 1 2,724 140 2,450 131 2,681 150 > 4 5,027 137 4,147 146 3,998 117 > > 1024 1 5,602 235 4,623 169 5,425 238 > 4 5,987 212 5,991 133 6,827 175 > > 4096 1 6,202 256 6,753 211 7,247 279 > 4 4,996 192 5,771 159 7,124 202 > > 16384 1 6,258 259 7,211 214 8,453 308 > 4 4,591 179 5,788 181 6,925 217 > > TCP_MAERTS > 256 1 1,951 85 1,871 89 1,899 97 > 4 4,757 129 4,102 140 4,279 116 > > 1024 1 7,479 381 6,970 371 7,374 427 > 4 8,931 385 6,612 258 8,731 417 > > 4096 1 9,276 464 9,296 456 9,131 510 > 4 9,381 452 9,032 367 9,338 446 > > 
16384 1 9,153 496 8,817 589 9,238 516 > 4 9,358 478 9,006 367 9,350 462 > > Host to 1 VM (VM pinned to a socket) > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 9,992 1,019 9,899 917 8,963 899 > 10 60,731 3,236 60,015 2,444 55,860 3,059 > 30 127,375 4,042 146,571 3,922 163,806 4,389 > 60 173,021 4,972 149,549 4,662 161,397 4,330 > > UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206 > 10 68,128 3,804 64,335 4,067 53,343 3,233 > 30 92,456 3,994 112,101 4,219 111,610 3,598 > 60 135,741 4,590 184,441 4,422 184,527 4,546 > > TCP_STREAM > 256 1 2,564 146 2,530 147 2,497 150 > 4 4,757 139 4,300 127 4,245 124 > > 1024 1 4,700 209 6,062 323 5,627 247 > 4 6,828 214 7,125 153 6,561 172 > > 4096 1 6,676 281 7,672 286 7,760 290 > 4 6,258 236 6,410 171 7,354 225 > > 16384 1 6,712 289 8,217 297 8,457 322 > 4 5,764 235 6,285 200 7,554 245 > > TCP_MAERTS > 256 1 1,673 82 1,444 71 1,756 88 > 4 6,385 175 5,671 155 5,685 153 > > 1024 1 7,500 427 6,884 414 7,640 429 > 4 9,310 444 8,659 496 8,200 350 > > 4096 1 8,427 477 9,201 515 8,825 422 > 4 9,372 478 9,184 394 9,391 446 > > 16384 1 8,840 500 9,205 555 9,239 482 > 4 9,379 495 9,079 385 9,389 472 > > Host to 4 VMs > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 38,635 949 34,063 843 35,432 897 > 10 193,703 2,604 157,699 1,841 180,323 2,858 > 30 279,736 3,301 170,343 1,739 269,827 2,875 > 60 308,838 3,555 170,486 1,738 285,073 2,988 > > UDP_RR 1 42,209 1,136 36,035 904 36,974 975 > 10 177,286 2,616 166,999 2,043 178,470 2,466 > 30 296,415 3,731 221,738 2,488 260,630 2,966 > 60 353,784 4,179 209,489 2,152 306,792 3,440 > > TCP_STREAM > 256 1 8,409 113 7,517 101 7,178 115 > 4 8,963 93 7,825 80 8,606 91 > > 1024 1 9,382 119 10,223 192 9,314 128 > 4 9,233 101 9,085 110 8,585 105 > > 4096 1 9,391 124 9,393 125 9,300 140 > 4 9,303 103 9,151 102 8,601 106 > > 16384 1 9,395 121 8,715 128 9,378 135 > 4 9,322 105 9,135 101 8,691 121 > > TCP_MAERTS > 256 1 8,629 125 
7,045 112 7,559 109 > 4 9,389 145 7,091 80 9,335 156 > > 1024 1 9,385 201 9,349 148 9,320 248 > 4 9,392 154 9,340 148 9,390 226 > > 4096 1 9,387 239 9,339 151 9,379 291 > 4 9,392 167 9,389 124 9,390 259 > > 16384 1 9,374 236 9,366 150 9,391 317 > 4 9,365 167 9,394 123 9,390 284 > > Host to 12 VMs > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 79,628 928 85,717 944 72,760 885 > 10 106,348 1,067 94,032 944 164,548 2,017 > 30 131,313 1,318 116,431 1,168 206,560 2,367 > 60 156,868 1,574 152,205 1,527 223,701 2,250 > > UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919 > 10 149,381 1,499 113,254 1,136 194,153 1,951 > 30 177,803 1,783 132,818 1,333 235,682 2,370 > 60 201,833 2,025 154,871 1,554 258,133 2,595 > > TCP_STREAM > 256 1 8,549 86 7,173 72 8,407 85 > 4 8,910 89 8,693 87 8,768 88 > > 1024 1 9,397 95 9,371 94 9,376 95 > 4 9,289 93 9,268 100 8,898 92 > > 4096 1 9,399 95 9,415 95 9,401 97 > 4 9,336 94 9,319 94 8,938 94 > > 16384 1 9,405 95 9,402 96 9,397 102 > 4 9,366 94 9,345 94 8,890 94 > > TCP_MAERTS > 256 1 4,646 49 2,273 23 9,232 135 > 4 9,393 107 8,019 81 9,414 134 > > 1024 1 9,393 115 9,403 104 9,399 178 > 4 9,406 110 9,383 98 9,392 157 > > 4096 1 9,393 114 9,409 104 9,388 202 > 4 9,388 110 9,387 98 9,382 181 > > 16384 1 9,396 114 9,391 104 9,394 221 > 4 9,411 110 9,384 98 9,391 192 > > Host to 24 VMs > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805 > 10 94,757 948 90,872 915 156,821 1,581 > 30 119,904 1,199 120,728 1,207 214,151 2,211 > 60 144,684 1,457 146,788 1,468 240,963 2,513 > > UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914 > 10 119,204 1,201 104,645 1,046 208,432 2,340 > 30 158,887 1,601 136,629 1,366 249,329 2,517 > 60 179,365 1,794 159,883 1,610 259,018 2,651 > > TCP_STREAM > 256 1 5,899 59 4,258 44 8,071 82 > 4 8,739 89 8,195 83 7,934 82 > > 1024 1 8,477 86 7,498 76 9,268 93 > 4 9,205 93 9,171 94 8,159 84 > > 4096 1 9,334 96 
8,992 92 9,324 97 > 4 9,255 95 9,221 92 8,237 85 > > 16384 1 9,373 96 9,356 95 9,311 96 > 4 9,283 94 9,275 93 8,317 86 > > TCP_MAERTS > 256 1 739 7 770 8 9,186 129 > 4 7,804 79 7,573 76 9,253 122 > > 1024 1 1,763 18 1,759 18 9,287 146 > 4 9,204 99 9,166 93 9,389 155 > > 4096 1 3,430 35 3,403 35 9,348 161 > 4 9,372 100 9,315 95 9,385 151 > > 16384 1 9,309 102 9,306 97 9,353 175 > 4 9,378 100 9,392 96 9,377 159 > > > > Local VM to VM: > > 1 VM to 1 VM > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 7,422 506 7,698 462 6,281 450 > 10 49,662 1,362 47,553 1,205 43,258 1,270 > 30 91,657 1,538 99,319 1,471 89,478 1,499 > 60 106,168 1,658 106,430 1,503 99,205 1,576 > > UDP_RR 1 8,414 552 8,532 528 6,976 499 > 10 58,359 1,645 55,283 1,398 48,094 1,457 > 30 91,046 1,736 109,403 1,721 92,109 1,715 > 60 128,835 2,021 130,382 1,807 118,563 1,853 > > TCP_STREAM > 256 1 2,029 60 1,923 54 1,998 64 > 4 3,861 66 3,445 53 2,914 54 > > 1024 1 7,374 205 6,465 174 5,704 165 > 4 8,474 196 7,541 161 6,274 156 > > 4096 1 12,825 295 11,921 275 10,262 262 > 4 12,639 253 13,395 260 11,451 264 > > 16384 1 14,576 331 14,141 291 11,925 305 > 4 16,016 327 14,210 274 13,656 308 > > > 1 VM to 1 VM (each VM pinned to a socket) > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 7,145 489 7,840 477 5,965 467 > 10 51,016 1,406 47,881 1,223 45,232 1,288 > 30 92,785 1,580 103,453 1,512 91,437 1,523 > 60 120,160 1,817 115,058 1,595 102,734 1,611 > > UDP_RR 1 7,908 547 8,704 541 6,552 528 > 10 59,807 1,653 56,598 1,435 50,524 1,488 > 30 90,302 1,738 113,861 1,765 94,640 1,720 > 60 141,684 2,196 141,866 1,919 125,334 1,917 > > TCP_STREAM > 256 1 2,210 64 1,291 32 2,069 64 > 4 3,993 64 3,441 52 2,780 50 > > 1024 1 8,106 217 7,571 198 5,709 165 > 4 8,471 206 8,756 174 6,531 157 > > 4096 1 15,360 350 13,825 303 10,717 271 > 4 14,671 330 12,604 263 11,266 258 > > 16384 1 18,284 395 16,305 337 13,185 317 > 4 15,451 331 12,438 247 
14,699 316 > > > 2 VMs to 2 VMs (4 VMs total) > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 15,498 491 16,518 460 13,008 441 > 10 71,425 983 79,711 1,063 85,087 1,037 > 30 102,132 1,436 82,191 1,145 100,504 1,076 > 60 127,670 1,608 96,815 1,262 104,694 1,119 > > UDP_RR 1 17,091 548 18,214 538 14,780 492 > 10 77,682 1,129 87,523 1,235 86,755 1,165 > 30 131,830 1,826 92,844 1,327 111,839 1,232 > 60 145,688 1,952 111,315 1,520 116,358 1,296 > > TCP_STREAM > 256 1 5,085 72 3,900 50 2,430 38 > 4 6,622 70 4,337 48 5,032 58 > > 1024 1 15,262 206 15,022 195 7,000 115 > 4 14,205 174 15,288 174 11,030 148 > > 4096 1 15,020 197 21,694 261 13,583 198 > 4 16,818 205 16,076 195 17,175 238 > > 16384 1 19,671 261 23,699 290 22,396 306 > 4 18,648 229 17,901 218 17,122 251 > > 6 VMs to 6 VMs (12 VMs total) > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 30,242 400 32,281 390 27,737 401 > 10 73,461 783 61,856 644 93,259 1,000 > 30 98,638 1,034 81,799 844 107,022 1,121 > 60 114,238 1,200 91,772 944 110,839 1,152 > > UDP_RR 1 33,017 438 35,540 429 30,022 438 > 10 84,676 910 67,838 711 112,339 1,220 > 30 110,799 1,156 90,555 932 128,928 1,357 > 60 129,679 1,354 100,715 1,033 136,503 1,429 > > TCP_STREAM > 256 1 6,947 72 5,380 56 6,138 72 > 4 8,400 85 7,660 77 8,893 89 > > 1024 1 13,698 146 10,307 108 13,023 158 > 4 15,391 157 13,242 135 17,264 182 > > 4096 1 18,928 202 14,580 154 16,970 189 > 4 18,826 191 17,262 175 19,558 212 > > 16384 1 22,176 234 17,716 187 21,245 243 > 4 21,306 215 20,332 206 18,353 227 > > 12 VMs to 12 VMs (24 VMs total) > - Base - -Multi-Worker- - Per-CPU - > Test Inst Score Eff Score Eff Score Eff > TCP_RR 1 72,926 731 67,338 675 32,662 387 > 10 62,441 625 59,277 594 87,286 891 > 30 72,761 728 67,760 679 102,549 1,041 > 60 78,087 782 74,654 748 100,687 1,016 > > UDP_RR 1 82,662 829 80,875 810 34,915 421 > 10 71,424 716 67,754 679 111,753 1,147 > 30 79,495 796 75,512 756 
134,576 1,372 > 60 83,339 835 77,523 778 137,058 1,390 > > TCP_STREAM > 256 1 2,870 29 2,631 26 7,907 80 > 4 8,424 84 8,026 80 8,929 90 > > 1024 1 3,674 37 3,121 31 15,644 164 > 4 14,256 143 13,342 134 16,116 168 > > 4096 1 5,068 51 4,366 44 16,179 168 > 4 17,015 171 16,321 164 17,940 186 > > 16384 1 9,768 98 9,025 90 19,233 203 > 4 18,981 190 18,202 183 18,964 203 > > > On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote: >> Resubmit it with the right format. >> >> Signed-off-by: Shirley Ma<xma@us.ibm.com> >> Signed-off-by: Krishna Kumar<krkumar2@in.ibm.com> >> Tested-by: Tom Lendacky<toml@us.ibm.com> >> --- >> >> drivers/vhost/net.c | 26 ++- >> drivers/vhost/vhost.c | 300 >> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h | >> 16 ++- >> 3 files changed, 243 insertions(+), 103 deletions(-) >> > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 19:00 ` Rick Jones @ 2012-03-23 21:10 ` Thomas Lendacky 2012-03-23 21:21 ` Rick Jones 0 siblings, 1 reply; 8+ messages in thread From: Thomas Lendacky @ 2012-03-23 21:10 UTC (permalink / raw) To: Rick Jones; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm On Friday, March 23, 2012 12:00:54 PM Rick Jones wrote: > On 03/23/2012 11:32 AM, Thomas Lendacky wrote: > > I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests > > against the recent vhost patches. For simplicity, the patches > > submitted by Anthony that increase the number of threads per vhost > > instance I will call multi-worker and the patches submitted by Shirley > > that provide a vhost thread per cpu I will call per-cpu. > > Lots of nice data there - kudos. > > > Quick description of the tests: > > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30 > > and 60 instances > > There is a point, not quite sure where, when aggregate, synchronous > single-transaction netperf tests become as much a context switching test > as a networking test. That is why netperf RR has support for the "burst > mode" to have more than one transaction in flight at one time: > > http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002de > nable_002dburst > > When looking to measure packet/transaction per second scaling I've taken > to finding the peak for a single stream by running up the burst size, > (TCP_NODELAY set) and then running 1, 2, 4 etc of those streams. With > the occasional ethtool -S audit to make sure that each TCP_RR > transaction is indeed a discrete pair of TCP segments... > > In addition to avoiding concerns about becoming a context switching > exercise, the reduction in netperf instances means less chance for skew > error on startup and shutdown. 
To address that I've somewhat recently > taken to using demo mode in netperf and then post-processing the results > through rrdtool: > > http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002de > nable_002ddemo > > I have a "one to many" script for that under: > > http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh > > which is then post-processed via some stone knives and bearskins: > http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh > http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk > http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk > > I've also used that basic idea in some many to many tests involving 512 > concurrent netperf instances but that script isn't up on netperf.org. > > > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes > > and 1 and 4 instances > > Netperf's own documentation and output is probably not good on this > point (feel free to loose petards, though some instances may be cast in > stone) but those aren't really message sizes. They are simply the > quantity of data netperf is presenting to the transport in any one send > call. They are send sizes. > > > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests > > running between an external host and each VM. > > I suppose it is implicit, and I'm just being pedantic/paranoid but you > are confident of the limits of the external host? Yes I am. It's pretty much an identical system to the KVM host and has demonstrated much greater performance when running bare-metal scenarios. Plenty of CPU left on all cores, etc. > > > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests > > running between VM pairs on the same host (no TCP_MAERTS done in > > this situation). > > > > For TCP_RR and UDP_RR tests I report the transaction rate as the > > score and the transaction rate / KVMhost CPU% as the efficiency. 
> > > > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps > > as the score and the throughput / KVMhost CPU% as the efficiency. > > > > The KVM host machine is a nehalem-based 2-socket, 4-cores/socket > > system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel > > 10GbE single port network adapter. > > > > There's a lot of data and I hope this is the clearest way to report > > it. The remote host to VM results are first followed by the local > > VM to VM results. > > Looks reasonable as far as presentation goes. Might have included a > summary table of the various peaks: > > TCP_RR Remote Host to VM: > Inst - Base - -Multi-Worker- - Per-CPU - > VMs /VM Score Eff Score Eff Score Eff > 1 60 117,448 3,929 148,330 3,616 137,996 3,898 > 4 60 308,838 3,555 170,486 1,738 285,073 2,988 > 12 60 156,868 1,574 152,205 1,527 223,701 2,250 > 24 60 144,684 1,457 146,788 1,468 240,963 2,513 > That's a good suggestion. I also have geometric mean comparisons to the baseline (with greater than 100% indicating an improvement and less than 100% indicating regression). Remote: -Multi-Worker- - Per-CPU - VMs Score Eff Score Eff 1 105% 91% 109% 103% 1 (pinned) 102% 94% 103% 95% 4 84% 76% 95% 103% 12 91% 88% 113% 129% 24 95% 94% 135% 149% Overall 95% 88% 110% 114% Local: -Multi-Worker- - Per-CPU - VMs Score Eff Score Eff 1 98% 90% 86% 93% 1 (pinned) 94% 85% 82% 87% 4 94% 91% 86% 86% 12 85% 84% 103% 109% 24 93% 93% 141% 148% Overall 93% 89% 97% 102% Combined: 94% 88% 104% 108% > Given the KVM host machine is 8 cores with hyperthreading disabled, I > might have included a data point at 8 VMs even if they were 2 vCPU VMs, > but that is just my gut talking. Certainly looking at the summary table > I'm wondering where between 4 and 12 VMs the curve starts its downward > trend. Does 12 and 24, 2vCPU VMs force moving around more than say 16 > or 32 would? Yeah, it becomes a question of time. 
I run each test 3 times and average the results, so to run the full suite takes a long time. Thanks, Tom > > happy benchmarking, > > rick jones > > > Remote Host to VM: > > Host to 1 VM > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 9,587 984 9,725 1,145 9,252 1,041 > > > > 10 63,919 3,095 51,841 2,415 55,226 2,884 > > 30 85,646 3,288 127,277 3,242 145,644 4,092 > > 60 117,448 3,929 148,330 3,616 137,996 3,898 > > > > UDP_RR 1 10,815 1,174 10,125 1,255 7,913 1,150 > > > > 10 53,989 3,082 59,590 2,875 52,353 3,328 > > 30 91,484 4,115 95,312 3,042 110,715 3,659 > > 60 107,466 4,689 173,443 4,351 158,141 4,235 > > > > TCP_STREAM > > > > 256 1 2,724 140 2,450 131 2,681 150 > > > > 4 5,027 137 4,147 146 3,998 117 > > > > 1024 1 5,602 235 4,623 169 5,425 238 > > > > 4 5,987 212 5,991 133 6,827 175 > > > > 4096 1 6,202 256 6,753 211 7,247 279 > > > > 4 4,996 192 5,771 159 7,124 202 > > > > 16384 1 6,258 259 7,211 214 8,453 308 > > > > 4 4,591 179 5,788 181 6,925 217 > > > > TCP_MAERTS > > > > 256 1 1,951 85 1,871 89 1,899 97 > > > > 4 4,757 129 4,102 140 4,279 116 > > > > 1024 1 7,479 381 6,970 371 7,374 427 > > > > 4 8,931 385 6,612 258 8,731 417 > > > > 4096 1 9,276 464 9,296 456 9,131 510 > > > > 4 9,381 452 9,032 367 9,338 446 > > > > 16384 1 9,153 496 8,817 589 9,238 516 > > > > 4 9,358 478 9,006 367 9,350 462 > > > > Host to 1 VM (VM pinned to a socket) > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 9,992 1,019 9,899 917 8,963 899 > > > > 10 60,731 3,236 60,015 2,444 55,860 3,059 > > 30 127,375 4,042 146,571 3,922 163,806 4,389 > > 60 173,021 4,972 149,549 4,662 161,397 4,330 > > > > UDP_RR 1 10,854 1,253 7,983 1,120 7,647 1,206 > > > > 10 68,128 3,804 64,335 4,067 53,343 3,233 > > 30 92,456 3,994 112,101 4,219 111,610 3,598 > > 60 135,741 4,590 184,441 4,422 184,527 4,546 > > > > TCP_STREAM > > > > 256 1 2,564 146 2,530 147 2,497 150 > > > > 4 4,757 
139 4,300 127 4,245 124 > > > > 1024 1 4,700 209 6,062 323 5,627 247 > > > > 4 6,828 214 7,125 153 6,561 172 > > > > 4096 1 6,676 281 7,672 286 7,760 290 > > > > 4 6,258 236 6,410 171 7,354 225 > > > > 16384 1 6,712 289 8,217 297 8,457 322 > > > > 4 5,764 235 6,285 200 7,554 245 > > > > TCP_MAERTS > > > > 256 1 1,673 82 1,444 71 1,756 88 > > > > 4 6,385 175 5,671 155 5,685 153 > > > > 1024 1 7,500 427 6,884 414 7,640 429 > > > > 4 9,310 444 8,659 496 8,200 350 > > > > 4096 1 8,427 477 9,201 515 8,825 422 > > > > 4 9,372 478 9,184 394 9,391 446 > > > > 16384 1 8,840 500 9,205 555 9,239 482 > > > > 4 9,379 495 9,079 385 9,389 472 > > > > Host to 4 VMs > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 38,635 949 34,063 843 35,432 897 > > > > 10 193,703 2,604 157,699 1,841 180,323 2,858 > > 30 279,736 3,301 170,343 1,739 269,827 2,875 > > 60 308,838 3,555 170,486 1,738 285,073 2,988 > > > > UDP_RR 1 42,209 1,136 36,035 904 36,974 975 > > > > 10 177,286 2,616 166,999 2,043 178,470 2,466 > > 30 296,415 3,731 221,738 2,488 260,630 2,966 > > 60 353,784 4,179 209,489 2,152 306,792 3,440 > > > > TCP_STREAM > > > > 256 1 8,409 113 7,517 101 7,178 115 > > > > 4 8,963 93 7,825 80 8,606 91 > > > > 1024 1 9,382 119 10,223 192 9,314 128 > > > > 4 9,233 101 9,085 110 8,585 105 > > > > 4096 1 9,391 124 9,393 125 9,300 140 > > > > 4 9,303 103 9,151 102 8,601 106 > > > > 16384 1 9,395 121 8,715 128 9,378 135 > > > > 4 9,322 105 9,135 101 8,691 121 > > > > TCP_MAERTS > > > > 256 1 8,629 125 7,045 112 7,559 109 > > > > 4 9,389 145 7,091 80 9,335 156 > > > > 1024 1 9,385 201 9,349 148 9,320 248 > > > > 4 9,392 154 9,340 148 9,390 226 > > > > 4096 1 9,387 239 9,339 151 9,379 291 > > > > 4 9,392 167 9,389 124 9,390 259 > > > > 16384 1 9,374 236 9,366 150 9,391 317 > > > > 4 9,365 167 9,394 123 9,390 284 > > > > Host to 12 VMs > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 79,628 
928 85,717 944 72,760 885 > > > > 10 106,348 1,067 94,032 944 164,548 2,017 > > 30 131,313 1,318 116,431 1,168 206,560 2,367 > > 60 156,868 1,574 152,205 1,527 223,701 2,250 > > > > UDP_RR 1 90,762 1,059 93,904 1,037 75,512 919 > > > > 10 149,381 1,499 113,254 1,136 194,153 1,951 > > 30 177,803 1,783 132,818 1,333 235,682 2,370 > > 60 201,833 2,025 154,871 1,554 258,133 2,595 > > > > TCP_STREAM > > > > 256 1 8,549 86 7,173 72 8,407 85 > > > > 4 8,910 89 8,693 87 8,768 88 > > > > 1024 1 9,397 95 9,371 94 9,376 95 > > > > 4 9,289 93 9,268 100 8,898 92 > > > > 4096 1 9,399 95 9,415 95 9,401 97 > > > > 4 9,336 94 9,319 94 8,938 94 > > > > 16384 1 9,405 95 9,402 96 9,397 102 > > > > 4 9,366 94 9,345 94 8,890 94 > > > > TCP_MAERTS > > > > 256 1 4,646 49 2,273 23 9,232 135 > > > > 4 9,393 107 8,019 81 9,414 134 > > > > 1024 1 9,393 115 9,403 104 9,399 178 > > > > 4 9,406 110 9,383 98 9,392 157 > > > > 4096 1 9,393 114 9,409 104 9,388 202 > > > > 4 9,388 110 9,387 98 9,382 181 > > > > 16384 1 9,396 114 9,391 104 9,394 221 > > > > 4 9,411 110 9,384 98 9,391 192 > > > > Host to 24 VMs > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 110,139 1,118 101,765 1,033 79,189 805 > > > > 10 94,757 948 90,872 915 156,821 1,581 > > 30 119,904 1,199 120,728 1,207 214,151 2,211 > > 60 144,684 1,457 146,788 1,468 240,963 2,513 > > > > UDP_RR 1 129,655 1,316 120,071 1,201 91,208 914 > > > > 10 119,204 1,201 104,645 1,046 208,432 2,340 > > 30 158,887 1,601 136,629 1,366 249,329 2,517 > > 60 179,365 1,794 159,883 1,610 259,018 2,651 > > > > TCP_STREAM > > > > 256 1 5,899 59 4,258 44 8,071 82 > > > > 4 8,739 89 8,195 83 7,934 82 > > > > 1024 1 8,477 86 7,498 76 9,268 93 > > > > 4 9,205 93 9,171 94 8,159 84 > > > > 4096 1 9,334 96 8,992 92 9,324 97 > > > > 4 9,255 95 9,221 92 8,237 85 > > > > 16384 1 9,373 96 9,356 95 9,311 96 > > > > 4 9,283 94 9,275 93 8,317 86 > > > > TCP_MAERTS > > > > 256 1 739 7 770 8 9,186 129 > > > > 4 7,804 79 
7,573 76 9,253 122 > > > > 1024 1 1,763 18 1,759 18 9,287 146 > > > > 4 9,204 99 9,166 93 9,389 155 > > > > 4096 1 3,430 35 3,403 35 9,348 161 > > > > 4 9,372 100 9,315 95 9,385 151 > > > > 16384 1 9,309 102 9,306 97 9,353 175 > > > > 4 9,378 100 9,392 96 9,377 159 > > > > Local VM to VM: > > 1 VM to 1 VM > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 7,422 506 7,698 462 6,281 450 > > > > 10 49,662 1,362 47,553 1,205 43,258 1,270 > > 30 91,657 1,538 99,319 1,471 89,478 1,499 > > 60 106,168 1,658 106,430 1,503 99,205 1,576 > > > > UDP_RR 1 8,414 552 8,532 528 6,976 499 > > > > 10 58,359 1,645 55,283 1,398 48,094 1,457 > > 30 91,046 1,736 109,403 1,721 92,109 1,715 > > 60 128,835 2,021 130,382 1,807 118,563 1,853 > > > > TCP_STREAM > > > > 256 1 2,029 60 1,923 54 1,998 64 > > > > 4 3,861 66 3,445 53 2,914 54 > > > > 1024 1 7,374 205 6,465 174 5,704 165 > > > > 4 8,474 196 7,541 161 6,274 156 > > > > 4096 1 12,825 295 11,921 275 10,262 262 > > > > 4 12,639 253 13,395 260 11,451 264 > > > > 16384 1 14,576 331 14,141 291 11,925 305 > > > > 4 16,016 327 14,210 274 13,656 308 > > > > 1 VM to 1 VM (each VM pinned to a socket) > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 7,145 489 7,840 477 5,965 467 > > > > 10 51,016 1,406 47,881 1,223 45,232 1,288 > > 30 92,785 1,580 103,453 1,512 91,437 1,523 > > 60 120,160 1,817 115,058 1,595 102,734 1,611 > > > > UDP_RR 1 7,908 547 8,704 541 6,552 528 > > > > 10 59,807 1,653 56,598 1,435 50,524 1,488 > > 30 90,302 1,738 113,861 1,765 94,640 1,720 > > 60 141,684 2,196 141,866 1,919 125,334 1,917 > > > > TCP_STREAM > > > > 256 1 2,210 64 1,291 32 2,069 64 > > > > 4 3,993 64 3,441 52 2,780 50 > > > > 1024 1 8,106 217 7,571 198 5,709 165 > > > > 4 8,471 206 8,756 174 6,531 157 > > > > 4096 1 15,360 350 13,825 303 10,717 271 > > > > 4 14,671 330 12,604 263 11,266 258 > > > > 16384 1 18,284 395 16,305 337 13,185 317 > > > > 4 
15,451 331 12,438 247 14,699 316 > > > > 2 VMs to 2 VMs (4 VMs total) > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 15,498 491 16,518 460 13,008 441 > > > > 10 71,425 983 79,711 1,063 85,087 1,037 > > 30 102,132 1,436 82,191 1,145 100,504 1,076 > > 60 127,670 1,608 96,815 1,262 104,694 1,119 > > > > UDP_RR 1 17,091 548 18,214 538 14,780 492 > > > > 10 77,682 1,129 87,523 1,235 86,755 1,165 > > 30 131,830 1,826 92,844 1,327 111,839 1,232 > > 60 145,688 1,952 111,315 1,520 116,358 1,296 > > > > TCP_STREAM > > > > 256 1 5,085 72 3,900 50 2,430 38 > > > > 4 6,622 70 4,337 48 5,032 58 > > > > 1024 1 15,262 206 15,022 195 7,000 115 > > > > 4 14,205 174 15,288 174 11,030 148 > > > > 4096 1 15,020 197 21,694 261 13,583 198 > > > > 4 16,818 205 16,076 195 17,175 238 > > > > 16384 1 19,671 261 23,699 290 22,396 306 > > > > 4 18,648 229 17,901 218 17,122 251 > > > > 6 VMs to 6 VMs (12 VMs total) > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 30,242 400 32,281 390 27,737 401 > > > > 10 73,461 783 61,856 644 93,259 1,000 > > 30 98,638 1,034 81,799 844 107,022 1,121 > > 60 114,238 1,200 91,772 944 110,839 1,152 > > > > UDP_RR 1 33,017 438 35,540 429 30,022 438 > > > > 10 84,676 910 67,838 711 112,339 1,220 > > 30 110,799 1,156 90,555 932 128,928 1,357 > > 60 129,679 1,354 100,715 1,033 136,503 1,429 > > > > TCP_STREAM > > > > 256 1 6,947 72 5,380 56 6,138 72 > > > > 4 8,400 85 7,660 77 8,893 89 > > > > 1024 1 13,698 146 10,307 108 13,023 158 > > > > 4 15,391 157 13,242 135 17,264 182 > > > > 4096 1 18,928 202 14,580 154 16,970 189 > > > > 4 18,826 191 17,262 175 19,558 212 > > > > 16384 1 22,176 234 17,716 187 21,245 243 > > > > 4 21,306 215 20,332 206 18,353 227 > > > > 12 VMs to 12 VMs (24 VMs total) > > > > - Base - -Multi-Worker- - Per-CPU - > > > > Test Inst Score Eff Score Eff Score Eff > > TCP_RR 1 72,926 731 67,338 675 32,662 387 > > > > 10 62,441 625 
59,277 594 87,286 891 > > 30 72,761 728 67,760 679 102,549 1,041 > > 60 78,087 782 74,654 748 100,687 1,016 > > > > UDP_RR 1 82,662 829 80,875 810 34,915 421 > > > > 10 71,424 716 67,754 679 111,753 1,147 > > 30 79,495 796 75,512 756 134,576 1,372 > > 60 83,339 835 77,523 778 137,058 1,390 > > > > TCP_STREAM > > > > 256 1 2,870 29 2,631 26 7,907 80 > > > > 4 8,424 84 8,026 80 8,929 90 > > > > 1024 1 3,674 37 3,121 31 15,644 164 > > > > 4 14,256 143 13,342 134 16,116 168 > > > > 4096 1 5,068 51 4,366 44 16,179 168 > > > > 4 17,015 171 16,321 164 17,940 186 > > > > 16384 1 9,768 98 9,025 90 19,233 203 > > > > 4 18,981 190 18,202 183 18,964 203 > > > > On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote: > >> Resubmit it with the right format. > >> > >> Signed-off-by: Shirley Ma<xma@us.ibm.com> > >> Signed-off-by: Krishna Kumar<krkumar2@in.ibm.com> > >> Tested-by: Tom Lendacky<toml@us.ibm.com> > >> --- > >> > >> drivers/vhost/net.c | 26 ++- > >> drivers/vhost/vhost.c | 300 > >> > >> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h | > >> 16 ++- > >> > >> 3 files changed, 243 insertions(+), 103 deletions(-) > > > > -- > > To unsubscribe from this list: send the line "unsubscribe netdev" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html Tom Thomas Lendacky Linux Technology Center - Performance ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 21:10 ` Thomas Lendacky @ 2012-03-23 21:21 ` Rick Jones 0 siblings, 0 replies; 8+ messages in thread From: Rick Jones @ 2012-03-23 21:21 UTC (permalink / raw) To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm > > Yeah, it becomes a question of time. I run each test 3 times and > average the results, so to run the full suite takes a long time. I've found the "walk up the instance count with the interim results emitted" allows me quicker overall run time than launching all the netperfs at once with a long run time to kludge around skew. Well modulo the time it takes to get them all launched. But for the smallish stuff it is rather faster than the 15 minutes a data point I'd get with the (ab)use of the confidence intervals mechanism in runemomniagg2.sh . It also avoids the "run one wait for it to finish, run two, wait for them to finish, run four, wait for them to finish" bit. Walking-up the instance count leaving the previous instances going does mean that the "end of test" information is full of skew, but a great deal of that end-of-test information is invariant anyway. happy benchmarking, rick jones ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 18:32 ` Thomas Lendacky 2012-03-23 19:00 ` Rick Jones @ 2012-03-23 23:45 ` David Ahern 2012-03-27 14:34 ` Thomas Lendacky 1 sibling, 1 reply; 8+ messages in thread From: David Ahern @ 2012-03-23 23:45 UTC (permalink / raw) To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm On 3/23/12 12:32 PM, Thomas Lendacky wrote: > Quick description of the tests: > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30 > and 60 instances > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes > and 1 and 4 instances > > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests > running between an external host and each VM. > > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests > running between VM pairs on the same host (no TCP_MAERTS done in > this situation). > > For TCP_RR and UDP_RR tests I report the transaction rate as the > score and the transaction rate / KVMhost CPU% as the efficiency. > > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps > as the score and the throughput / KVMhost CPU% as the efficiency. Would you mind sharing the netperf commands you are running and an example of the math done to arrive at the summaries presented? David ^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread 2012-03-23 23:45 ` David Ahern @ 2012-03-27 14:34 ` Thomas Lendacky 0 siblings, 0 replies; 8+ messages in thread From: Thomas Lendacky @ 2012-03-27 14:34 UTC (permalink / raw) To: David Ahern; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm On Friday, March 23, 2012 05:45:40 PM David Ahern wrote: > On 3/23/12 12:32 PM, Thomas Lendacky wrote: > > Quick description of the tests: > > TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30 > > and 60 instances > > TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes > > and 1 and 4 instances > > > > Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests > > running between an external host and each VM. > > > > Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests > > running between VM pairs on the same host (no TCP_MAERTS done in > > this situation). > > > > For TCP_RR and UDP_RR tests I report the transaction rate as the > > score and the transaction rate / KVMhost CPU% as the efficiency. > > > > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps > > as the score and the throughput / KVMhost CPU% as the efficiency. > > Would you mind sharing the netperf commands you are running and an > example of the math done to arrive at the summaries presented? I'm actually using uperf not netperf. Uperf allows me to launch multiple instances of a test with one executable. I've provided the XML profiles for the tests below. The math is simply taking the score (for TCP_RR it is the transaction rate and for TCP_STREAM/TCP_MAERTS it is the throughput) and dividing by the CPU utilization of the KVM host (obtained from running sar during the test). Here are the uperf profiles that were used. The destination, instances and message sizes are set using environment variables. TCP_RR <?xml version="1.0"?> <!-- Note: uperf reports operations/second. 
A transaction is made up of two operations, so to get transactions/second (like netperf) you must divide the operations/second by 2. --> <profile name="TCP_RR"> <group nprocs="$uperf_instances"> <transaction iterations="1"> <flowop type="connect" options="remotehost=$uperf_dest protocol=tcp"/> </transaction> <transaction duration="$uperf_duration"> <flowop type="write" options="size=$uperf_tx_msgsize"/> <flowop type="read" options="size=$uperf_rx_msgsize"/> </transaction> <transaction iterations="1"> <flowop type="disconnect" /> </transaction> </group> </profile> UDP_RR: <?xml version="1.0"?> <!-- Note: uperf reports operations/second. A transaction is made up of two operations, so to get transactions/second (like netperf) you must divide the operations/second by 2. --> <profile name="UDP_RR"> <group nprocs="$uperf_instances"> <transaction iterations="1"> <flowop type="connect" options="remotehost=$uperf_dest protocol=udp"/> </transaction> <transaction duration="$uperf_duration"> <flowop type="write" options="size=$uperf_tx_msgsize"/> <flowop type="read" options="size=$uperf_rx_msgsize"/> </transaction> <transaction iterations="1"> <flowop type="disconnect" /> </transaction> </group> </profile> TCP_STREAM: <?xml version="1.0"?> <profile name="TCP_STREAM"> <group nprocs="$uperf_instances"> <transaction iterations="1"> <flowop type="connect" options="remotehost=$uperf_dest protocol=tcp"/> </transaction> <transaction duration="$uperf_duration"> <flowop type="write" options="count=16 size=$uperf_tx_msgsize"/> </transaction> <transaction iterations="1"> <flowop type="disconnect" /> </transaction> </group> </profile> TCP_MAERTS: <?xml version="1.0"?> <profile name="TCP_MAERTS"> <group nprocs="$uperf_instances"> <transaction iterations="1"> <flowop type="accept" options="remotehost=$uperf_dest protocol=tcp"/> </transaction> <transaction duration="$uperf_duration"> <flowop type="read" options="count=16 size=$uperf_rx_msgsize"/> </transaction> <transaction iterations="1"> 
<flowop type="disconnect" /> </transaction> </group> </profile> Tom > > David > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2012-03-27 14:34 UTC | newest] Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma 2012-03-23 0:16 ` Shirley Ma 2012-03-23 18:32 ` Thomas Lendacky 2012-03-23 19:00 ` Rick Jones 2012-03-23 21:10 ` Thomas Lendacky 2012-03-23 21:21 ` Rick Jones 2012-03-23 23:45 ` David Ahern 2012-03-27 14:34 ` Thomas Lendacky
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).