netdev.vger.kernel.org archive mirror
* [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
@ 2012-03-22 23:48 Shirley Ma
  2012-03-23  0:16 ` Shirley Ma
  0 siblings, 1 reply; 8+ messages in thread
From: Shirley Ma @ 2012-03-22 23:48 UTC (permalink / raw)
  To: Michael S. Tsirkin, netdev, tahm, kvm

Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Tested-by: Tom Lendacky <toml@us.ibm.com>
---

 drivers/vhost/net.c                  |   26 ++-
 drivers/vhost/vhost.c                |  300 ++++++++++++++++++++++++----------
 drivers/vhost/vhost.h                |   16 ++-
 3 files changed, 243 insertions(+), 103 deletions(-)
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9dab1f5..4664e63 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -41,12 +41,6 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Experimental Zero Copy TX");
 #define VHOST_MAX_PEND 128
 #define VHOST_GOODCOPY_LEN 256
 
-enum {
-	VHOST_NET_VQ_RX = 0,
-	VHOST_NET_VQ_TX = 1,
-	VHOST_NET_VQ_MAX = 2,
-};
-
 enum vhost_net_poll_state {
 	VHOST_NET_POLL_DISABLED = 0,
 	VHOST_NET_POLL_STARTED = 1,
@@ -510,8 +504,10 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 		return r;
 	}
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT,
+			&n->vqs[VHOST_NET_VQ_TX]);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN,
+			&n->vqs[VHOST_NET_VQ_RX]);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
 
 	f->private_data = n;
@@ -863,15 +859,27 @@ static struct miscdevice vhost_net_misc = {
 
 static int vhost_net_init(void)
 {
+	int ret;
+
 	if (experimental_zcopytx)
 		vhost_enable_zcopy(VHOST_NET_VQ_TX);
-	return misc_register(&vhost_net_misc);
+
+	ret = misc_register(&vhost_net_misc);
+	if (ret)
+		return ret;
+
+	ret = vhost_init();
+	if (ret)
+		misc_deregister(&vhost_net_misc);
+
+	return ret;
 }
 module_init(vhost_net_init);
 
 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
+	vhost_cleanup();
 }
 module_exit(vhost_net_exit);
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c14c42b..9fabc5a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -24,7 +24,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
-#include <linux/cgroup.h>
+#include <linux/cpu.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -42,6 +42,15 @@ static unsigned vhost_zcopy_mask __read_mostly;
 #define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
 #define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
 
+/* per cpu vhost struct */
+struct vhost {
+	struct task_struct      *worker;
+	spinlock_t              lock;
+	struct list_head        work_list;
+};
+
+static DEFINE_PER_CPU(struct vhost, vhosts);
+
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -64,25 +73,28 @@ static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 	return 0;
 }
 
-static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
+static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn,
+			    struct vhost_virtqueue *vq)
 {
 	INIT_LIST_HEAD(&work->node);
 	work->fn = fn;
 	init_waitqueue_head(&work->done);
 	work->flushing = 0;
 	work->queue_seq = work->done_seq = 0;
+	work->vq = vq;
+	spin_lock_init(&work->lock);
 }
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev)
+		     unsigned long mask, struct vhost_virtqueue *vq)
 {
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
-	poll->dev = dev;
+	poll->dev = vq->dev;
 
-	vhost_work_init(&poll->work, fn);
+	vhost_work_init(&poll->work, fn, vq);
 }
 
 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -108,25 +120,30 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
 {
 	int left;
 
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(&work->lock);
 	left = seq - work->done_seq;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&work->lock);
 	return left <= 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+/* only flushing this work? */
+static void vhost_work_flush(struct vhost_poll *poll)
 {
 	unsigned seq;
 	int flushing;
+	struct vhost_dev *dev = poll->dev;
+	struct vhost_work *work = &poll->work;
 
-	spin_lock_irq(&dev->work_lock);
+	if (list_empty(&work->node))
+		return;
+	spin_lock_irq(&work->lock);
 	seq = work->queue_seq;
 	work->flushing++;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&work->lock);
 	wait_event(work->done, vhost_work_seq_done(dev, work, seq));
-	spin_lock_irq(&dev->work_lock);
+	spin_lock_irq(&work->lock);
 	flushing = --work->flushing;
-	spin_unlock_irq(&dev->work_lock);
+	spin_unlock_irq(&work->lock);
 	BUG_ON(flushing < 0);
 }
 
@@ -134,21 +151,59 @@ static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	vhost_work_flush(poll->dev, &poll->work);
+	vhost_work_flush(poll);
+}
+
+/* pick a cpu on the same socket as, but different from, the given cpu */
+static unsigned long sched_node_cpu(unsigned long cpu)
+{
+	int node, ncpus_node;
+	unsigned long sched_cpu = cpu;
+
+	node = cpu_to_node(cpu);
+	ncpus_node = nr_cpus_node(node);
+	if (ncpus_node != 1) {
+		/* pick up a random cpu on the same node, exclude
+		 * the input one
+		 */
+		sched_cpu = node * ncpus_node + random32() % (ncpus_node - 1);
+		if (sched_cpu >= cpu)
+			++sched_cpu;
+		/* todo hotplug cpu race */
+		if (!cpu_online(sched_cpu))
+			sched_cpu = cpu;
+	}
+	return sched_cpu;
 }
 
 static inline void vhost_work_queue(struct vhost_dev *dev,
 				    struct vhost_work *work)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&dev->work_lock, flags);
+	unsigned long cpu = work->vq->cpu;
+	struct vhost *vhost;
+
+	/* Is it safe to disable vq notify here ? */
+	vhost_disable_notify(dev, work->vq);
+
+	/* schedule the work on the same socket as the cpu the work was
+	 * delivered on, but on a different cpu than that one
+	 */
+	preempt_disable();
+	if (cpu_to_node(cpu) != cpu_to_node(smp_processor_id())) {
+		cpu = sched_node_cpu(smp_processor_id());
+		work->vq->cpu = cpu;
+	}
+	preempt_enable();
+	vhost = &per_cpu(vhosts, cpu);
+	spin_lock_irq(&vhost->lock);
+	spin_lock(&work->lock);
 	if (list_empty(&work->node)) {
-		list_add_tail(&work->node, &dev->work_list);
+		list_add_tail(&work->node, &vhost->work_list);
 		work->queue_seq++;
-		wake_up_process(dev->worker);
+		wake_up_process(vhost->worker);
 	}
-	spin_unlock_irqrestore(&dev->work_lock, flags);
+	spin_unlock(&work->lock);
+	spin_unlock_irq(&vhost->lock);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
@@ -188,17 +243,18 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
-	struct vhost_work *work = NULL;
+	struct vhost *vhost = &__get_cpu_var(vhosts);
+	struct list_head *work_list;
+	struct mm_struct *prev_mm = NULL;
 	unsigned uninitialized_var(seq);
+	struct vhost_work *work = NULL;
 
-	use_mm(dev->mm);
-
+	work_list = &vhost->work_list;
 	for (;;) {
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		spin_lock_irq(&dev->work_lock);
+		spin_lock_irq(&vhost->lock);
 		if (work) {
 			work->done_seq = seq;
 			if (work->flushing)
@@ -206,18 +262,26 @@ static int vhost_worker(void *data)
 		}
 
 		if (kthread_should_stop()) {
-			spin_unlock_irq(&dev->work_lock);
+			spin_unlock_irq(&vhost->lock);
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
-		if (!list_empty(&dev->work_list)) {
-			work = list_first_entry(&dev->work_list,
+		if (!list_empty(work_list)) {
+			work = list_first_entry(work_list,
 						struct vhost_work, node);
+			spin_lock(&work->lock);
 			list_del_init(&work->node);
+			spin_unlock(&work->lock);
 			seq = work->queue_seq;
+			if (prev_mm != work->vq->dev->mm) {
+				if (prev_mm)
+					unuse_mm(prev_mm);
+				prev_mm = work->vq->dev->mm;
+				use_mm(prev_mm);
+			}
 		} else
 			work = NULL;
-		spin_unlock_irq(&dev->work_lock);
+		spin_unlock_irq(&vhost->lock);
 
 		if (work) {
 			__set_current_state(TASK_RUNNING);
@@ -226,7 +290,9 @@ static int vhost_worker(void *data)
 			schedule();
 
 	}
-	unuse_mm(dev->mm);
+
+	if (prev_mm)
+		unuse_mm(prev_mm);
 	return 0;
 }
 
@@ -298,9 +364,6 @@ long vhost_dev_init(struct vhost_dev *dev,
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
-	spin_lock_init(&dev->work_lock);
-	INIT_LIST_HEAD(&dev->work_list);
-	dev->worker = NULL;
 
 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].log = NULL;
@@ -312,7 +375,8 @@ long vhost_dev_init(struct vhost_dev *dev,
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick, POLLIN, dev);
+					dev->vqs[i].handle_kick, POLLIN,
+					&dev->vqs[i]);
 	}
 
 	return 0;
@@ -325,71 +389,35 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 	return dev->mm == current->mm ? 0 : -EPERM;
 }
 
-struct vhost_attach_cgroups_struct {
-	struct vhost_work work;
-	struct task_struct *owner;
-	int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
-	struct vhost_attach_cgroups_struct *s;
-
-	s = container_of(work, struct vhost_attach_cgroups_struct, work);
-	s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
-	struct vhost_attach_cgroups_struct attach;
-
-	attach.owner = current;
-	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_work_flush(dev, &attach.work);
-	return attach.ret;
-}
-
 /* Caller should have device mutex */
 static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-	struct task_struct *worker;
 	int err;
+	unsigned long txcpu, rxcpu;
 
 	/* Is there an owner already? */
 	if (dev->mm) {
 		err = -EBUSY;
-		goto err_mm;
+		goto out;
 	}
 
-	/* No owner, become one */
-	dev->mm = get_task_mm(current);
-	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker)) {
-		err = PTR_ERR(worker);
-		goto err_worker;
-	}
+	err = vhost_dev_alloc_iovecs(dev);
+	if (err)
+		goto out;
 
-	dev->worker = worker;
-	wake_up_process(worker);	/* avoid contributing to loadavg */
+	/* initial txcpu, rxcpu on the same socket */
+	txcpu = sched_node_cpu(smp_processor_id());
+	rxcpu = sched_node_cpu(txcpu);
 
-	err = vhost_attach_cgroups(dev);
-	if (err)
-		goto err_cgroup;
+	dev->vqs[VHOST_NET_VQ_TX].cpu = txcpu;
+	dev->vqs[VHOST_NET_VQ_RX].cpu = rxcpu;
 
-	err = vhost_dev_alloc_iovecs(dev);
-	if (err)
-		goto err_cgroup;
+	/* No owner, become one */
+	dev->mm = get_task_mm(current);
 
 	return 0;
-err_cgroup:
-	kthread_stop(worker);
-	dev->worker = NULL;
-err_worker:
-	if (dev->mm)
-		mmput(dev->mm);
-	dev->mm = NULL;
-err_mm:
+
+out:
 	return err;
 }
 
@@ -474,11 +502,6 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	kfree(rcu_dereference_protected(dev->memory,
 					lockdep_is_held(&dev->mutex)));
 	RCU_INIT_POINTER(dev->memory, NULL);
-	WARN_ON(!list_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
@@ -1605,3 +1628,104 @@ void vhost_zerocopy_callback(void *arg)
 	vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
 	kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
 }
+
+/* to do
+static int __cpuinit vhost_pool_callback(struct notifier_block *nfb,
+					 unsigned long action,
+					 void *hcpu)
+{
+	struct vhost *vhost = &per_cpu(vhosts, (unsigned long)hcpu);
+
+	action &= ~CPU_TASKS_FROZEN;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (!create_vhost_task(vhosts, hcpu))
+			return notifier_from_errno(-ENOMEM);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		kthread_bind(vhost->worker, cpumask_any(cpu_online_mask));
+		destroy_vhost_task(vhost, hcpu);
+		break;
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		kthread_bind(vhost->worker, hcpu);
+		wake_up_process(vhost->worker);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		destroy_vhost_task(vhosts, hcpu);
+		take_over_work(vhosts, hcpu);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block vhost_pool_callback_nb __cpuinitdata = {
+	.notifier_call = vhost_pool_callback,
+	.priority = 0,
+};
+*/
+
+static void free_workers(void)
+{
+	unsigned long cpu;
+	struct vhost *vhost;
+
+	/* to do
+	 * unregister_cpu_notifier(&vhost_pool_callback_nb);
+	 */
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		vhost = &per_cpu(vhosts, cpu);
+		if (!IS_ERR(vhost->worker)) {
+			kthread_stop(vhost->worker);
+			BUG_ON(!list_empty(&vhost->work_list));
+		}
+	}
+	put_online_cpus();
+}
+
+int vhost_init(void)
+{
+	int ret = -ENOMEM;
+	unsigned long cpu;
+	struct vhost *vhost;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		vhost = &per_cpu(vhosts, cpu);
+
+		INIT_LIST_HEAD(&vhost->work_list);
+		spin_lock_init(&vhost->lock);
+		vhost->worker = kthread_create_on_node(vhost_worker, NULL,
+						cpu_to_node(cpu),
+						"vhost-%lu", cpu);
+		if (IS_ERR(vhost->worker))
+			goto err;
+
+		kthread_bind(vhost->worker, cpu);
+		wake_up_process(vhost->worker);
+	}
+	put_online_cpus();
+
+	/* to do
+	 * register_cpu_notifier(&vhost_pool_callback_nb);
+	 */
+	return 0;
+err:
+	free_workers();
+	return ret;
+}
+
+void vhost_cleanup(void)
+{
+	free_workers();
+}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a801e28..c6ecfb0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -18,6 +18,12 @@
 #define VHOST_DMA_DONE_LEN	1
 #define VHOST_DMA_CLEAR_LEN	0
 
+enum {
+	VHOST_NET_VQ_RX = 0,
+	VHOST_NET_VQ_TX = 1,
+	VHOST_NET_VQ_MAX = 2,
+};
+
 struct vhost_device;
 
 struct vhost_work;
@@ -30,6 +36,8 @@ struct vhost_work {
 	int			  flushing;
 	unsigned		  queue_seq;
 	unsigned		  done_seq;
+	struct vhost_virtqueue	  *vq;
+	spinlock_t                lock;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -44,7 +52,7 @@ struct vhost_poll {
 };
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-		     unsigned long mask, struct vhost_dev *dev);
+		     unsigned long mask, struct vhost_virtqueue *vq);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -141,6 +149,7 @@ struct vhost_virtqueue {
 	/* Reference counting for outstanding ubufs.
 	 * Protected by vq mutex. Writers must also take device mutex. */
 	struct vhost_ubuf_ref *ubufs;
+	unsigned long cpu;
 };
 
 struct vhost_dev {
@@ -155,9 +164,6 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
-	spinlock_t work_lock;
-	struct list_head work_list;
-	struct task_struct *worker;
 };
 
 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -190,6 +196,8 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);
 void vhost_zerocopy_callback(void *arg);
 int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
+int vhost_init(void);
+void vhost_cleanup(void);
 
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \


* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma
@ 2012-03-23  0:16 ` Shirley Ma
  2012-03-23 18:32   ` Thomas Lendacky
  0 siblings, 1 reply; 8+ messages in thread
From: Shirley Ma @ 2012-03-23  0:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, tahm, kvm

Resubmit it with the right format.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Tested-by: Tom Lendacky <toml@us.ibm.com>
---

 drivers/vhost/net.c                  |   26 ++-
 drivers/vhost/vhost.c                |  300 ++++++++++++++++++++++++----------
 drivers/vhost/vhost.h                |   16 ++-
 3 files changed, 243 insertions(+), 103 deletions(-)
 

* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23  0:16 ` Shirley Ma
@ 2012-03-23 18:32   ` Thomas Lendacky
  2012-03-23 19:00     ` Rick Jones
  2012-03-23 23:45     ` David Ahern
  0 siblings, 2 replies; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-23 18:32 UTC (permalink / raw)
  To: Shirley Ma; +Cc: Michael S. Tsirkin, netdev, kvm

I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
against the recent vhost patches. For simplicity, the patches
submitted by Anthony that increase the number of threads per vhost
instance I will call multi-worker and the patches submitted by Shirley
that provide a vhost thread per cpu I will call per-cpu.

Quick description of the tests:
  TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
  and 60 instances
  TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
  and 1 and 4 instances

  Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
  running between an external host and each VM.

  Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
  running between VM pairs on the same host (no TCP_MAERTS done in
  this situation).

For TCP_RR and UDP_RR tests I report the transaction rate as the
score and the transaction rate / KVMhost CPU% as the efficiency.

For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
as the score and the throughput / KVMhost CPU% as the efficiency.
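
For reference, the individual netperf invocations were along these
lines (a sketch only; the target address, run length and the way the
multiple instances were launched are placeholders/assumptions, not the
exact commands used):

  # TCP_RR / UDP_RR, 256 byte request/response (repeated for 10/30/60 instances)
  netperf -H <VM-IP> -t TCP_RR -l 60 -- -r 256,256
  netperf -H <VM-IP> -t UDP_RR -l 60 -- -r 256,256

  # TCP_STREAM with a 4K send size; TCP_MAERTS runs the data the other way
  netperf -H <VM-IP> -t TCP_STREAM -l 60 -- -m 4096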

The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
10GbE single port network adapter.

There's a lot of data and I hope this is the clearest way to report
it.  The remote host to VM results are first followed by the local
VM to VM results.


Remote Host to VM:
 Host to 1 VM
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1   9,587   984    9,725 1,145    9,252 1,041 
             10  63,919 3,095   51,841 2,415   55,226 2,884 
             30  85,646 3,288  127,277 3,242  145,644 4,092 
             60 117,448 3,929  148,330 3,616  137,996 3,898 

  UDP_RR      1  10,815 1,174   10,125 1,255    7,913 1,150 
             10  53,989 3,082   59,590 2,875   52,353 3,328 
             30  91,484 4,115   95,312 3,042  110,715 3,659 
             60 107,466 4,689  173,443 4,351  158,141 4,235 

  TCP_STREAM
         256  1   2,724   140    2,450   131    2,681   150 
              4   5,027   137    4,147   146    3,998   117 

        1024  1   5,602   235    4,623   169    5,425   238 
              4   5,987   212    5,991   133    6,827   175 

        4096  1   6,202   256    6,753   211    7,247   279 
              4   4,996   192    5,771   159    7,124   202 

       16384  1   6,258   259    7,211   214    8,453   308 
              4   4,591   179    5,788   181    6,925   217 

  TCP_MAERTS
         256  1   1,951    85    1,871    89    1,899    97 
              4   4,757   129    4,102   140    4,279   116 

        1024  1   7,479   381    6,970   371    7,374   427 
              4   8,931   385    6,612   258    8,731   417 

        4096  1   9,276   464    9,296   456    9,131   510 
              4   9,381   452    9,032   367    9,338   446 

       16384  1   9,153   496    8,817   589    9,238   516 
              4   9,358   478    9,006   367    9,350   462 

 Host to 1 VM (VM pinned to a socket)
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1   9,992 1,019    9,899   917    8,963   899 
             10  60,731 3,236   60,015 2,444   55,860 3,059 
             30 127,375 4,042  146,571 3,922  163,806 4,389 
             60 173,021 4,972  149,549 4,662  161,397 4,330 

  UDP_RR      1  10,854 1,253    7,983 1,120    7,647 1,206 
             10  68,128 3,804   64,335 4,067   53,343 3,233 
             30  92,456 3,994  112,101 4,219  111,610 3,598 
             60 135,741 4,590  184,441 4,422  184,527 4,546 

  TCP_STREAM
         256  1   2,564   146    2,530   147    2,497   150 
              4   4,757   139    4,300   127    4,245   124 

        1024  1   4,700   209    6,062   323    5,627   247 
              4   6,828   214    7,125   153    6,561   172 

        4096  1   6,676   281    7,672   286    7,760   290 
              4   6,258   236    6,410   171    7,354   225 

       16384  1   6,712   289    8,217   297    8,457   322 
              4   5,764   235    6,285   200    7,554   245 

  TCP_MAERTS
         256  1   1,673    82    1,444    71    1,756    88 
              4   6,385   175    5,671   155    5,685   153 

        1024  1   7,500   427    6,884   414    7,640   429 
              4   9,310   444    8,659   496    8,200   350 

        4096  1   8,427   477    9,201   515    8,825   422 
              4   9,372   478    9,184   394    9,391   446 

       16384  1   8,840   500    9,205   555    9,239   482 
              4   9,379   495    9,079   385    9,389   472 

 Host to 4 VMs
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1  38,635   949   34,063   843   35,432   897 
             10 193,703 2,604  157,699 1,841  180,323 2,858 
             30 279,736 3,301  170,343 1,739  269,827 2,875 
             60 308,838 3,555  170,486 1,738  285,073 2,988 

  UDP_RR      1  42,209 1,136   36,035   904   36,974   975 
             10 177,286 2,616  166,999 2,043  178,470 2,466 
             30 296,415 3,731  221,738 2,488  260,630 2,966 
             60 353,784 4,179  209,489 2,152  306,792 3,440 

  TCP_STREAM
         256  1   8,409   113    7,517   101    7,178   115 
              4   8,963    93    7,825    80    8,606    91 

        1024  1   9,382   119   10,223   192    9,314   128 
              4   9,233   101    9,085   110    8,585   105 

        4096  1   9,391   124    9,393   125    9,300   140 
              4   9,303   103    9,151   102    8,601   106 

       16384  1   9,395   121    8,715   128    9,378   135 
              4   9,322   105    9,135   101    8,691   121 

  TCP_MAERTS
         256  1   8,629   125    7,045   112    7,559   109 
              4   9,389   145    7,091    80    9,335   156 

        1024  1   9,385   201    9,349   148    9,320   248 
              4   9,392   154    9,340   148    9,390   226 

        4096  1   9,387   239    9,339   151    9,379   291 
              4   9,392   167    9,389   124    9,390   259 

       16384  1   9,374   236    9,366   150    9,391   317 
              4   9,365   167    9,394   123    9,390   284 

 Host to 12 VMs
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1  79,628   928   85,717   944   72,760   885
             10 106,348 1,067   94,032   944  164,548 2,017
             30 131,313 1,318  116,431 1,168  206,560 2,367
             60 156,868 1,574  152,205 1,527  223,701 2,250

  UDP_RR      1  90,762 1,059   93,904 1,037   75,512   919
             10 149,381 1,499  113,254 1,136  194,153 1,951
             30 177,803 1,783  132,818 1,333  235,682 2,370
             60 201,833 2,025  154,871 1,554  258,133 2,595

  TCP_STREAM
         256  1   8,549    86    7,173    72    8,407    85
              4   8,910    89    8,693    87    8,768    88

        1024  1   9,397    95    9,371    94    9,376    95
              4   9,289    93    9,268   100    8,898    92

        4096  1   9,399    95    9,415    95    9,401    97
              4   9,336    94    9,319    94    8,938    94

       16384  1   9,405    95    9,402    96    9,397   102
              4   9,366    94    9,345    94    8,890    94

  TCP_MAERTS
         256  1   4,646    49    2,273    23    9,232   135
              4   9,393   107    8,019    81    9,414   134

        1024  1   9,393   115    9,403   104    9,399   178
              4   9,406   110    9,383    98    9,392   157

        4096  1   9,393   114    9,409   104    9,388   202
              4   9,388   110    9,387    98    9,382   181

       16384  1   9,396   114    9,391   104    9,394   221
              4   9,411   110    9,384    98    9,391   192

 Host to 24 VMs
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1 110,139 1,118  101,765 1,033   79,189   805
             10  94,757   948   90,872   915  156,821 1,581
             30 119,904 1,199  120,728 1,207  214,151 2,211
             60 144,684 1,457  146,788 1,468  240,963 2,513

  UDP_RR      1 129,655 1,316  120,071 1,201   91,208   914
             10 119,204 1,201  104,645 1,046  208,432 2,340
             30 158,887 1,601  136,629 1,366  249,329 2,517
             60 179,365 1,794  159,883 1,610  259,018 2,651

  TCP_STREAM
         256  1   5,899    59    4,258    44    8,071    82
              4   8,739    89    8,195    83    7,934    82

        1024  1   8,477    86    7,498    76    9,268    93
              4   9,205    93    9,171    94    8,159    84

        4096  1   9,334    96    8,992    92    9,324    97
              4   9,255    95    9,221    92    8,237    85

       16384  1   9,373    96    9,356    95    9,311    96
              4   9,283    94    9,275    93    8,317    86

  TCP_MAERTS
         256  1     739     7      770     8    9,186   129
              4   7,804    79    7,573    76    9,253   122

        1024  1   1,763    18    1,759    18    9,287   146
              4   9,204    99    9,166    93    9,389   155

        4096  1   3,430    35    3,403    35    9,348   161
              4   9,372   100    9,315    95    9,385   151

       16384  1   9,309   102    9,306    97    9,353   175
              4   9,378   100    9,392    96    9,377   159



Local VM to VM:

 1 VM to 1 VM
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1   7,422   506    7,698   462    6,281   450 
             10  49,662 1,362   47,553 1,205   43,258 1,270 
             30  91,657 1,538   99,319 1,471   89,478 1,499 
             60 106,168 1,658  106,430 1,503   99,205 1,576 

  UDP_RR      1   8,414   552    8,532   528    6,976   499 
             10  58,359 1,645   55,283 1,398   48,094 1,457 
             30  91,046 1,736  109,403 1,721   92,109 1,715 
             60 128,835 2,021  130,382 1,807  118,563 1,853 

  TCP_STREAM
         256  1   2,029    60    1,923    54    1,998    64 
              4   3,861    66    3,445    53    2,914    54 

        1024  1   7,374   205    6,465   174    5,704   165 
              4   8,474   196    7,541   161    6,274   156 

        4096  1  12,825   295   11,921   275   10,262   262 
              4  12,639   253   13,395   260   11,451   264 

       16384  1  14,576   331   14,141   291   11,925   305 
              4  16,016   327   14,210   274   13,656   308 


 1 VM to 1 VM (each VM pinned to a socket)
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1   7,145   489    7,840   477    5,965   467 
             10  51,016 1,406   47,881 1,223   45,232 1,288 
             30  92,785 1,580  103,453 1,512   91,437 1,523 
             60 120,160 1,817  115,058 1,595  102,734 1,611 

  UDP_RR      1   7,908   547    8,704   541    6,552   528 
             10  59,807 1,653   56,598 1,435   50,524 1,488 
             30  90,302 1,738  113,861 1,765   94,640 1,720 
             60 141,684 2,196  141,866 1,919  125,334 1,917 

  TCP_STREAM
         256  1   2,210    64    1,291    32    2,069    64 
              4   3,993    64    3,441    52    2,780    50 

        1024  1   8,106   217    7,571   198    5,709   165 
              4   8,471   206    8,756   174    6,531   157 

        4096  1  15,360   350   13,825   303   10,717   271 
              4  14,671   330   12,604   263   11,266   258 

       16384  1  18,284   395   16,305   337   13,185   317 
              4  15,451   331   12,438   247   14,699   316 


 2 VMs to 2 VMs (4 VMs total)
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1  15,498   491   16,518   460   13,008   441 
             10  71,425   983   79,711 1,063   85,087 1,037 
             30 102,132 1,436   82,191 1,145  100,504 1,076 
             60 127,670 1,608   96,815 1,262  104,694 1,119 

  UDP_RR      1  17,091   548   18,214   538   14,780   492 
             10  77,682 1,129   87,523 1,235   86,755 1,165 
             30 131,830 1,826   92,844 1,327  111,839 1,232 
             60 145,688 1,952  111,315 1,520  116,358 1,296 

  TCP_STREAM
         256  1   5,085    72    3,900    50    2,430    38 
              4   6,622    70    4,337    48    5,032    58 

        1024  1  15,262   206   15,022   195    7,000   115 
              4  14,205   174   15,288   174   11,030   148 

        4096  1  15,020   197   21,694   261   13,583   198 
              4  16,818   205   16,076   195   17,175   238 

       16384  1  19,671   261   23,699   290   22,396   306 
              4  18,648   229   17,901   218   17,122   251 

 6 VMs to 6 VMs (12 VMs total)
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1  30,242   400   32,281   390   27,737   401 
             10  73,461   783   61,856   644   93,259 1,000 
             30  98,638 1,034   81,799   844  107,022 1,121 
             60 114,238 1,200   91,772   944  110,839 1,152 

  UDP_RR      1  33,017   438   35,540   429   30,022   438 
             10  84,676   910   67,838   711  112,339 1,220 
             30 110,799 1,156   90,555   932  128,928 1,357 
             60 129,679 1,354  100,715 1,033  136,503 1,429 

  TCP_STREAM
         256  1   6,947    72    5,380    56    6,138    72 
              4   8,400    85    7,660    77    8,893    89 

        1024  1  13,698   146   10,307   108   13,023   158 
              4  15,391   157   13,242   135   17,264   182 

        4096  1  18,928   202   14,580   154   16,970   189 
              4  18,826   191   17,262   175   19,558   212 

       16384  1  22,176   234   17,716   187   21,245   243 
              4  21,306   215   20,332   206   18,353   227 

 12 VMs to 12 VMs (24 VMs total)
                -   Base    -  -Multi-Worker- -  Per-CPU  -
  Test     Inst   Score   Eff    Score   Eff    Score   Eff
  TCP_RR      1  72,926   731   67,338   675   32,662   387 
             10  62,441   625   59,277   594   87,286   891 
             30  72,761   728   67,760   679  102,549 1,041 
             60  78,087   782   74,654   748  100,687 1,016 

  UDP_RR      1  82,662   829   80,875   810   34,915   421 
             10  71,424   716   67,754   679  111,753 1,147 
             30  79,495   796   75,512   756  134,576 1,372 
             60  83,339   835   77,523   778  137,058 1,390 

  TCP_STREAM
         256  1   2,870    29    2,631    26    7,907    80 
              4   8,424    84    8,026    80    8,929    90 

        1024  1   3,674    37    3,121    31   15,644   164 
              4  14,256   143   13,342   134   16,116   168 

        4096  1   5,068    51    4,366    44   16,179   168 
              4  17,015   171   16,321   164   17,940   186 

       16384  1   9,768    98    9,025    90   19,233   203 
              4  18,981   190   18,202   183   18,964   203 


On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote:
> Resubmit it with the right format.
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
> Tested-by: Tom Lendacky <toml@us.ibm.com>
> ---
> 
>  drivers/vhost/net.c                  |   26 ++-
>  drivers/vhost/vhost.c                |  300
> ++++++++++++++++++++++++---------- drivers/vhost/vhost.h                |  
> 16 ++-
>  3 files changed, 243 insertions(+), 103 deletions(-)
> 


* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23 18:32   ` Thomas Lendacky
@ 2012-03-23 19:00     ` Rick Jones
  2012-03-23 21:10       ` Thomas Lendacky
  2012-03-23 23:45     ` David Ahern
  1 sibling, 1 reply; 8+ messages in thread
From: Rick Jones @ 2012-03-23 19:00 UTC (permalink / raw)
  To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm

On 03/23/2012 11:32 AM, Thomas Lendacky wrote:
> I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
> against the recent vhost patches. For simplicity, the patches
> submitted by Anthony that increase the number of threads per vhost
> instance I will call multi-worker and the patches submitted by Shirley
> that provide a vhost thread per cpu I will call per-cpu.

Lots of nice data there - kudos.

> Quick description of the tests:
>    TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
>    and 60 instances

There is a point, not quite sure where, when aggregate, synchronous 
single-transaction netperf tests become as much a context switching test 
as a networking test.  That is why netperf RR has support for the "burst 
mode" to have more than one transaction in flight at one time:

http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002dburst

When looking to measure packet/transaction per second scaling I've taken 
to finding the peak for a single stream by running up the burst size 
(with TCP_NODELAY set) and then running 1, 2, 4, etc. of those streams, 
with the occasional ethtool -S audit to make sure that each TCP_RR 
transaction is indeed a discrete pair of TCP segments...
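
Concretely, that looks something like the following (a sketch; the
host and burst value are placeholders, and the test-specific -b option
needs a netperf configured with --enable-burst):

  # single-stream peak: TCP_NODELAY (-D) plus an increasing number (-b) of
  # transactions kept in flight on the one connection
  netperf -H <VM-IP> -t TCP_RR -l 60 -- -r 256,256 -D -b 16

  # occasional audit that each transaction is still a discrete segment pair
  ethtool -S <ethN>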

In addition to avoiding concerns about becoming a context switching 
exercise, the reduction in netperf instances means less chance for skew 
error on startup and shutdown.  To address that I've somewhat recently 
taken to using demo mode in netperf and then post-processing the results 
through rrdtool:

http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002ddemo

I have a "one to many" script for that under:

http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh

which is then post-processed via some stone knives and bearskins:
http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh
http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk
http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk

I've also used that basic idea in some many to many tests involving 512 
concurrent netperf instances but that script isn't up on netperf.org.
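
The demo-mode piece of that is just the global -D option (again a
sketch; the interval and target are placeholders, and it needs a
netperf built with --enable-demo):

  # emit interim results roughly once a second for later post-processing
  netperf -D 1.0 -H <VM-IP> -t TCP_RR -l 120 -- -r 256,256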

>    TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
>    and 1 and 4 instances

Netperf's own documentation and output are probably not good on this 
point (feel free to loose petards, though some instances may be cast in 
stone) but those aren't really message sizes.  They are simply the 
quantity of data netperf is presenting to the transport in any one send 
call.  They are send sizes.

>    Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
>    running between an external host and each VM.

I suppose it is implicit, and I'm just being pedantic/paranoid, but are 
you confident of the limits of the external host?

>    Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
>    running between VM pairs on the same host (no TCP_MAERTS done in
>    this situation).
>
> For TCP_RR and UDP_RR tests I report the transaction rate as the
> score and the transaction rate / KVMhost CPU% as the efficiency.
>
> For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> as the score and the throughput / KVMhost CPU% as the efficiency.
>
> The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
> system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
> 10GbE single port network adapter.
>
> There's a lot of data and I hope this is the clearest way to report
> it.  The remote host to VM results are first followed by the local
> VM to VM results.

Looks reasonable as far as presentation goes.  Might have included a 
summary table of the various peaks:

TCP_RR Remote Host to VM:
         Inst     -   Base    -  -Multi-Worker- -  Per-CPU  -
     VMs  /VM    Score   Eff    Score   Eff    Score   Eff
       1      60 117,448 3,929  148,330 3,616  137,996 3,898
       4      60 308,838 3,555  170,486 1,738  285,073 2,988
      12      60 156,868 1,574  152,205 1,527  223,701 2,250
      24      60 144,684 1,457  146,788 1,468  240,963 2,513

Given the KVM host machine is 8 cores with hyperthreading disabled, I 
might have included a data point at 8 VMs even if they were 2 vCPU VMs, 
but that is just my gut talking.  Certainly looking at the summary table 
I'm wondering where between 4 and 12 VMs the curve starts its downward 
trend.  Do 12 and 24 2-vCPU VMs force more moving around than, say, 16 
or 32 would?

happy benchmarking,

rick jones

>
>    UDP_RR      1 129,655 1,316  120,071 1,201   91,208   914
>               10 119,204 1,201  104,645 1,046  208,432 2,340
>               30 158,887 1,601  136,629 1,366  249,329 2,517
>               60 179,365 1,794  159,883 1,610  259,018 2,651
>
>    TCP_STREAM
>           256  1   5,899    59    4,258    44    8,071    82
>                4   8,739    89    8,195    83    7,934    82
>
>          1024  1   8,477    86    7,498    76    9,268    93
>                4   9,205    93    9,171    94    8,159    84
>
>          4096  1   9,334    96    8,992    92    9,324    97
>                4   9,255    95    9,221    92    8,237    85
>
>         16384  1   9,373    96    9,356    95    9,311    96
>                4   9,283    94    9,275    93    8,317    86
>
>    TCP_MAERTS
>           256  1     739     7      770     8    9,186   129
>                4   7,804    79    7,573    76    9,253   122
>
>          1024  1   1,763    18    1,759    18    9,287   146
>                4   9,204    99    9,166    93    9,389   155
>
>          4096  1   3,430    35    3,403    35    9,348   161
>                4   9,372   100    9,315    95    9,385   151
>
>         16384  1   9,309   102    9,306    97    9,353   175
>                4   9,378   100    9,392    96    9,377   159
>
>
>
> Local VM to VM:
>
>   1 VM to 1 VM
>                  -   Base    -  -Multi-Worker- -  Per-CPU  -
>    Test     Inst   Score   Eff    Score   Eff    Score   Eff
>    TCP_RR      1   7,422   506    7,698   462    6,281   450
>               10  49,662 1,362   47,553 1,205   43,258 1,270
>               30  91,657 1,538   99,319 1,471   89,478 1,499
>               60 106,168 1,658  106,430 1,503   99,205 1,576
>
>    UDP_RR      1   8,414   552    8,532   528    6,976   499
>               10  58,359 1,645   55,283 1,398   48,094 1,457
>               30  91,046 1,736  109,403 1,721   92,109 1,715
>               60 128,835 2,021  130,382 1,807  118,563 1,853
>
>    TCP_STREAM
>           256  1   2,029    60    1,923    54    1,998    64
>                4   3,861    66    3,445    53    2,914    54
>
>          1024  1   7,374   205    6,465   174    5,704   165
>                4   8,474   196    7,541   161    6,274   156
>
>          4096  1  12,825   295   11,921   275   10,262   262
>                4  12,639   253   13,395   260   11,451   264
>
>         16384  1  14,576   331   14,141   291   11,925   305
>                4  16,016   327   14,210   274   13,656   308
>
>
>   1 VM to 1 VM (each VM pinned to a socket)
>                  -   Base    -  -Multi-Worker- -  Per-CPU  -
>    Test     Inst   Score   Eff    Score   Eff    Score   Eff
>    TCP_RR      1   7,145   489    7,840   477    5,965   467
>               10  51,016 1,406   47,881 1,223   45,232 1,288
>               30  92,785 1,580  103,453 1,512   91,437 1,523
>               60 120,160 1,817  115,058 1,595  102,734 1,611
>
>    UDP_RR      1   7,908   547    8,704   541    6,552   528
>               10  59,807 1,653   56,598 1,435   50,524 1,488
>               30  90,302 1,738  113,861 1,765   94,640 1,720
>               60 141,684 2,196  141,866 1,919  125,334 1,917
>
>    TCP_STREAM
>           256  1   2,210    64    1,291    32    2,069    64
>                4   3,993    64    3,441    52    2,780    50
>
>          1024  1   8,106   217    7,571   198    5,709   165
>                4   8,471   206    8,756   174    6,531   157
>
>          4096  1  15,360   350   13,825   303   10,717   271
>                4  14,671   330   12,604   263   11,266   258
>
>         16384  1  18,284   395   16,305   337   13,185   317
>                4  15,451   331   12,438   247   14,699   316
>
>
>   2 VMs to 2 VMs (4 VMs total)
>                  -   Base    -  -Multi-Worker- -  Per-CPU  -
>    Test     Inst   Score   Eff    Score   Eff    Score   Eff
>    TCP_RR      1  15,498   491   16,518   460   13,008   441
>               10  71,425   983   79,711 1,063   85,087 1,037
>               30 102,132 1,436   82,191 1,145  100,504 1,076
>               60 127,670 1,608   96,815 1,262  104,694 1,119
>
>    UDP_RR      1  17,091   548   18,214   538   14,780   492
>               10  77,682 1,129   87,523 1,235   86,755 1,165
>               30 131,830 1,826   92,844 1,327  111,839 1,232
>               60 145,688 1,952  111,315 1,520  116,358 1,296
>
>    TCP_STREAM
>           256  1   5,085    72    3,900    50    2,430    38
>                4   6,622    70    4,337    48    5,032    58
>
>          1024  1  15,262   206   15,022   195    7,000   115
>                4  14,205   174   15,288   174   11,030   148
>
>          4096  1  15,020   197   21,694   261   13,583   198
>                4  16,818   205   16,076   195   17,175   238
>
>         16384  1  19,671   261   23,699   290   22,396   306
>                4  18,648   229   17,901   218   17,122   251
>
>   6 VMs to 6 VMs (12 VMs total)
>                  -   Base    -  -Multi-Worker- -  Per-CPU  -
>    Test     Inst   Score   Eff    Score   Eff    Score   Eff
>    TCP_RR      1  30,242   400   32,281   390   27,737   401
>               10  73,461   783   61,856   644   93,259 1,000
>               30  98,638 1,034   81,799   844  107,022 1,121
>               60 114,238 1,200   91,772   944  110,839 1,152
>
>    UDP_RR      1  33,017   438   35,540   429   30,022   438
>               10  84,676   910   67,838   711  112,339 1,220
>               30 110,799 1,156   90,555   932  128,928 1,357
>               60 129,679 1,354  100,715 1,033  136,503 1,429
>
>    TCP_STREAM
>           256  1   6,947    72    5,380    56    6,138    72
>                4   8,400    85    7,660    77    8,893    89
>
>          1024  1  13,698   146   10,307   108   13,023   158
>                4  15,391   157   13,242   135   17,264   182
>
>          4096  1  18,928   202   14,580   154   16,970   189
>                4  18,826   191   17,262   175   19,558   212
>
>         16384  1  22,176   234   17,716   187   21,245   243
>                4  21,306   215   20,332   206   18,353   227
>
>   12 VMs to 12 VMs (24 VMs total)
>                  -   Base    -  -Multi-Worker- -  Per-CPU  -
>    Test     Inst   Score   Eff    Score   Eff    Score   Eff
>    TCP_RR      1  72,926   731   67,338   675   32,662   387
>               10  62,441   625   59,277   594   87,286   891
>               30  72,761   728   67,760   679  102,549 1,041
>               60  78,087   782   74,654   748  100,687 1,016
>
>    UDP_RR      1  82,662   829   80,875   810   34,915   421
>               10  71,424   716   67,754   679  111,753 1,147
>               30  79,495   796   75,512   756  134,576 1,372
>               60  83,339   835   77,523   778  137,058 1,390
>
>    TCP_STREAM
>           256  1   2,870    29    2,631    26    7,907    80
>                4   8,424    84    8,026    80    8,929    90
>
>          1024  1   3,674    37    3,121    31   15,644   164
>                4  14,256   143   13,342   134   16,116   168
>
>          4096  1   5,068    51    4,366    44   16,179   168
>                4  17,015   171   16,321   164   17,940   186
>
>         16384  1   9,768    98    9,025    90   19,233   203
>                4  18,981   190   18,202   183   18,964   203
>
>
> On Thursday, March 22, 2012 05:16:30 PM Shirley Ma wrote:
>> Resubmit it with the right format.
>>
>> Signed-off-by: Shirley Ma<xma@us.ibm.com>
>> Signed-off-by: Krishna Kumar<krkumar2@in.ibm.com>
>> Tested-by: Tom Lendacky<toml@us.ibm.com>
>> ---
>>
>>   drivers/vhost/net.c                  |   26 ++-
>>   drivers/vhost/vhost.c                |  300 ++++++++++++++++++++++++----------
>>   drivers/vhost/vhost.h                |   16 ++-
>>   3 files changed, 243 insertions(+), 103 deletions(-)
>>
>


* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23 19:00     ` Rick Jones
@ 2012-03-23 21:10       ` Thomas Lendacky
  2012-03-23 21:21         ` Rick Jones
  0 siblings, 1 reply; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-23 21:10 UTC (permalink / raw)
  To: Rick Jones; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm

On Friday, March 23, 2012 12:00:54 PM Rick Jones wrote:
> On 03/23/2012 11:32 AM, Thomas Lendacky wrote:
> > I ran a series of TCP_RR, UDP_RR, TCP_STREAM and TCP_MAERTS tests
> > against the recent vhost patches. For simplicity, the patches
> > submitted by Anthony that increase the number of threads per vhost
> > instance I will call multi-worker and the patches submitted by Shirley
> > that provide a vhost thread per cpu I will call per-cpu.
> 
> Lots of nice data there - kudos.
> 
> > Quick description of the tests:
> >    TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> >    and 60 instances
> 
> There is a point, not quite sure where, when aggregate, synchronous
> single-transaction netperf tests become as much a context switching test
> as a networking test.  That is why netperf RR has support for the "burst
> mode" to have more than one transaction in flight at one time:
> 
> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002dburst
> 
> When looking to measure packet/transaction per second scaling I've taken
> to finding the peak for a single stream by running up the burst size,
> (TCP_NODELAY set) and then running 1, 2, 4 etc of those streams. With
> the occasional ethtool -S audit to make sure that each TCP_RR
> transaction is indeed a discrete pair of TCP segments...
> 
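For concreteness, a minimal sketch of that peak-finding pattern in
Python (assuming a netperf built with --enable-burst; the netserver
address, send sizes, and burst steps are placeholders, not values
taken from these runs):

  import subprocess

  HOST = "192.0.2.10"   # placeholder netserver address

  def tcp_rr(burst, streams=1):
      """Run 'streams' concurrent TCP_RR instances with TCP_NODELAY set
      (test-specific -D) and 'burst' transactions in flight (-b, which
      needs a netperf configured with --enable-burst)."""
      procs = [subprocess.Popen(
          ["netperf", "-H", HOST, "-t", "TCP_RR", "-l", "30",
           "--", "-r", "256,256", "-D", "-b", str(burst)])
          for _ in range(streams)]
      for p in procs:
          p.wait()

  # Walk up the burst size with a single stream to find its peak rate,
  # then scale out 1, 2, 4, ... streams at that burst size.  An
  # ethtool -S check before/after can confirm each transaction really
  # is a discrete request/response pair.
  for b in (1, 2, 4, 8, 16, 32, 64):
      tcp_rr(burst=b)
  for n in (1, 2, 4, 8):
      tcp_rr(burst=16, streams=n)   # 16 is illustrative; use the observed peak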
> In addition to avoiding concerns about becoming a context switching
> exercise, the reduction in netperf instances means less chance for skew
> error on startup and shutdown.  To address that I've somewhat recently
> taken to using demo mode in netperf and then post-processing the results
> through rrdtool:
> 
> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Using-_002d_002denable_002ddemo
> 
> I have a "one to many" script for that under:
> 
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/runemomniaggdemo.sh
> 
> which is then post-processed via some stone knives and bearskins:
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/post_proc.sh
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/vrules.awk
> http://www.netperf.org/svn/netperf2/trunk/doc/examples/mins_maxes.awk
> 
> I've also used that basic idea in some many to many tests involving 512
> concurrent netperf instances but that script isn't up on netperf.org.
> 
> >    TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> >    and 1 and 4 instances
> 
> Netperf's own documentation and output are probably not good on this
> point (feel free to loose petards, though some instances may be cast in
> stone) but those aren't really message sizes.  They are simply the
> quantity of data netperf is presenting to the transport in any one send
> call.  They are send sizes.
> 
> >    Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> >    running between an external host and each VM.
> 
> I suppose it is implicit, and I'm just being pedantic/paranoid but you
> are confident of the limits of the external host?

Yes I am.  It's pretty much an identical system to the KVM host and has
demonstrated much greater performance when running bare-metal scenarios.
Plenty of CPU left on all cores, etc.

> 
> >    Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> >    running between VM pairs on the same host (no TCP_MAERTS done in
> >    this situation).
> > 
> > For TCP_RR and UDP_RR tests I report the transaction rate as the
> > score and the transaction rate / KVMhost CPU% as the efficiency.
> > 
> > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> > as the score and the throughput / KVMhost CPU% as the efficiency.
> > 
> > The KVM host machine is a nehalem-based 2-socket, 4-cores/socket
> > system (E5530 @ 2.40GHz) with hyperthreading disabled and an Intel
> > 10GbE single port network adapter.
> > 
> > There's a lot of data and I hope this is the clearest way to report
> > it.  The remote host to VM results are first followed by the local
> > VM to VM results.
> 
> Looks reasonable as far as presentation goes.  Might have included a
> summary table of the various peaks:
> 
> TCP_RR Remote Host to VM:
>          Inst     -   Base    -  -Multi-Worker- -  Per-CPU  -
>      VMs  /VM    Score   Eff    Score   Eff    Score   Eff
>        1      60 117,448 3,929  148,330 3,616  137,996 3,898
>        4      60 308,838 3,555  170,486 1,738  285,073 2,988
>       12      60 156,868 1,574  152,205 1,527  223,701 2,250
>       24      60 144,684 1,457  146,788 1,468  240,963 2,513
> 

That's a good suggestion.

I also have geometric-mean comparisons to the baseline (greater than
100% indicates an improvement, less than 100% indicates a regression).

Remote:
                 -Multi-Worker- -  Per-CPU  -
     VMs           Score   Eff    Score   Eff
       1            105%   91%     109%  103%
       1 (pinned)   102%   94%     103%   95%
       4             84%   76%      95%  103%
      12             91%   88%     113%  129%
      24             95%   94%     135%  149%
 Overall             95%   88%     110%  114%

Local:
                 -Multi-Worker- -  Per-CPU  -
     VMs           Score   Eff    Score   Eff
       1             98%   90%      86%   93%
       1 (pinned)    94%   85%      82%   87%
       4             94%   91%      86%   86%
      12             85%   84%     103%  109%
      24             93%   93%     141%  148%
 Overall             93%   89%      97%  102%

Combined:            94%   88%     104%  108%
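
(For reference, the geometric mean of per-test ratios can be computed
as below; this is just a sketch of the arithmetic with made-up ratios,
not the scripts or results used here.)

  import math

  # One entry per test case: (variant score) / (baseline score).
  ratios = [1.05, 0.92, 1.10, 0.98]   # placeholder numbers

  # Geometric mean = exp(mean(log r)); expressed as a percentage,
  # >100% is an average improvement over baseline, <100% a regression.
  gmean = math.exp(sum(math.log(r) for r in ratios) / len(ratios))
  print(f"{gmean:.0%} of baseline")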

> Given the KVM host machine is 8 cores with hyperthreading disabled, I
> might have included a data point at 8 VMs even if they were 2 vCPU VMs,
> but that is just my gut talking.  Certainly looking at the summary table
> I'm wondering where between 4 and 12 VMs the curve starts its downward
> trend.  Do 12 and 24 2-vCPU VMs force more moving around than, say, 16
> or 32 would?

Yeah, it becomes a question of time.  I run each test 3 times and
average the results, so running the full suite takes a long time.

Thanks,
Tom

> 
> happy benchmarking,
> 
> rick jones
> 
Tom

Thomas Lendacky
Linux Technology Center - Performance



* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23 21:10       ` Thomas Lendacky
@ 2012-03-23 21:21         ` Rick Jones
  0 siblings, 0 replies; 8+ messages in thread
From: Rick Jones @ 2012-03-23 21:21 UTC (permalink / raw)
  To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm


>
> Yeah, it becomes a question of time.  I run each test 3 times and
> average the results, so to run the full suite takes a long time.

I've found that "walk up the instance count with the interim results
emitted" gives me a quicker overall run time than launching all the
netperfs at once with a long run time to kludge around skew.  Well,
modulo the time it takes to get them all launched.  But for the smallish
stuff it is rather faster than the 15 minutes per data point I'd get with
the (ab)use of the confidence intervals mechanism in runemomniagg2.sh.
It also avoids the "run one, wait for it to finish; run two, wait for
them to finish; run four, wait for them to finish" bit.  Walking up the
instance count while leaving the previous instances going does mean that
the "end of test" information is full of skew, but a great deal of that
end-of-test information is invariant anyway.
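
A rough sketch of that ramp-up pattern (assuming a netperf built with
--enable-demo so that the global -D option emits interim results; the
address, instance steps, and timings are placeholders):

  import subprocess, time

  HOST = "192.0.2.10"   # placeholder netserver address

  procs = []
  for target in (1, 2, 4, 8):        # instance count at each step
      while len(procs) < target:     # add instances; leave earlier ones running
          procs.append(subprocess.Popen(
              ["netperf", "-H", HOST, "-t", "TCP_RR", "-l", "600", "-D1",
               "--", "-r", "256,256"]))
      time.sleep(60)                 # read this step's interim results
  for p in procs:                    # the end-of-test output will carry
      p.terminate()                  # skew, but much of it is invariant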

happy benchmarking,

rick jones


* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23 18:32   ` Thomas Lendacky
  2012-03-23 19:00     ` Rick Jones
@ 2012-03-23 23:45     ` David Ahern
  2012-03-27 14:34       ` Thomas Lendacky
  1 sibling, 1 reply; 8+ messages in thread
From: David Ahern @ 2012-03-23 23:45 UTC (permalink / raw)
  To: Thomas Lendacky; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm

On 3/23/12 12:32 PM, Thomas Lendacky wrote:
> Quick description of the tests:
>    TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
>    and 60 instances
>    TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
>    and 1 and 4 instances
>
>    Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
>    running between an external host and each VM.
>
>    Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
>    running between VM pairs on the same host (no TCP_MAERTS done in
>    this situation).
>
> For TCP_RR and UDP_RR tests I report the transaction rate as the
> score and the transaction rate / KVMhost CPU% as the efficiency.
>
> For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> as the score and the throughput / KVMhost CPU% as the efficiency.

Would you mind sharing the netperf commands you are running and an
example of the math done to arrive at the summaries presented?

David


* Re: [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread
  2012-03-23 23:45     ` David Ahern
@ 2012-03-27 14:34       ` Thomas Lendacky
  0 siblings, 0 replies; 8+ messages in thread
From: Thomas Lendacky @ 2012-03-27 14:34 UTC (permalink / raw)
  To: David Ahern; +Cc: Shirley Ma, Michael S. Tsirkin, netdev, kvm

On Friday, March 23, 2012 05:45:40 PM David Ahern wrote:
> On 3/23/12 12:32 PM, Thomas Lendacky wrote:
> > Quick description of the tests:
> >    TCP_RR and UDP_RR using 256 byte request/response size in 1, 10, 30
> >    and 60 instances
> >    TCP_STREAM and TCP_MAERTS using 256, 1K, 4K and 16K message sizes
> >    and 1 and 4 instances
> >    
> >    Remote host to VM using 1, 4, 12 and 24 VMs (2 vCPUs) with the tests
> >    running between an external host and each VM.
> >    
> >    Local VM to VM using 2, 4, 12 and 24 VMs (2 vCPUs) with the tests
> >    running between VM pairs on the same host (no TCP_MAERTS done in
> >    this situation).
> > 
> > For TCP_RR and UDP_RR tests I report the transaction rate as the
> > score and the transaction rate / KVMhost CPU% as the efficiency.
> > 
> > For TCP_STREAM and TCP_MAERTS tests I report the throughput in Mbps
> > as the score and the throughput / KVMhost CPU% as the efficiency.
> 
> Would you mind sharing the netperf commands you are running and an
> example of the math done to arrive at the summaries presented?

I'm actually using uperf, not netperf.  Uperf allows me to launch
multiple instances of a test with one executable.  I've provided the
XML profiles for the tests below.

The math is simply taking the score (for TCP_RR it is the transaction
rate and for TCP_STREAM/TCP_MAERTS it is the throughput) and dividing
by the CPU utilization of the KVM host (obtained from running sar
during the test).
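
As a concrete illustration of that arithmetic (the numbers here are
round placeholders, not measured values):

  # uperf reports operations/second; a request/response transaction is
  # two operations, so halve it to get a netperf-style transaction rate.
  uperf_ops_per_sec = 20_000.0          # placeholder
  score = uperf_ops_per_sec / 2         # 10,000 transactions/second

  host_cpu_pct = 10.0                   # KVM host CPU% from sar, placeholder
  efficiency = score / host_cpu_pct     # gives 1,000, i.e. the "Eff" column
  print(f"score={score:,.0f} trans/s  efficiency={efficiency:,.0f}")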

Here are the uperf profiles that were used. The destination,
instances and message sizes are set using environment variables.

TCP_RR
  <?xml version="1.0"?>
  <!--
   Note: uperf reports operations/second. A transaction is made up of
         two operations, so to get transactions/second (like netperf)
         you must divide the operations/second by 2.
  -->
  <profile name="TCP_RR">
   <group nprocs="$uperf_instances">
    <transaction iterations="1">
     <flowop type="connect" options="remotehost=$uperf_dest
       protocol=tcp"/>
     </transaction>
     <transaction duration="$uperf_duration">
      <flowop type="write" options="size=$uperf_tx_msgsize"/>
      <flowop type="read"  options="size=$uperf_rx_msgsize"/>
     </transaction>
     <transaction iterations="1">
      <flowop type="disconnect" />
     </transaction>
   </group>
  </profile>

UDP_RR:
 <?xml version="1.0"?>
 <!--
  Note: uperf reports operations/second. A transaction is made up of
        two operations, so to get transactions/second (like netperf)
        you must divide the operations/second by 2.
 -->
 <profile name="UDP_RR">
  <group nprocs="$uperf_instances">
   <transaction iterations="1">
    <flowop type="connect" options="remotehost=$uperf_dest
      protocol=udp"/>
   </transaction>
   <transaction duration="$uperf_duration">
    <flowop type="write" options="size=$uperf_tx_msgsize"/>
    <flowop type="read"  options="size=$uperf_rx_msgsize"/>
   </transaction>
   <transaction iterations="1">
    <flowop type="disconnect" />
   </transaction>
  </group>
 </profile>

TCP_STREAM:
  <?xml version="1.0"?>
  <profile name="TCP_STREAM">
   <group nprocs="$uperf_instances">
    <transaction iterations="1">
     <flowop type="connect" options="remotehost=$uperf_dest
       protocol=tcp"/>
    </transaction>
    <transaction duration="$uperf_duration">
     <flowop type="write" options="count=16 size=$uperf_tx_msgsize"/>
    </transaction>
    <transaction iterations="1">
     <flowop type="disconnect" />
    </transaction>
   </group>
  </profile>

TCP_MAERTS:
  <?xml version="1.0"?>
  <profile name="TCP_MAERTS">
   <group nprocs="$uperf_instances">
    <transaction iterations="1">
     <flowop type="accept"  options="remotehost=$uperf_dest
       protocol=tcp"/>
    </transaction>
    <transaction duration="$uperf_duration">
     <flowop type="read"  options="count=16 size=$uperf_rx_msgsize"/>
    </transaction>
    <transaction iterations="1">
     <flowop type="disconnect" />
    </transaction>
  </group>
 </profile>
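
For completeness, a sketch of one way such a profile can be driven; the
file name, address, and values are placeholders.  The peer runs the
uperf slave ("uperf -s") while the master reads the profile and
substitutes the environment variables:

  import os, subprocess

  os.environ.update({                   # placeholder settings
      "uperf_dest":       "192.0.2.10",
      "uperf_instances":  "10",
      "uperf_duration":   "60",
      "uperf_tx_msgsize": "256",
      "uperf_rx_msgsize": "256",
  })
  subprocess.run(["uperf", "-m", "tcp_rr.xml"], check=True)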

Tom




Thread overview: 8+ messages
2012-03-22 23:48 [RFC PATCH 1/1] NUMA aware scheduling per cpu vhost thread Shirley Ma
2012-03-23  0:16 ` Shirley Ma
2012-03-23 18:32   ` Thomas Lendacky
2012-03-23 19:00     ` Rick Jones
2012-03-23 21:10       ` Thomas Lendacky
2012-03-23 21:21         ` Rick Jones
2012-03-23 23:45     ` David Ahern
2012-03-27 14:34       ` Thomas Lendacky
