All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
@ 2010-05-19  0:04 Sridhar Samudrala
  2010-05-27  9:14 ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-05-19  0:04 UTC (permalink / raw)
  To: Michael S. Tsirkin, netdev, lkml, kvm

Add a new kernel API to create a singlethread workqueue and attach its
task to the current task's cgroup and cpumask.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 9466e86..6d6f301 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -211,6 +211,7 @@ __create_workqueue_key(const char *name, int singlethread,
 #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
 
+extern struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name); 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
 extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5bfb213..6ba226e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -35,6 +35,7 @@
 #include <linux/lockdep.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
+#include <linux/cgroup.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -193,6 +194,45 @@ static const struct cpumask *cpu_singlethread_map __read_mostly;
  */
 static cpumask_var_t cpu_populated_map __read_mostly;
 
+static struct task_struct *get_singlethread_wq_task(struct workqueue_struct *wq)
+{
+	return (per_cpu_ptr(wq->cpu_wq, singlethread_cpu))->thread;
+}
+
+/* Create a singlethread workqueue and attach it's task to the current task's
+ * cgroup and set it's cpumask to the current task's cpumask.
+ */
+struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
+{
+	struct workqueue_struct *wq;
+	struct task_struct *task;
+	cpumask_var_t mask;
+
+	wq = create_singlethread_workqueue(name);
+	if (!wq)
+		goto out;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto err;
+			
+	if (sched_getaffinity(current->pid, mask))
+		goto err;
+
+	task = get_singlethread_wq_task(wq);
+	if (sched_setaffinity(task->pid, mask))
+		goto err;
+
+	if (cgroup_attach_task_current_cg(task))
+		goto err;
+out:	
+	return wq;
+err:
+	destroy_workqueue(wq);
+	wq = NULL;
+	goto out;
+}
+EXPORT_SYMBOL_GPL(create_singlethread_workqueue_in_current_cg);
+
 /* If it's single threaded, it isn't in the list of workqueues. */
 static inline int is_wq_single_threaded(struct workqueue_struct *wq)
 {
	


^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-19  0:04 [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
@ 2010-05-27  9:14 ` Michael S. Tsirkin
  2010-05-27 12:44   ` Oleg Nesterov
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-27  9:14 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Tejun Heo,
	Jiri Kosina, Thomas Gleixner, Oleg Nesterov, Ingo Molnar,
	Andi Kleen

On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> Add a new kernel API to create a singlethread workqueue and attach it's
> task to current task's cgroup and cpumask.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

Could someone familiar with workqueue code please comment on whether
this patch is suitable for 2.6.35?

It is needed to fix the case where vhost user might cause a kernel
thread to consume more CPU than allowed by the cgroup.
Should I merge it through the vhost tree?
Ack for this?

Thanks!

> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 9466e86..6d6f301 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -211,6 +211,7 @@ __create_workqueue_key(const char *name, int singlethread,
>  #define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0)
>  #define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0)
>  
> +extern struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name); 
>  extern void destroy_workqueue(struct workqueue_struct *wq);
>  
>  extern int queue_work(struct workqueue_struct *wq, struct work_struct *work);
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 5bfb213..6ba226e 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -35,6 +35,7 @@
>  #include <linux/lockdep.h>
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/workqueue.h>
> +#include <linux/cgroup.h>
>  
>  /*
>   * The per-CPU workqueue (if single thread, we always use the first
> @@ -193,6 +194,45 @@ static const struct cpumask *cpu_singlethread_map __read_mostly;
>   */
>  static cpumask_var_t cpu_populated_map __read_mostly;
>  
> +static struct task_struct *get_singlethread_wq_task(struct workqueue_struct *wq)
> +{
> +	return (per_cpu_ptr(wq->cpu_wq, singlethread_cpu))->thread;
> +}
> +
> +/* Create a singlethread workqueue and attach it's task to the current task's
> + * cgroup and set it's cpumask to the current task's cpumask.
> + */
> +struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> +{
> +	struct workqueue_struct *wq;
> +	struct task_struct *task;
> +	cpumask_var_t mask;
> +
> +	wq = create_singlethread_workqueue(name);
> +	if (!wq)
> +		goto out;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto err;
> +			
> +	if (sched_getaffinity(current->pid, mask))
> +		goto err;
> +
> +	task = get_singlethread_wq_task(wq);
> +	if (sched_setaffinity(task->pid, mask))
> +		goto err;
> +
> +	if (cgroup_attach_task_current_cg(task))
> +		goto err;
> +out:	
> +	return wq;
> +err:
> +	destroy_workqueue(wq);
> +	wq = NULL;
> +	goto out;
> +}
> +EXPORT_SYMBOL_GPL(create_singlethread_workqueue_in_current_cg);
> +
>  /* If it's single threaded, it isn't in the list of workqueues. */
>  static inline int is_wq_single_threaded(struct workqueue_struct *wq)
>  {
> 	

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27  9:14 ` Michael S. Tsirkin
@ 2010-05-27 12:44   ` Oleg Nesterov
  2010-05-27 13:12     ` Michael S. Tsirkin
  2010-05-27 16:24     ` [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
  0 siblings, 2 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-05-27 12:44 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Tejun Heo, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 05/27, Michael S. Tsirkin wrote:
>
> On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> > Add a new kernel API to create a singlethread workqueue and attach it's
> > task to current task's cgroup and cpumask.
> >
> > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
>
> Could someone familiar with workqueue code please comment on whether
> this patch is suitable for 2.6.35?
>
> It is needed to fix the case where vhost user might cause a kernel
> thread to consume more CPU than allowed by the cgroup.
> Should I merge it through the vhost tree?
> Ack for this?

I don't understand the reasons for this patch, but this doesn't matter.

I don't really see any need to change workqueue.c,

> > +static struct task_struct *get_singlethread_wq_task(struct workqueue_struct *wq)
> > +{
> > +	return (per_cpu_ptr(wq->cpu_wq, singlethread_cpu))->thread;
> > +}

(Not sure this trivial static helper with the single caller makes sense, but
 see below)

> > +/* Create a singlethread workqueue and attach it's task to the current task's
> > + * cgroup and set it's cpumask to the current task's cpumask.
> > + */
> > +struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> > +{
> > +	struct workqueue_struct *wq;
> > +	struct task_struct *task;
> > +	cpumask_var_t mask;
> > +
> > +	wq = create_singlethread_workqueue(name);
> > +	if (!wq)
> > +		goto out;
> > +
> > +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> > +		goto err;
> > +			
> > +	if (sched_getaffinity(current->pid, mask))
> > +		goto err;
> > +
> > +	task = get_singlethread_wq_task(wq);
> > +	if (sched_setaffinity(task->pid, mask))
> > +		goto err;
> > +
> > +	if (cgroup_attach_task_current_cg(task))
> > +		goto err;
> > +out:	
> > +	return wq;
> > +err:
> > +	destroy_workqueue(wq);
> > +	wq = NULL;
> > +	goto out;
> > +}

Instead, cgroup.c (or whoever needs this) can do

	struct move_struct {
		struct work_struct work;
		int ret;
	};

	static void move_func(struct work_struct *work)
	{
		struct move_struct *move = container_of(...);

		if (cgroup_attach_task_current_cg(current))
			ret = -EANY;
	}

	static struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
	{
		struct workqueue_struct *wq;
		struct move_struct move = {
			.work = __WORK_INITIALIZER(move_func);
		};

		wq = create_singlethread_workqueue(name);
		if (!wq)
			return NULL;

		queue_work(&move.work);
		flush_work(&move.work);

		if (move.ret) {
			destroy_workqueue(wq);
			wq = NULL;
		}

		return wq;
	}

Or. Just export wq_per_cpu() from workqueue.c (probably with a better name) and
use it like the patch does.

But, imho, create_singlethread_workqueue_in_current_cg() does not belong
to  workqueue.c.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 12:44   ` Oleg Nesterov
@ 2010-05-27 13:12     ` Michael S. Tsirkin
  2010-05-27 13:48       ` Oleg Nesterov
  2010-05-27 16:15       ` Tejun Heo
  2010-05-27 16:24     ` [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
  1 sibling, 2 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-27 13:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Tejun Heo, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, May 27, 2010 at 02:44:48PM +0200, Oleg Nesterov wrote:
> On 05/27, Michael S. Tsirkin wrote:
> >
> > On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> > > Add a new kernel API to create a singlethread workqueue and attach it's
> > > task to current task's cgroup and cpumask.
> > >
> > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> >
> > Could someone familiar with workqueue code please comment on whether
> > this patch is suitable for 2.6.35?
> >
> > It is needed to fix the case where vhost user might cause a kernel
> > thread to consume more CPU than allowed by the cgroup.
> > Should I merge it through the vhost tree?
> > Ack for this?
> 
> I don't understand the reasons for this patch, but this doesn't matter.

Depending on userspace application, driver can create a lot of work
for a workqueue to handle. By making the workqueue thread
belong in a cgroup, we make it possible to limit the CPU and other
resources thus consumed.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 13:12     ` Michael S. Tsirkin
@ 2010-05-27 13:48       ` Oleg Nesterov
  2010-05-27 16:15       ` Tejun Heo
  1 sibling, 0 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-05-27 13:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Tejun Heo, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 05/27, Michael S. Tsirkin wrote:
>
> On Thu, May 27, 2010 at 02:44:48PM +0200, Oleg Nesterov wrote:
> > On 05/27, Michael S. Tsirkin wrote:
> > >
> > > On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> > > > Add a new kernel API to create a singlethread workqueue and attach it's
> > > > task to current task's cgroup and cpumask.
> > > >
> > > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> > >
> > > Could someone familiar with workqueue code please comment on whether
> > > this patch is suitable for 2.6.35?
> > >
> > > It is needed to fix the case where vhost user might cause a kernel
> > > thread to consume more CPU than allowed by the cgroup.
> > > Should I merge it through the vhost tree?
> > > Ack for this?
> >
> > I don't understand the reasons for this patch, but this doesn't matter.
>
> Depending on userspace application, driver can create a lot of work
> for a workqueue to handle. By making the workqueue thread
> belong in a cgroup, we make it possible to the CPU and other
> resources thus consumed.

OK, I see, thanks for your explanation.


in case I wasn't clear... I didn't mean I dislike this idea, only
the implementation of create_singlethread_workqueue_in_current_cg(),
it doesn't belong to workqueue.c imho.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 13:12     ` Michael S. Tsirkin
  2010-05-27 13:48       ` Oleg Nesterov
@ 2010-05-27 16:15       ` Tejun Heo
  2010-05-27 16:39         ` Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-27 16:15 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/27/2010 03:12 PM, Michael S. Tsirkin wrote:
>> I don't understand the reasons for this patch, but this doesn't matter.
> 
> Depending on userspace application, driver can create a lot of work
> for a workqueue to handle. By making the workqueue thread
> belong in a cgroup, we make it possible to the CPU and other
> resources thus consumed.

Hmmm.... I don't really get it.  The unit of scheduling in workqueue
is a work.  Unless you're gonna convert every driver to use this
special kind of workqueue (and what happens when multiple tasks from
different cgroups share the driver?), I can't see how this is gonna be
useful.  If you really wanna impose cgroup control on workqueue items,
you'll have to do it per work item which might lead to the problem of
priority inversion.  Can you please describe what you're trying to do
in more detail?

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 12:44   ` Oleg Nesterov
  2010-05-27 13:12     ` Michael S. Tsirkin
@ 2010-05-27 16:24     ` Sridhar Samudrala
  2010-05-27 16:41       ` Michael S. Tsirkin
  2010-05-27 17:30       ` Oleg Nesterov
  1 sibling, 2 replies; 115+ messages in thread
From: Sridhar Samudrala @ 2010-05-27 16:24 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Tejun Heo, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, 2010-05-27 at 14:44 +0200, Oleg Nesterov wrote:
> On 05/27, Michael S. Tsirkin wrote:
> >
> > On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> > > Add a new kernel API to create a singlethread workqueue and attach it's
> > > task to current task's cgroup and cpumask.
> > >
> > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> >
> > Could someone familiar with workqueue code please comment on whether
> > this patch is suitable for 2.6.35?
> >
> > It is needed to fix the case where vhost user might cause a kernel
> > thread to consume more CPU than allowed by the cgroup.
> > Should I merge it through the vhost tree?
> > Ack for this?
> 
> I don't understand the reasons for this patch, but this doesn't matter.
> 
> I don't really see any need to change workqueue.c,
> 
> > > +static struct task_struct *get_singlethread_wq_task(struct workqueue_struct *wq)
> > > +{
> > > +	return (per_cpu_ptr(wq->cpu_wq, singlethread_cpu))->thread;
> > > +}
> 
> (Not sure this trivial static helper with the single caller makes sense, but
>  see below)
> 
> > > +/* Create a singlethread workqueue and attach it's task to the current task's
> > > + * cgroup and set it's cpumask to the current task's cpumask.
> > > + */
> > > +struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> > > +{
> > > +	struct workqueue_struct *wq;
> > > +	struct task_struct *task;
> > > +	cpumask_var_t mask;
> > > +
> > > +	wq = create_singlethread_workqueue(name);
> > > +	if (!wq)
> > > +		goto out;
> > > +
> > > +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> > > +		goto err;
> > > +			
> > > +	if (sched_getaffinity(current->pid, mask))
> > > +		goto err;
> > > +
> > > +	task = get_singlethread_wq_task(wq);
> > > +	if (sched_setaffinity(task->pid, mask))
> > > +		goto err;
> > > +
> > > +	if (cgroup_attach_task_current_cg(task))
> > > +		goto err;
> > > +out:	
> > > +	return wq;
> > > +err:
> > > +	destroy_workqueue(wq);
> > > +	wq = NULL;
> > > +	goto out;
> > > +}
> 
> Instead, cgroup.c (or whoever needs this) can do
> 
> 	struct move_struct {
> 		struct work_struct work;
> 		int ret;
> 	};
> 
> 	static void move_func(struct work_struct *work)
> 	{
> 		struct move_struct *move = container_of(...);
> 
> 		if (cgroup_attach_task_current_cg(current))

We are trying to attach the task associated with workqueue to the
current task's cgroup. So what we need is 
   cgroup_attach_task_current_cg(wq->task);

However there is no interface currently that exports the task associated
with a workqueue. It is hidden in cpu_workqueue_struct and is only 
accessible within workqueue.c.


> 			ret = -EANY;
> 	}
> 
> 	static struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> 	{
> 		struct workqueue_struct *wq;
> 		struct move_struct move = {
> 			.work = __WORK_INITIALIZER(move_func);
> 		};
> 
> 		wq = create_singlethread_workqueue(name);
> 		if (!wq)
> 			return NULL;
> 
> 		queue_work(&move.work);
> 		flush_work(&move.work);
> 
> 		if (move.ret) {
> 			destroy_workqueue(wq);
> 			wq = NULL;
> 		}
> 
> 		return wq;
> 	}
> 
> Or. Just export wq_per_cpu() from workqueue.c (probably with a better name) and
> use it like the patch does.
This requires that struct cpu_workqueue_struct and struct
workqueue_struct are made externally visible by moving them to
kernel/workqueue.h.

Instead what about adding the simple helper get_singlethread_wq_task()
in workqueue.c and exporting it.
I can add create_singlethread_workqueue_in_current_cg() to cgroup.c
using this helper routine.

Thanks
Sridhar



^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 16:15       ` Tejun Heo
@ 2010-05-27 16:39         ` Michael S. Tsirkin
  2010-05-27 16:56           ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-27 16:39 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, May 27, 2010 at 06:15:54PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 05/27/2010 03:12 PM, Michael S. Tsirkin wrote:
> >> I don't understand the reasons for this patch, but this doesn't matter.
> > 
> > Depending on userspace application, driver can create a lot of work
> > for a workqueue to handle. By making the workqueue thread
> > belong in a cgroup, we make it possible to the CPU and other
> > resources thus consumed.
> 
> Hmmm.... I don't really get it.  The unit of scheduling in workqueue
> is a work.

Yes. However, we use cgroups to limit when the workqueue itself is scheduled.
This affects all of work done on this workqueue, so it's a bit
of a blunt intrument. Thus we are not trying to apply this
to all drivers, we intend to start with vhost-net.

> Unless you're gonna convert every driver to use this
> special kind of workqueue (and what happens when multiple tasks from
> different cgroups share the driver?),

We'll then create a workqueue per task. Each workqueue will have the
right cgroup. But we are not trying to solve the problem for
every driver.

> I can't see how this is gonna be
> useful.  If you really wanna impose cgroup control on workqueue items,
> you'll have to do it per work item which might lead to the problem of
> priority inversion.

Exactly. cgroup is per-workqueue not per work item.
If driver wants to let administrators control priority
for different kinds of items separately, driver will have
to submit them to separate workqueues.

>  Can you please describe what you're trying to do
> in more detail?
> 
> Thank you.

vhost-net driver is under control from userspace,
it queues potentially a lot of work into the workqueue, which
might load the system beyond the cgroup limits.
And staying within cgroups limits is important for virtualization
where vhost is used.


> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 16:24     ` [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
@ 2010-05-27 16:41       ` Michael S. Tsirkin
  2010-05-27 17:30       ` Oleg Nesterov
  1 sibling, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-27 16:41 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Oleg Nesterov, netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev,
	Tejun Heo, Jiri Kosina, Thomas Gleixner, Ingo Molnar, Andi Kleen

On Thu, May 27, 2010 at 09:24:18AM -0700, Sridhar Samudrala wrote:
> On Thu, 2010-05-27 at 14:44 +0200, Oleg Nesterov wrote:
> > On 05/27, Michael S. Tsirkin wrote:
> > >
> > > On Tue, May 18, 2010 at 05:04:51PM -0700, Sridhar Samudrala wrote:
> > > > Add a new kernel API to create a singlethread workqueue and attach it's
> > > > task to current task's cgroup and cpumask.
> > > >
> > > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> > >
> > > Could someone familiar with workqueue code please comment on whether
> > > this patch is suitable for 2.6.35?
> > >
> > > It is needed to fix the case where vhost user might cause a kernel
> > > thread to consume more CPU than allowed by the cgroup.
> > > Should I merge it through the vhost tree?
> > > Ack for this?
> > 
> > I don't understand the reasons for this patch, but this doesn't matter.
> > 
> > I don't really see any need to change workqueue.c,
> > 
> > > > +static struct task_struct *get_singlethread_wq_task(struct workqueue_struct *wq)
> > > > +{
> > > > +	return (per_cpu_ptr(wq->cpu_wq, singlethread_cpu))->thread;
> > > > +}
> > 
> > (Not sure this trivial static helper with the single caller makes sense, but
> >  see below)
> > 
> > > > +/* Create a singlethread workqueue and attach it's task to the current task's
> > > > + * cgroup and set it's cpumask to the current task's cpumask.
> > > > + */
> > > > +struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> > > > +{
> > > > +	struct workqueue_struct *wq;
> > > > +	struct task_struct *task;
> > > > +	cpumask_var_t mask;
> > > > +
> > > > +	wq = create_singlethread_workqueue(name);
> > > > +	if (!wq)
> > > > +		goto out;
> > > > +
> > > > +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> > > > +		goto err;
> > > > +			
> > > > +	if (sched_getaffinity(current->pid, mask))
> > > > +		goto err;
> > > > +
> > > > +	task = get_singlethread_wq_task(wq);
> > > > +	if (sched_setaffinity(task->pid, mask))
> > > > +		goto err;
> > > > +
> > > > +	if (cgroup_attach_task_current_cg(task))
> > > > +		goto err;
> > > > +out:	
> > > > +	return wq;
> > > > +err:
> > > > +	destroy_workqueue(wq);
> > > > +	wq = NULL;
> > > > +	goto out;
> > > > +}
> > 
> > Instead, cgroup.c (or whoever needs this) can do
> > 
> > 	struct move_struct {
> > 		struct work_struct work;
> > 		int ret;
> > 	};
> > 
> > 	static void move_func(struct work_struct *work)
> > 	{
> > 		struct move_struct *move = container_of(...);
> > 
> > 		if (cgroup_attach_task_current_cg(current))
> 
> We are trying to attach the task associated with workqueue to the
> current task's cgroup. So what we need is 
>    cgroup_attach_task_current_cg(wq->task);
> 
> However there is no interface currently that exports the task associated
> with a workqueue. It is hidden in cpu_workqueue_struct and is only 
> accessible within workqueue.c.
> 
> 
> > 			ret = -EANY;
> > 	}
> > 
> > 	static struct workqueue_struct *create_singlethread_workqueue_in_current_cg(char *name)
> > 	{
> > 		struct workqueue_struct *wq;
> > 		struct move_struct move = {
> > 			.work = __WORK_INITIALIZER(move_func);
> > 		};
> > 
> > 		wq = create_singlethread_workqueue(name);
> > 		if (!wq)
> > 			return NULL;
> > 
> > 		queue_work(&move.work);
> > 		flush_work(&move.work);
> > 
> > 		if (move.ret) {
> > 			destroy_workqueue(wq);
> > 			wq = NULL;
> > 		}
> > 
> > 		return wq;
> > 	}
> > 
> > Or. Just export wq_per_cpu() from workqueue.c (probably with a better name) and
> > use it like the patch does.
> This requires that struct cpu_workqueue_struct and struct
> workqueue_struct are made externally visible by moving them to
> kernel/workqueue.h.
> 
> Instead what about adding the simple helper get_singlethread_wq_task()
> in workqueue.c and exporting it.
> I can add create_singlethread_workqueue_in_current_cg() to cgroup.c
> using this helper routine.

Or to our driver, if that's more palatable.

> Thanks
> Sridhar
> 

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 16:39         ` Michael S. Tsirkin
@ 2010-05-27 16:56           ` Tejun Heo
  2010-05-27 17:32             ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-27 16:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/27/2010 06:39 PM, Michael S. Tsirkin wrote:
>> Unless you're gonna convert every driver to use this
>> special kind of workqueue (and what happens when multiple tasks from
>> different cgroups share the driver?),
> 
> We'll then create a workqueue per task. Each workqueue will have the
> right cgroup. But we are not trying to selve the problem for
> every driver.

Ah... I see.  You're gonna use multiple workqueues.  One concern that
I have is that this is abuse of workqueue interface to certain level
and depends on the implementation detail of workqueue rather than its
intended usage model.  stop_machine() was a similar case and in the
end it was better served by a different mechanism built on kthread
directly (cpu_stop).  Wouldn't it be cleaner to use kthread directly
for your case too?  You're basically trying to use workqueue as a
frontend to kthread, so...

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 16:24     ` [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
  2010-05-27 16:41       ` Michael S. Tsirkin
@ 2010-05-27 17:30       ` Oleg Nesterov
  1 sibling, 0 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-05-27 17:30 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Tejun Heo, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

What I am actually worried about is Tejun's rework, I am not sure
cmwq has the "this thread services that wq" property...

On 05/27, Sridhar Samudrala wrote:
>
> On Thu, 2010-05-27 at 14:44 +0200, Oleg Nesterov wrote:
> >
> > Instead, cgroup.c (or whoever needs this) can do
> >
> > 	struct move_struct {
> > 		struct work_struct work;
> > 		int ret;
> > 	};
> >
> > 	static void move_func(struct work_struct *work)
> > 	{
> > 		struct move_struct *move = container_of(...);
> >
> > 		if (cgroup_attach_task_current_cg(current))
>
> We are trying to attach the task associated with workqueue to the
> current task's cgroup. So what we need is
>    cgroup_attach_task_current_cg(wq->task);

I do not see cgroup_attach_task_current_cg() in Linus's tree and thus I do
not know what exactly it does, and of course the code above is only a template.

But I think this is easy. Just add "struct cgroup *cgrp" into move_struct
and then move_func() can do cgroup_attach_task(move->cgrp, current) ?

> > Or. Just export wq_per_cpu() from workqueue.c (probably with a better name) and
> > use it like the patch does.
> This requires that struct cpu_workqueue_struct and struct
> workqueue_struct are made externally visible by moving them to
> kernel/workqueue.h.

no, no,

> Instead what about adding the simple helper get_singlethread_wq_task()
> in workqueue.c and exporting it.

Indeed, this is what I meant.

But. I disagree with get_singlethread_wq_task(). If we add this helper,
it should work with the multi-threaded wq's too, and needs the "int cpu"
parameter, ignored when is_wq_single_threaded().

So. Either rename wq_per_cpu() and export it (once again, I do not
mean we should move the body to workqueue.h!), or create the new
helper which just calls wq_per_cpu().

> I can add create_singlethread_workqueue_in_current_cg() to cgroup.c
> using this helper routine.

Imho, this is better.

But please note that it is possible to do without any changes in
workqueue.[ch] afaics, see above.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 16:56           ` Tejun Heo
@ 2010-05-27 17:32             ` Michael S. Tsirkin
  2010-05-27 21:20               ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-27 17:32 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, May 27, 2010 at 06:56:20PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 05/27/2010 06:39 PM, Michael S. Tsirkin wrote:
> >> Unless you're gonna convert every driver to use this
> >> special kind of workqueue (and what happens when multiple tasks from
> >> different cgroups share the driver?),
> > 
> > We'll then create a workqueue per task. Each workqueue will have the
> > right cgroup. But we are not trying to selve the problem for
> > every driver.
> 
> Ah... I see.  You're gonna use multiple workqueues.  Once concern that
> I have is that this is abuse of workqueue interface to certain level
> and depends on the implementation detail of workqueue rather than its
> intended usage model.

Well, this is why I proposed adding a new API for creating
workqueue within workqueue.c, rather than exposing the task
and attaching it to cgroups in our driver: so that workqueue
maintainers can fix the implementation if it ever changes.

And after all, it's an internal API, we can always change
it later if we need.

> stop_machine() was a similar case and in the
> end it was better served by a different mechanism built on kthread
> directly (cpu_stop).  Wouldn't it be cleaner to use kthread directly
> for your case too?  You're basically trying to use workqueue as a
> frontend to kthread, so...
> 
> Thanks.

Well, yes but we are using APIs like flush_work etc. These are very
handy.  It seems much easier than rolling our own queue on top of kthread.

Makes sense?

> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 17:32             ` Michael S. Tsirkin
@ 2010-05-27 21:20               ` Tejun Heo
  2010-05-28 15:08                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-27 21:20 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello, Michael.

On 05/27/2010 07:32 PM, Michael S. Tsirkin wrote:
> Well, this is why I proposed adding a new API for creating
> workqueue within workqueue.c, rather than exposing the task
> and attaching it to cgroups in our driver: so that workqueue
> maintainers can fix the implementation if it ever changes.
> 
> And after all, it's an internal API, we can always change
> it later if we need.
...
> Well, yes but we are using APIs like flush_work etc. These are very
> handy.  It seems much easier than rolling our own queue on top of kthread.

The thing is that this kind of one-off usage becomes problematic when
you're trying to change the implementation detail.  All current
workqueue users don't care which thread they run on and they shouldn't
as each work owns the context only for the duration the work is
executing.  If this sort of fundamental guidelines are followed, the
implementation can be improved in pretty much transparent way but when
you start depending on specific implementation details, things become
messy pretty quickly.

If this type of usage were more common, adding proper way to account
work usage according to cgroups would make sense but that's not the
case here; I removed the only exception case recently while trying
to implement cmwq, and if this is added, it would be the only
one which makes such extra assumptions in the whole kernel.  One way
or the other, workqueue needs to be improved and I don't really think
adding the single exception at this point is a good idea.

The thing I realized after stop_machine conversion was that there was
no reason to use workqueue there at all.  There already are more than
enough not-too-difficult synchronization constructs and if you're
using a thread for dedicated purposes, code complexity isn't that
different either way.  Plus, it would also be clearer that dedicated
threads are required there for what reason.  So, I strongly suggest
using a kthread.  If there are issues which are noticeably difficult
to solve with kthread, we can definitely talk about that and think
about things again.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-27 21:20               ` Tejun Heo
@ 2010-05-28 15:08                 ` Michael S. Tsirkin
  2010-05-28 15:54                   ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-28 15:08 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, May 27, 2010 at 11:20:22PM +0200, Tejun Heo wrote:
> Hello, Michael.
> 
> On 05/27/2010 07:32 PM, Michael S. Tsirkin wrote:
> > Well, this is why I proposed adding a new API for creating
> > workqueue within workqueue.c, rather than exposing the task
> > and attaching it to cgroups in our driver: so that workqueue
> > maintainers can fix the implementation if it ever changes.
> > 
> > And after all, it's an internal API, we can always change
> > it later if we need.
> ...
> > Well, yes but we are using APIs like flush_work etc. These are very
> > handy.  It seems much easier than rolling our own queue on top of kthread.
> 
> The thing is that this kind of one-off usage becomes problemetic when
> you're trying to change the implementation detail.  All current
> workqueue users don't care which thread they run on and they shouldn't
> as each work owns the context only for the duration the work is
> executing.  If this sort of fundamental guidelines are followed, the
> implementation can be improved in pretty much transparent way but when
> you start depending on specific implementation details, things become
> messy pretty quickly.
> 
> If this type of usage were more common, adding proper way to account
> work usage according to cgroups would make sense but that's not the
> case here and I removed the only exception case recently while trying
> to implement cmwq and if this is added.  So, this would be the only
> one which makes such extra assumptions in the whole kernel.  One way
> or the other, workqueue needs to be improved and I don't really think
> adding the single exception at this point is a good idea.
> 
> The thing I realized after stop_machine conversion was that there was
> no reason to use workqueue there at all.  There already are more than
> enough not-too-difficult synchronization constructs and if you're
> using a thread for dedicated purposes, code complexity isn't that
> different either way.  Plus, it would also be clearer that dedicated
> threads are required there for what reason.  So, I strongly suggest
> using a kthread.  If there are issues which are noticeably difficult
> to solve with kthread, we can definitely talk about that and think
> about things again.
> 
> Thank you.

Well, we have create_singlethread_workqueue, right?
This is not very different ... is it?

Just copying structures and code from workqueue.c,
adding vhost_ in front of it will definitely work:
there is nothing magic about the workqueue library.
But this just involves cut and paste which might be best avoided.
One final idea before we go the cut and paste way: how about
'create_workqueue_from_task' that would get a thread and have workqueue
run there?

> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-28 15:08                 ` Michael S. Tsirkin
@ 2010-05-28 15:54                   ` Tejun Heo
  2010-05-30 11:29                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-28 15:54 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/28/2010 05:08 PM, Michael S. Tsirkin wrote:
> Well, we have create_singlethread_workqueue, right?
> This is not very different ... is it?
> 
> Just copying structures and code from workqueue.c,
> adding vhost_ in front of it will definitely work:

Sure it will, but you'll probably be able to get away with much less.

> there is nothing magic about the workqueue library.
> But this just involves cut and paste which might be best avoided.

What I'm saying is that some magic needs to be added to workqueue and
if you add this single(!) exception, it will have to be backed out
pretty soon, so it would be better to do it properly now.

> One final idea before we go the cut and paste way: how about
> 'create_workqueue_from_task' that would get a thread and have workqueue
> run there?

You can currently depend on that implementation detail but it's not
what the workqueue interface is meant to do.  The single threadedness is
there as execution ordering and concurrency specification and it
doesn't (or rather won't) necessarily mean that a specific single
thread is bound to certain workqueue.

Can you please direct me to have a look at the code.  I'll be happy to
do the conversion for you.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup
  2010-05-28 15:54                   ` Tejun Heo
@ 2010-05-30 11:29                     ` Michael S. Tsirkin
  2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
                                         ` (2 more replies)
  0 siblings, 3 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-30 11:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Fri, May 28, 2010 at 05:54:42PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 05/28/2010 05:08 PM, Michael S. Tsirkin wrote:
> > Well, we have create_singlethread_workqueue, right?
> > This is not very different ... is it?
> > 
> > Just copying structures and code from workqueue.c,
> > adding vhost_ in front of it will definitely work:
> 
> Sure it will, but you'll probably be able to get away with much less.
> 
> > there is nothing magic about the workqueue library.
> > But this just involves cut and paste which might be best avoided.
> 
> What I'm saying is that some magic needs to be added to workqueue and
> if you add this single(!) exception, it will have to be backed out
> pretty soon, so it would be better to do it properly now.
> 
> > One final idea before we go the cut and paste way: how about
> > 'create_workqueue_from_task' that would get a thread and have workqueue
> > run there?
> 
> You can currently depend on that implementation detail but it's not
> the workqueue interface is meant to do.  The single threadedness is
> there as execution ordering and concurrency specification and it
> doesn't (or rather won't) necessarily mean that a specific single
> thread is bound to certain workqueue.
> 
> Can you please direct me to have a look at the code.  I'll be happy to
> do the conversion for you.

Great, thanks! The code in question is in drivers/vhost/vhost.c
It is used from drivers/vhost/net.c

On top of this, we have patchset from Sridhar Samudrala,
titled '0/3 Make vhost multi-threaded and associate each thread to its
guest's cgroup':

cgroups: Add an API to attach a task to current task's cgroup
workqueue: Add an API to create a singlethread workqueue attached to the
current task's cgroup
vhost: make it more scalable by creating a vhost thread per device

I have bounced the last three your way.


> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-30 11:29                     ` Michael S. Tsirkin
@ 2010-05-30 20:24                       ` Tejun Heo
  2010-05-31 14:39                         ` Oleg Nesterov
  2010-05-31 15:22                         ` Michael S. Tsirkin
  2010-05-30 20:24                       ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
  2010-05-30 20:25                       ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers Tejun Heo
  2 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-05-30 20:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Replace vhost_workqueue with per-vhost kthread.  Other than callback
argument change from struct work_struct * to struct vhost_poll *,
there's no visible change to vhost_poll_*() interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
---
Okay, here is three patch series to convert vhost to use per-vhost
kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
kthreads.  The conversion is mostly straight forward although flush is
slightly tricky.

The problem is that I have no idea how to test this.  It builds fine
and I read it several times but it's entirely possible / likely that I
missed something.  Please proceed with caution (so, no sign off yet).

Thanks.

 drivers/vhost/net.c   |   58 +++++++++++++----------------
 drivers/vhost/vhost.c |   99 ++++++++++++++++++++++++++++++++++++--------------
 drivers/vhost/vhost.h |   32 +++++++++-------
 3 files changed, 117 insertions(+), 72 deletions(-)

Index: work/drivers/vhost/net.c
===================================================================
--- work.orig/drivers/vhost/net.c
+++ work/drivers/vhost/net.c
@@ -294,54 +294,60 @@ static void handle_rx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }

-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_poll *poll)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq =
+		container_of(poll, struct vhost_virtqueue, poll);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }

-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_poll *poll)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq =
+		container_of(poll, struct vhost_virtqueue, poll);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }

-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_poll *poll)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net =
+		container_of(poll, struct vhost_net, poll[VHOST_NET_VQ_TX]);
+
 	handle_tx(net);
 }

-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_poll *poll)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net =
+		container_of(poll, struct vhost_net, poll[VHOST_NET_VQ_RX]);
+
 	handle_rx(net);
 }

 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}

-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

 	f->private_data = n;
@@ -644,25 +650,13 @@ static struct miscdevice vhost_net_misc

 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);

 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -17,12 +17,12 @@
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -37,8 +37,6 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };

-static struct workqueue_struct *vhost_workqueue;
-
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -52,23 +50,27 @@ static void vhost_poll_func(struct file
 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 			     void *key)
 {
-	struct vhost_poll *poll;
-	poll = container_of(wait, struct vhost_poll, wait);
+	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
 	if (!((unsigned long)key & poll->mask))
 		return 0;

-	queue_work(vhost_workqueue, &poll->work);
+	vhost_poll_queue(poll);
 	return 0;
 }

 /* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev)
 {
-	INIT_WORK(&poll->work, func);
+	poll->fn = fn;
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
+	INIT_LIST_HEAD(&poll->node);
+	init_waitqueue_head(&poll->done);
 	poll->mask = mask;
+	poll->dev = dev;
+	poll->queue_seq = poll->done_seq = 0;
 }

 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -88,16 +90,28 @@ void vhost_poll_stop(struct vhost_poll *
 	remove_wait_queue(poll->wqh, &poll->wait);
 }

-/* Flush any work that has been scheduled. When calling this, don't hold any
+/* Flush any poll that has been scheduled. When calling this, don't hold any
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	flush_work(&poll->work);
+	int seq = poll->queue_seq;
+
+	if (seq - poll->done_seq > 0)
+		wait_event(poll->done, seq - poll->done_seq <= 0);
+	smp_rmb();	/* paired with wmb in vhost_poller() */
 }

 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	queue_work(vhost_workqueue, &poll->work);
+	struct vhost_dev *dev = poll->dev;
+
+	spin_lock(&dev->poller_lock);
+	if (list_empty(&poll->node)) {
+		list_add_tail(&poll->node, &dev->poll_list);
+		poll->queue_seq++;
+		wake_up_process(dev->poller);
+	}
+	spin_unlock(&dev->poller_lock);
 }

 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
 	vq->log_ctx = NULL;
 }

+static int vhost_poller(void *data)
+{
+	struct vhost_dev *dev = data;
+	struct vhost_poll *poll;
+
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	poll = NULL;
+	spin_lock(&dev->poller_lock);
+	if (!list_empty(&dev->poll_list)) {
+		poll = list_first_entry(&dev->poll_list,
+					struct vhost_poll, node);
+		list_del_init(&poll->node);
+	}
+	spin_unlock(&dev->poller_lock);
+
+	if (poll) {
+		__set_current_state(TASK_RUNNING);
+		poll->fn(poll);
+		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
+		poll->done_seq = poll->queue_seq;
+		wake_up_all(&poll->done);
+	} else
+		schedule();
+
+	goto repeat;
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
+	struct task_struct *poller;
 	int i;
+
+	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
+	if (IS_ERR(poller))
+		return PTR_ERR(poller);
+
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
 	mutex_init(&dev->mutex);
@@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
+	spin_lock_init(&dev->poller_lock);
+	INIT_LIST_HEAD(&dev->poll_list);
+	dev->poller = poller;

 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].dev = dev;
@@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick,
-					POLLIN);
+					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
 	return 0;
 }
@@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+
+	kthread_stop(dev->poller);
 }

 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
-
-int vhost_init(void)
-{
-	vhost_workqueue = create_singlethread_workqueue("vhost");
-	if (!vhost_workqueue)
-		return -ENOMEM;
-	return 0;
-}
-
-void vhost_cleanup(void)
-{
-	destroy_workqueue(vhost_workqueue);
-}
Index: work/drivers/vhost/vhost.h
===================================================================
--- work.orig/drivers/vhost/vhost.h
+++ work/drivers/vhost/vhost.h
@@ -5,7 +5,6 @@
 #include <linux/vhost.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/skbuff.h>
@@ -20,19 +19,26 @@ enum {
 	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
 };

+struct vhost_poll;
+typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll);
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
+	vhost_poll_fn_t		  fn;
 	poll_table                table;
 	wait_queue_head_t        *wqh;
 	wait_queue_t              wait;
-	/* struct which will handle all actual work. */
-	struct work_struct        work;
+	struct list_head	  node;
+	wait_queue_head_t	  done;
 	unsigned long		  mask;
+	struct vhost_dev	 *dev;
+	int			  queue_seq;
+	int			  done_seq;
 };

-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +69,7 @@ struct vhost_virtqueue {
 	struct vhost_poll poll;

 	/* The routine to call when the Guest pings us, or timeout. */
-	work_func_t handle_kick;
+	vhost_poll_fn_t handle_kick;

 	/* Last available index we saw. */
 	u16 last_avail_idx;
@@ -86,11 +92,11 @@ struct vhost_virtqueue {
 	struct iovec hdr[VHOST_NET_MAX_SG];
 	size_t hdr_size;
 	/* We use a kind of RCU to access private pointer.
-	 * All readers access it from workqueue, which makes it possible to
-	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * All readers access it from poller, which makes it possible to
+	 * flush the vhost_poll instead of synchronize_rcu. Therefore readers do
 	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
-	 * work item execution acts instead of rcu_read_lock() and the end of
-	 * work item execution acts instead of rcu_read_lock().
+	 * vhost_poll execution acts instead of rcu_read_lock() and the end of
+	 * vhost_poll execution acts instead of rcu_read_lock().
 	 * Writers use virtqueue mutex. */
 	void *private_data;
 	/* Log write descriptors */
@@ -110,6 +116,9 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
+	spinlock_t poller_lock;
+	struct list_head poll_list;
+	struct task_struct *poller;
 };

 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);

-int vhost_init(void);
-void vhost_cleanup(void);
-
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
 		if ((vq)->error_ctx)                               \

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup
  2010-05-30 11:29                     ` Michael S. Tsirkin
  2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
@ 2010-05-30 20:24                       ` Tejun Heo
  2010-05-31  1:07                         ` Li Zefan
  2010-05-30 20:25                       ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers Tejun Heo
  2 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-30 20:24 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

From: Sridhar Samudrala <samudrala.sridhar@gmail.com>

Add a new kernel API to attach a task to current task's cgroup
in all the active hierarchies.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
---
 include/linux/cgroup.h |    1 +
 kernel/cgroup.c        |   23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

Index: work/include/linux/cgroup.h
===================================================================
--- work.orig/include/linux/cgroup.h
+++ work/include/linux/cgroup.h
@@ -570,6 +570,7 @@ struct task_struct *cgroup_iter_next(str
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
+int cgroup_attach_task_current_cg(struct task_struct *);

 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
Index: work/kernel/cgroup.c
===================================================================
--- work.orig/kernel/cgroup.c
+++ work/kernel/cgroup.c
@@ -1788,6 +1788,29 @@ out:
 	return retval;
 }

+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	struct cgroup *cur_cg;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		cur_cg = task_cgroup_from_root(current, root);
+		retval = cgroup_attach_task(cur_cg, tsk);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
 /*
  * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
  * held. May take task_lock of task

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-30 11:29                     ` Michael S. Tsirkin
  2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
  2010-05-30 20:24                       ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
@ 2010-05-30 20:25                       ` Tejun Heo
  2010-05-31  1:11                         ` Li Zefan
  2010-06-24  8:11                         ` [PATCH " Michael S. Tsirkin
  2 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-05-30 20:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Apply the cpumask and cgroup of the initializing task to the created
vhost poller.

Based on Sridhar Samudrala's patch.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
---
 drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -176,12 +177,30 @@ repeat:
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
-	struct task_struct *poller;
-	int i;
+	struct task_struct *poller = NULL;
+	cpumask_var_t mask;
+	int i, ret = -ENOMEM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto out;

 	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
-	if (IS_ERR(poller))
-		return PTR_ERR(poller);
+	if (IS_ERR(poller)) {
+		ret = PTR_ERR(poller);
+		goto out;
+	}
+
+	ret = sched_getaffinity(current->pid, mask);
+	if (ret)
+		goto out;
+
+	ret = sched_setaffinity(poller->pid, mask);
+	if (ret)
+		goto out;
+
+	ret = cgroup_attach_task_current_cg(poller);
+	if (ret)
+		goto out;

 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
-	return 0;
+
+	wake_up_process(poller);	/* avoid contributing to loadavg */
+	ret = 0;
+out:
+	if (ret)
+		kthread_stop(poller);
+	free_cpumask_var(mask);
+	return ret;
 }

 /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup
  2010-05-30 20:24                       ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
@ 2010-05-31  1:07                         ` Li Zefan
  2010-05-31  7:00                           ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Li Zefan @ 2010-05-31  1:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

04:24, Tejun Heo wrote:
> From: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> 
> Add a new kernel API to attach a task to current task's cgroup
> in all the active hierarchies.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

Acked-by: Li Zefan <lizf@cn.fujitsu.com>

btw: you lost the reviewed-by tag given by Paul Menage.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-30 20:25                       ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers Tejun Heo
@ 2010-05-31  1:11                         ` Li Zefan
  2010-05-31  6:58                           ` [PATCH UPDATED " Tejun Heo
  2010-06-24  8:11                         ` [PATCH " Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Li Zefan @ 2010-05-31  1:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

Tejun Heo wrote:
> Apply the cpumask and cgroup of the initializing task to the created
> vhost poller.
> 
> Based on Sridhar Samudrala's patch.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> ---
>  drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
>  1 file changed, 31 insertions(+), 5 deletions(-)
> 
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -23,6 +23,7 @@
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/cgroup.h>
> 
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -176,12 +177,30 @@ repeat:
>  long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
> -	struct task_struct *poller;
> -	int i;
> +	struct task_struct *poller = NULL;
> +	cpumask_var_t mask;
> +	int i, ret = -ENOMEM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto out;
> 

If we "goto out", we will end up calling kthread_stop(poller), but
seems kthread_stop() requires the task_struct pointer != NULL.

>  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(poller))
> -		return PTR_ERR(poller);
> +	if (IS_ERR(poller)) {
> +		ret = PTR_ERR(poller);
> +		goto out;
> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = sched_setaffinity(poller->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = cgroup_attach_task_current_cg(poller);
> +	if (ret)
> +		goto out;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
> -	return 0;
> +
> +	wake_up_process(poller);	/* avoid contributing to loadavg */
> +	ret = 0;
> +out:
> +	if (ret)
> +		kthread_stop(poller);
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH UPDATED 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-31  1:11                         ` Li Zefan
@ 2010-05-31  6:58                           ` Tejun Heo
  2010-05-31  7:48                             ` Li Zefan
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-31  6:58 UTC (permalink / raw)
  To: Li Zefan
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

Apply the cpumask and cgroup of the initializing task to the created
vhost poller.

Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
path, fixed.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
Updated accordingly.  Thanks.

 drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -176,12 +177,30 @@ repeat:
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
-	struct task_struct *poller;
-	int i;
+	struct task_struct *poller = NULL;
+	cpumask_var_t mask;
+	int i, ret = -ENOMEM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto out;

 	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
-	if (IS_ERR(poller))
-		return PTR_ERR(poller);
+	if (IS_ERR(poller)) {
+		ret = PTR_ERR(poller);
+		goto out;
+	}
+
+	ret = sched_getaffinity(current->pid, mask);
+	if (ret)
+		goto out;
+
+	ret = sched_setaffinity(poller->pid, mask);
+	if (ret)
+		goto out;
+
+	ret = cgroup_attach_task_current_cg(poller);
+	if (ret)
+		goto out;

 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
-	return 0;
+
+	wake_up_process(poller);	/* avoid contributing to loadavg */
+	ret = 0;
+out:
+	if (ret && poller)
+		kthread_stop(poller);
+	free_cpumask_var(mask);
+	return ret;
 }

 /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup
  2010-05-31  1:07                         ` Li Zefan
@ 2010-05-31  7:00                           ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-05-31  7:00 UTC (permalink / raw)
  To: Li Zefan
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

On 05/31/2010 03:07 AM, Li Zefan wrote:
> 04:24, Tejun Heo wrote:
>> From: Sridhar Samudrala <samudrala.sridhar@gmail.com>
>>
>> Add a new kernel API to attach a task to current task's cgroup
>> in all the active hierarchies.
>>
>> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> 
> Acked-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> btw: you lost the reviewed-by tag given by Paul Menage.

I only got bounced the original posting.  Michael, can you please add
it if/when you commit these?

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-31  6:58                           ` [PATCH UPDATED " Tejun Heo
@ 2010-05-31  7:48                             ` Li Zefan
  2010-05-31 10:20                               ` [PATCH UPDATED2 " Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Li Zefan @ 2010-05-31  7:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

>  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(poller))
> -		return PTR_ERR(poller);
> +	if (IS_ERR(poller)) {
> +		ret = PTR_ERR(poller);
> +		goto out;

here...

> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = sched_setaffinity(poller->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = cgroup_attach_task_current_cg(poller);
> +	if (ret)
> +		goto out;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
> -	return 0;
> +
> +	wake_up_process(poller);	/* avoid contributing to loadavg */
> +	ret = 0;
> +out:
> +	if (ret && poller)

It's still buggy..

> +		kthread_stop(poller);
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */
> 
> 

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED2 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-31  7:48                             ` Li Zefan
@ 2010-05-31 10:20                               ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-05-31 10:20 UTC (permalink / raw)
  To: Li Zefan
  Cc: Michael S. Tsirkin, Oleg Nesterov, Sridhar Samudrala, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen

Apply the cpumask and cgroup of the initializing task to the created
vhost poller.

Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
path (twice), fixed (twice).

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
Heh... that's embarrassing.  Let's see if I can get it right the third
time.

Thank you.

 drivers/vhost/vhost.c |   36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -177,11 +178,29 @@ long vhost_dev_init(struct vhost_dev *de
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
 	struct task_struct *poller;
-	int i;
+	cpumask_var_t mask;
+	int i, ret = -ENOMEM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto out_free_mask;

 	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
-	if (IS_ERR(poller))
-		return PTR_ERR(poller);
+	if (IS_ERR(poller)) {
+		ret = PTR_ERR(poller);
+		goto out_free_mask;
+	}
+
+	ret = sched_getaffinity(current->pid, mask);
+	if (ret)
+		goto out_stop_poller;
+
+	ret = sched_setaffinity(poller->pid, mask);
+	if (ret)
+		goto out_stop_poller;
+
+	ret = cgroup_attach_task_current_cg(poller);
+	if (ret)
+		goto out_stop_poller;

 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -202,7 +221,16 @@ long vhost_dev_init(struct vhost_dev *de
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
-	return 0;
+
+	wake_up_process(poller);	/* avoid contributing to loadavg */
+	ret = 0;
+	goto out_free_mask;
+
+out_stop_poller:
+	kthread_stop(poller);
+out_free_mask:
+	free_cpumask_var(mask);
+	return ret;
 }

 /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
@ 2010-05-31 14:39                         ` Oleg Nesterov
  2010-05-31 15:07                           ` Tejun Heo
  2010-05-31 15:22                         ` Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Oleg Nesterov @ 2010-05-31 14:39 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 05/30, Tejun Heo wrote:
>
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.

Personally, I agree. I think this is better than playing with the workqueue thread.

A couple of simple questions after the quick glance at the unapplied patch...

>  void vhost_poll_flush(struct vhost_poll *poll)
>  {
> -	flush_work(&poll->work);
> +	int seq = poll->queue_seq;
> +
> +	if (seq - poll->done_seq > 0)
> +		wait_event(poll->done, seq - poll->done_seq <= 0);

The check before wait_event() is not needed, please note that wait_event()
checks the condition before __wait_event().

What I can't understand is why we do have ->queue_seq and ->done_seq.

Isn't the single "bool poll->active" enough? vhost_poll_queue() sets
->active == T, vhost_poller() clears it before wake_up_all(poll->done).

> +static int vhost_poller(void *data)
> +{
> +	struct vhost_dev *dev = data;
> +	struct vhost_poll *poll;
> +
> +repeat:
> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */

I don't understand the comment... why do we need this barrier?

> +	if (kthread_should_stop()) {
> +		__set_current_state(TASK_RUNNING);
> +		return 0;
> +	}
> +
> +	poll = NULL;
> +	spin_lock(&dev->poller_lock);
> +	if (!list_empty(&dev->poll_list)) {
> +		poll = list_first_entry(&dev->poll_list,
> +					struct vhost_poll, node);
> +		list_del_init(&poll->node);
> +	}
> +	spin_unlock(&dev->poller_lock);
> +
> +	if (poll) {
> +		__set_current_state(TASK_RUNNING);
> +		poll->fn(poll);
> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> +		poll->done_seq = poll->queue_seq;
> +		wake_up_all(&poll->done);
> +	} else
> +		schedule();
> +
> +	goto repeat;
> +}

Given that vhost_poll_queue() does list_add() and wake_up_process() under
->poller_lock, I don't think we need any barriers to change ->state.

IOW, can't vhost_poller() simply do

	while(!kthread_should_stop()) {

		poll = NULL;
		spin_lock(&dev->poller_lock);
		if (!list_empty(&dev->poll_list)) {
			...
		} else
			 __set_current_state(TASK_INTERRUPTIBLE);
		spin_unlock(&dev->poller_lock);

		if (poll) {
			...
		} else
			schedule();
	}

?

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 14:39                         ` Oleg Nesterov
@ 2010-05-31 15:07                           ` Tejun Heo
  2010-05-31 15:31                             ` Oleg Nesterov
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-31 15:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/31/2010 04:39 PM, Oleg Nesterov wrote:
> On 05/30, Tejun Heo wrote:
>>
>> This conversion is to make each vhost use a dedicated kthread so that
>> resource control via cgroup can be applied.
> 
> Personally, I agree. I think This is better than play with workqueue thread.

Yeap, I think so too.  In vhost's case tho, as it exports a lot of
workqueue characteristics to vhost users, it's a bit more complex than
I wish it were.  It can probably be simplified more if someone who
knows the code better takes a look or maybe we need to make this kind
of things easier by providing a generic helpers if more cases like
this spring up, but if that happens probably the RTTD would be somehow
teaching workqueue how to deal with cgroups.  As this is the first
case, I guess open coding is okay for now.

> A couple of simple questions after the quick glance at the unapplied patch...
> 
>>  void vhost_poll_flush(struct vhost_poll *poll)
>>  {
>> -	flush_work(&poll->work);
>> +	int seq = poll->queue_seq;
>> +
>> +	if (seq - poll->done_seq > 0)
>> +		wait_event(poll->done, seq - poll->done_seq <= 0);
> 
> The check before wait_event() is not needed, please note that wait_event()
> checks the condition before __wait_event().

Heh... right, I was looking at __wait_event() and thinking "ooh... we
can skip lock in the fast path".  :-)

> What I can't understand is why we do have ->queue_seq and ->done_seq.
> 
> Isn't the single "bool poll->active" enough? vhost_poll_queue() sets
> ->active == T, vhost_poller() clears it before wake_up_all(poll->done).

I might have slightly over engineered this part not knowing the
expected workload.  ->queue_seq/->done_seq pair is to guarantee that
flushers never get starved.  Without sequencing queueings and
executions, flushers should wait for !pending && !active which can
take some time to come if the poll in question is very busy.

>> +static int vhost_poller(void *data)
>> +{
>> +	struct vhost_dev *dev = data;
>> +	struct vhost_poll *poll;
>> +
>> +repeat:
>> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> 
> I don't understand the comment... why do we need this barrier?

So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
visible to kthread_should_stop() or task state is set to RUNNING.

>> +	if (kthread_should_stop()) {
>> +		__set_current_state(TASK_RUNNING);
>> +		return 0;
>> +	}
>> +
>> +	poll = NULL;
>> +	spin_lock(&dev->poller_lock);
>> +	if (!list_empty(&dev->poll_list)) {
>> +		poll = list_first_entry(&dev->poll_list,
>> +					struct vhost_poll, node);
>> +		list_del_init(&poll->node);
>> +	}
>> +	spin_unlock(&dev->poller_lock);
>> +
>> +	if (poll) {
>> +		__set_current_state(TASK_RUNNING);
>> +		poll->fn(poll);
>> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
>> +		poll->done_seq = poll->queue_seq;
>> +		wake_up_all(&poll->done);
>> +	} else
>> +		schedule();
>> +
>> +	goto repeat;
>> +}
> 
> Given that vhost_poll_queue() does list_add() and wake_up_process() under
> ->poller_lock, I don't think we need any barriers to change ->state.
> 
> IOW, can't vhost_poller() simply do
> 
> 	while(!kthread_should_stop()) {
> 
> 		poll = NULL;
> 		spin_lock(&dev->poller_lock);
> 		if (!list_empty(&dev->poll_list)) {
> 			...
> 		} else
> 			 __set_current_state(TASK_INTERRUPTIBLE);
> 		spin_unlock(&dev->poller_lock);
> 
> 		if (poll) {
> 			...
> 		} else
> 			schedule();
> 	}
> ?

But then kthread_stop() can happen between kthread_should_stop() and
__set_current_state(TASK_INTERRUPTIBLE) and poller can just sleep in
schedule() not knowing that.

Thank you.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
  2010-05-31 14:39                         ` Oleg Nesterov
@ 2010-05-31 15:22                         ` Michael S. Tsirkin
  2010-05-31 15:45                           ` Tejun Heo
  2010-06-01 14:13                           ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Paul E. McKenney
  1 sibling, 2 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-31 15:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_poll *,
> there's no visible change to vhost_poll_*() interface.

I would prefer a substructure vhost_work, even just to make
the code easier to review and compare to workqueue.c.

> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
> 
> Partially based on Sridhar Samudrala's patch.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> ---
> Okay, here is three patch series to convert vhost to use per-vhost
> kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
> kthreads.  The conversion is mostly straight forward although flush is
> slightly tricky.
> 
> The problem is that I have no idea how to test this.

It's a 3 step process:

1. 
Install qemu-kvm under fc13, or build recent one from source,
get it from here:
git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git

2. install guest under it:
qemu-img create -f qcow2 disk.qcow2 100G
Now get some image (e.g. fedora 13 in fc13.iso)
and install guest:
qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2


3. set up networking. I usually simply do host to guest 
on a special subnet for testing purposes:

Set up a bridge named mstbr0:

ifconfig mstbr0 down
brctl delbr mstbr0
brctl addbr mstbr0
brctl setfd mstbr0 0
ifconfig mstbr0 11.0.0.1

cat > ifup << EOF
#!/bin/sh -x
/sbin/ifconfig msttap0 0.0.0.0 up
brctl addif mstbr0 msttap0
EOF


qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
 -net nic,model=virtio,netdev=foo -netdev
tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on

after you set up the guest, log into it and
ifconfig eth0 11.0.0.2

You should now be able to ping guest to host and back.
Use something like netperf to stress the connection.
Close qemu with kill -9 and unload module to test flushing code.



> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c

...

> @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
>  	vq->log_ctx = NULL;
>  }
> 
> +static int vhost_poller(void *data)
> +{
> +	struct vhost_dev *dev = data;
> +	struct vhost_poll *poll;
> +
> +repeat:
> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> +
> +	if (kthread_should_stop()) {
> +		__set_current_state(TASK_RUNNING);
> +		return 0;
> +	}
> +
> +	poll = NULL;
> +	spin_lock(&dev->poller_lock);
> +	if (!list_empty(&dev->poll_list)) {
> +		poll = list_first_entry(&dev->poll_list,
> +					struct vhost_poll, node);
> +		list_del_init(&poll->node);
> +	}
> +	spin_unlock(&dev->poller_lock);
> +
> +	if (poll) {
> +		__set_current_state(TASK_RUNNING);
> +		poll->fn(poll);
> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> +		poll->done_seq = poll->queue_seq;
> +		wake_up_all(&poll->done);


This seems to add wakeups on data path, which uses spinlocks etc.
OTOH workqueue.c adds a special barrier
entry which only does a wakeup when needed.
Right?

> +	} else
> +		schedule();
> +
> +	goto repeat;
> +}
> +
>  long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
> +	struct task_struct *poller;
>  	int i;
> +
> +	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> +	if (IS_ERR(poller))
> +		return PTR_ERR(poller);
> +
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
>  	mutex_init(&dev->mutex);
> @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
>  	dev->log_file = NULL;
>  	dev->memory = NULL;
>  	dev->mm = NULL;
> +	spin_lock_init(&dev->poller_lock);
> +	INIT_LIST_HEAD(&dev->poll_list);
> +	dev->poller = poller;
> 
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		dev->vqs[i].dev = dev;
> @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
>  		vhost_vq_reset(dev, dev->vqs + i);
>  		if (dev->vqs[i].handle_kick)
>  			vhost_poll_init(&dev->vqs[i].poll,
> -					dev->vqs[i].handle_kick,
> -					POLLIN);
> +					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
>  	return 0;
>  }
> @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
>  	if (dev->mm)
>  		mmput(dev->mm);
>  	dev->mm = NULL;
> +
> +	kthread_stop(dev->poller);
>  }
> 
>  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> -
> -int vhost_init(void)
> -{
> -	vhost_workqueue = create_singlethread_workqueue("vhost");
> -	if (!vhost_workqueue)
> -		return -ENOMEM;
> -	return 0;
> -}
> -
> -void vhost_cleanup(void)
> -{
> -	destroy_workqueue(vhost_workqueue);

I note that destroy_workqueue does a flush, kthread_stop
doesn't. Right? Sure we don't need to check nothing is in one of
the lists? Maybe add a BUG_ON?

> -}
> Index: work/drivers/vhost/vhost.h
> ===================================================================
> --- work.orig/drivers/vhost/vhost.h
> +++ work/drivers/vhost/vhost.h
> @@ -5,7 +5,6 @@
>  #include <linux/vhost.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> -#include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <linux/file.h>
>  #include <linux/skbuff.h>
> @@ -20,19 +19,26 @@ enum {
>  	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
>  };
> 
> +struct vhost_poll;
> +typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll);
> +
>  /* Poll a file (eventfd or socket) */
>  /* Note: there's nothing vhost specific about this structure. */
>  struct vhost_poll {
> +	vhost_poll_fn_t		  fn;
>  	poll_table                table;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
> -	/* struct which will handle all actual work. */
> -	struct work_struct        work;
> +	struct list_head	  node;
> +	wait_queue_head_t	  done;
>  	unsigned long		  mask;
> +	struct vhost_dev	 *dev;
> +	int			  queue_seq;
> +	int			  done_seq;
>  };
> 
> -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask);
> +void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
> +		     unsigned long mask, struct vhost_dev *dev);
>  void vhost_poll_start(struct vhost_poll *poll, struct file *file);
>  void vhost_poll_stop(struct vhost_poll *poll);
>  void vhost_poll_flush(struct vhost_poll *poll);
> @@ -63,7 +69,7 @@ struct vhost_virtqueue {
>  	struct vhost_poll poll;
> 
>  	/* The routine to call when the Guest pings us, or timeout. */
> -	work_func_t handle_kick;
> +	vhost_poll_fn_t handle_kick;
> 
>  	/* Last available index we saw. */
>  	u16 last_avail_idx;
> @@ -86,11 +92,11 @@ struct vhost_virtqueue {
>  	struct iovec hdr[VHOST_NET_MAX_SG];
>  	size_t hdr_size;
>  	/* We use a kind of RCU to access private pointer.
> -	 * All readers access it from workqueue, which makes it possible to
> -	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
> +	 * All readers access it from poller, which makes it possible to
> +	 * flush the vhost_poll instead of synchronize_rcu. Therefore readers do
>  	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
> -	 * work item execution acts instead of rcu_read_lock() and the end of
> -	 * work item execution acts instead of rcu_read_lock().
> +	 * vhost_poll execution acts instead of rcu_read_lock() and the end of
> +	 * vhost_poll execution acts instead of rcu_read_unlock().
>  	 * Writers use virtqueue mutex. */
>  	void *private_data;
>  	/* Log write descriptors */
> @@ -110,6 +116,9 @@ struct vhost_dev {
>  	int nvqs;
>  	struct file *log_file;
>  	struct eventfd_ctx *log_ctx;
> +	spinlock_t poller_lock;
> +	struct list_head poll_list;
> +	struct task_struct *poller;
>  };
> 
>  long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> @@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> 
> -int vhost_init(void);
> -void vhost_cleanup(void);
> -
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>  		if ((vq)->error_ctx)                               \

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 15:07                           ` Tejun Heo
@ 2010-05-31 15:31                             ` Oleg Nesterov
  2010-05-31 15:38                               ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Oleg Nesterov @ 2010-05-31 15:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 05/31, Tejun Heo wrote:
>
> On 05/31/2010 04:39 PM, Oleg Nesterov wrote:
>
> > What I can't understand is why we do have ->queue_seq and ->done_seq.
> >
> > Isn't the single "bool poll->active" enough? vhost_poll_queue() sets
> > ->active == T, vhost_poller() clears it before wake_up_all(poll->done).
>
> I might have slightly over engineered this part not knowing the
> expected workload.  ->queue_seq/->done_seq pair is to guarantee that
> flushers never get starved.

Ah, indeed.

Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
and vhost_poller() could increment the single counter and the flusher
can take bit 0 into account. But I agree 2 counters are much more clean.

> >> +static int vhost_poller(void *data)
> >> +{
> >> +	struct vhost_dev *dev = data;
> >> +	struct vhost_poll *poll;
> >> +
> >> +repeat:
> >> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> >
> > I don't understand the comment... why do we need this barrier?
>
> So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
> visible to kthread_should_stop() or task state is set to RUNNING.

Of course, you are right. I am really surprised I asked this question ;)

Thanks,

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 15:31                             ` Oleg Nesterov
@ 2010-05-31 15:38                               ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-05-31 15:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/31/2010 05:31 PM, Oleg Nesterov wrote:
>> I might have slightly over engineered this part not knowing the
>> expected workload.  ->queue_seq/->done_seq pair is to guarantee that
>> flushers never get starved.
> 
> Ah, indeed.
> 
> Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
> and vhost_poller() could increment the single counter and the flusher
> can take bit 0 into account. But I agree 2 counters are much more clean.

Right, we can do that too.  Cool. :-)

>>>> +static int vhost_poller(void *data)
>>>> +{
>>>> +	struct vhost_dev *dev = data;
>>>> +	struct vhost_poll *poll;
>>>> +
>>>> +repeat:
>>>> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
>>>
>>> I don't understand the comment... why do we need this barrier?
>>
>> So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
>> visible to kthread_should_stop() or task state is set to RUNNING.
> 
> Of course, you are right. I am really surprized I asked this question ;)

This part is always a bit tricky tho.  Maybe it would be a good idea
to make kthread_stop() do periodic wakeups.  It's already injecting
one rather unexpected wake up into the mix anyway so there isn't much
point in avoiding multiple and it would make designing kthread loops
easier.  Or maybe what we need is something like kthread_idle() which
encapsulates the check and sleep part.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 15:22                         ` Michael S. Tsirkin
@ 2010-05-31 15:45                           ` Tejun Heo
  2010-05-31 16:00                             ` Michael S. Tsirkin
  2010-06-01 14:13                           ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Paul E. McKenney
  1 sibling, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-05-31 15:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
> On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
>> Replace vhost_workqueue with per-vhost kthread.  Other than callback
>> argument change from struct work_struct * to struct vhost_poll *,
>> there's no visible change to vhost_poll_*() interface.
> 
> I would prefer a substructure vhost_work, even just to make
> the code easier to review and compare to workqueue.c.

Yeap, sure.

>> The problem is that I have no idea how to test this.
> 
> It's a 3 step process:
...
> You should now be able to ping guest to host and back.
> Use something like netperf to stress the connection.
> Close qemu with kill -9 and unload module to test flushing code.

Thanks for the instruction.  I'll see if there's a way to do it
without building qemu myself on opensuse.  But please feel free to go
ahead and test it.  It might just work!  :-)

>> +	if (poll) {
>> +		__set_current_state(TASK_RUNNING);
>> +		poll->fn(poll);
>> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
>> +		poll->done_seq = poll->queue_seq;
>> +		wake_up_all(&poll->done);
> 

> This seems to add wakeups on data path, which uses spinlocks etc.
> OTOH workqueue.c adds a special barrier entry which only does a
> wakeup when needed.  Right?

Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
in most cases.  Do you think that would be necessary?

>> -void vhost_cleanup(void)
>> -{
>> -	destroy_workqueue(vhost_workqueue);
> 
> I note that destroy_workqueue does a flush, kthread_stop
> doesn't. Right? Sure we don't need to check nothing is in one of
> the lists? Maybe add a BUG_ON?

There were a bunch of flushes before kthread_stop() and they seemed to
stop and flush everything.  Aren't they enough?  We can definitely add
BUG_ON() after kthread_should_stop() check succeeds either way tho.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 15:45                           ` Tejun Heo
@ 2010-05-31 16:00                             ` Michael S. Tsirkin
  2010-06-01  9:34                               ` Tejun Heo
                                                 ` (2 more replies)
  0 siblings, 3 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-05-31 16:00 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, May 31, 2010 at 05:45:07PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
> > On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> >> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> >> argument change from struct work_struct * to struct vhost_poll *,
> >> there's no visible change to vhost_poll_*() interface.
> > 
> > I would prefer a substructure vhost_work, even just to make
> > the code easier to review and compare to workqueue.c.
> 
> Yeap, sure.
> 
> >> The problem is that I have no idea how to test this.
> > 
> > It's a 3 step process:
> ...
> > You should now be able to ping guest to host and back.
> > Use something like netperf to stress the connection.
> > Close qemu with kill -9 and unload module to test flushing code.
> 
> Thanks for the instruction.  I'll see if there's a way to do it
> without building qemu myself on opensuse.

My guess is no, there was no stable qemu release with vhost net support
yet.  Building it is mostly configure/make/make install,
as far as I remember you only need devel versions of X libraries,
SDL and curses installed.

>  But please feel free to go
> ahead and test it.  It might just work!  :-)
> 
> >> +	if (poll) {
> >> +		__set_current_state(TASK_RUNNING);
> >> +		poll->fn(poll);
> >> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> >> +		poll->done_seq = poll->queue_seq;
> >> +		wake_up_all(&poll->done);
> > 
> 
> > This seems to add wakeups on data path, which uses spinlocks etc.
> > OTOH workqueue.c adds a special barrier entry which only does a
> > wakeup when needed.  Right?
> 
> Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
> in most cases.  Do you think that would be necessary?

My guess is yes. This is really very hot path in code, and we are
close to 100% CPU in some benchmarks.

> >> -void vhost_cleanup(void)
> >> -{
> >> -	destroy_workqueue(vhost_workqueue);
> > 
> > I note that destroy_workqueue does a flush, kthread_stop
> > doesn't. Right? Sure we don't need to check nothing is in one of
> > the lists? Maybe add a BUG_ON?
> 
> There were a bunch of flushes before kthread_stop() and they seemed to
> stop and flush everything.  Aren't they enough?

I was just asking, I'll need to review the code in depth.

> We can definitely add
> BUG_ON() after kthread_should_stop() check succeeds either way tho.
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 16:00                             ` Michael S. Tsirkin
@ 2010-06-01  9:34                               ` Tejun Heo
  2010-06-02 18:40                                 ` [PATCH UPDATED " Tejun Heo
  2010-06-01  9:34                               ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
  2010-06-01  9:35                               ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers Tejun Heo
  2 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-06-01  9:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Replace vhost_workqueue with per-vhost kthread.  Other than callback
argument change from struct work_struct * to struct vhost_work *,
there's no visible change to vhost_poll_*() interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

* Updated to use sub structure vhost_work instead of directly using
  vhost_poll at Michael's suggestion.

* Added flusher wake_up() optimization at Michael's suggestion.

NOT_SIGNED_OFF_YET
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
---
Okay, here's the updated version.  I'm still in the process of setting
up the testing environment.  I'll be traveling from this afternoon
till tomorrow so I don't think I'll be able to test it before that.
If you can give it a shot, it would be great.

Thanks.

 drivers/vhost/net.c   |   56 ++++++++++---------------
 drivers/vhost/vhost.c |  110 ++++++++++++++++++++++++++++++++++++++------------
 drivers/vhost/vhost.h |   38 +++++++++++------
 3 files changed, 133 insertions(+), 71 deletions(-)

Index: work/drivers/vhost/net.c
===================================================================
--- work.orig/drivers/vhost/net.c
+++ work/drivers/vhost/net.c
@@ -294,54 +294,58 @@ static void handle_rx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }

-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }

-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }

-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_TX].work);
 	handle_tx(net);
 }

-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_RX].work);
 	handle_rx(net);
 }

 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}

-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

 	f->private_data = n;
@@ -644,25 +648,13 @@ static struct miscdevice vhost_net_misc

 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);

 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -17,12 +17,12 @@
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -37,8 +37,6 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };

-static struct workqueue_struct *vhost_workqueue;
-
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -52,23 +50,31 @@ static void vhost_poll_func(struct file
 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 			     void *key)
 {
-	struct vhost_poll *poll;
-	poll = container_of(wait, struct vhost_poll, wait);
+	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
 	if (!((unsigned long)key & poll->mask))
 		return 0;

-	queue_work(vhost_workqueue, &poll->work);
+	vhost_poll_queue(poll);
 	return 0;
 }

 /* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev)
 {
-	INIT_WORK(&poll->work, func);
+	struct vhost_work *work = &poll->work;
+
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
+	poll->dev = dev;
+
+	INIT_LIST_HEAD(&work->node);
+	work->fn = fn;
+	init_waitqueue_head(&work->done);
+	atomic_set(&work->flushing, 0);
+	work->queue_seq = work->done_seq = 0;
 }

 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +98,28 @@ void vhost_poll_stop(struct vhost_poll *
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	flush_work(&poll->work);
+	struct vhost_work *work = &poll->work;
+	int seq = work->queue_seq;
+
+	atomic_inc(&work->flushing);
+	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
+	wait_event(work->done, seq - work->done_seq <= 0);
+	atomic_dec(&work->flushing);
+	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
 }

 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	queue_work(vhost_workqueue, &poll->work);
+	struct vhost_dev *dev = poll->dev;
+	struct vhost_work *work = &poll->work;
+
+	spin_lock(&dev->work_lock);
+	if (list_empty(&work->node)) {
+		list_add_tail(&work->node, &dev->work_list);
+		work->queue_seq++;
+		wake_up_process(dev->worker);
+	}
+	spin_unlock(&dev->work_lock);
 }

 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -125,10 +147,52 @@ static void vhost_vq_reset(struct vhost_
 	vq->log_ctx = NULL;
 }

+static int vhost_worker(void *data)
+{
+	struct vhost_dev *dev = data;
+	struct vhost_work *work;
+
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	work = NULL;
+	spin_lock(&dev->work_lock);
+	if (!list_empty(&dev->work_list)) {
+		work = list_first_entry(&dev->work_list,
+					struct vhost_work, node);
+		list_del_init(&work->node);
+	}
+	spin_unlock(&dev->work_lock);
+
+	if (work) {
+		__set_current_state(TASK_RUNNING);
+		work->fn(work);
+		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
+		work->done_seq = work->queue_seq;
+		smp_mb();	/* mb worker-b1 paired with flush-b0 */
+		if (atomic_read(&work->flushing))
+			wake_up_all(&work->done);
+	} else
+		schedule();
+
+	goto repeat;
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
+	struct task_struct *worker;
 	int i;
+
+	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+	if (IS_ERR(worker))
+		return PTR_ERR(worker);
+
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
 	mutex_init(&dev->mutex);
@@ -136,6 +200,9 @@ long vhost_dev_init(struct vhost_dev *de
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
+	spin_lock_init(&dev->work_lock);
+	INIT_LIST_HEAD(&dev->work_list);
+	dev->worker = worker;

 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].dev = dev;
@@ -143,9 +210,10 @@ long vhost_dev_init(struct vhost_dev *de
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick,
-					POLLIN);
+					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
+
+	wake_up_process(worker);	/* avoid contributing to loadavg */
 	return 0;
 }

@@ -217,6 +285,9 @@ void vhost_dev_cleanup(struct vhost_dev
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+
+	WARN_ON(!list_empty(&dev->work_list));
+	kthread_stop(dev->worker);
 }

 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -1113,16 +1184,3 @@ void vhost_disable_notify(struct vhost_v
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
-
-int vhost_init(void)
-{
-	vhost_workqueue = create_singlethread_workqueue("vhost");
-	if (!vhost_workqueue)
-		return -ENOMEM;
-	return 0;
-}
-
-void vhost_cleanup(void)
-{
-	destroy_workqueue(vhost_workqueue);
-}
Index: work/drivers/vhost/vhost.h
===================================================================
--- work.orig/drivers/vhost/vhost.h
+++ work/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
 #include <linux/vhost.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/skbuff.h>
 #include <linux/uio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
+#include <asm/atomic.h>

 struct vhost_device;

@@ -20,19 +20,31 @@ enum {
 	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
 };

+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+	struct list_head	  node;
+	vhost_work_fn_t		  fn;
+	wait_queue_head_t	  done;
+	atomic_t		  flushing;
+	int			  queue_seq;
+	int			  done_seq;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
 	poll_table                table;
 	wait_queue_head_t        *wqh;
 	wait_queue_t              wait;
-	/* struct which will handle all actual work. */
-	struct work_struct        work;
+	struct vhost_work	  work;
 	unsigned long		  mask;
+	struct vhost_dev	 *dev;
 };

-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
 	struct vhost_poll poll;

 	/* The routine to call when the Guest pings us, or timeout. */
-	work_func_t handle_kick;
+	vhost_work_fn_t handle_kick;

 	/* Last available index we saw. */
 	u16 last_avail_idx;
@@ -86,11 +98,11 @@ struct vhost_virtqueue {
 	struct iovec hdr[VHOST_NET_MAX_SG];
 	size_t hdr_size;
 	/* We use a kind of RCU to access private pointer.
-	 * All readers access it from workqueue, which makes it possible to
-	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * All readers access it from worker, which makes it possible to
+	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
 	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
-	 * work item execution acts instead of rcu_read_lock() and the end of
-	 * work item execution acts instead of rcu_read_lock().
+	 * vhost_work execution acts instead of rcu_read_lock() and the end of
+	 * vhost_work execution acts instead of rcu_read_unlock().
 	 * Writers use virtqueue mutex. */
 	void *private_data;
 	/* Log write descriptors */
@@ -110,6 +122,9 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
 };

 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_vi
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);

-int vhost_init(void);
-void vhost_cleanup(void);
-
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
 		if ((vq)->error_ctx)                               \

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup
  2010-05-31 16:00                             ` Michael S. Tsirkin
  2010-06-01  9:34                               ` Tejun Heo
@ 2010-06-01  9:34                               ` Tejun Heo
  2010-06-01  9:35                               ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers Tejun Heo
  2 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-06-01  9:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

From: Sridhar Samudrala <samudrala.sridhar@gmail.com>

Add a new kernel API to attach a task to current task's cgroup
in all the active hierarchies.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Reviewed-by: Paul Menage <menage@google.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |    1 +
 kernel/cgroup.c        |   23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+)

Index: work/include/linux/cgroup.h
===================================================================
--- work.orig/include/linux/cgroup.h
+++ work/include/linux/cgroup.h
@@ -570,6 +570,7 @@ struct task_struct *cgroup_iter_next(str
 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
+int cgroup_attach_task_current_cg(struct task_struct *);

 /*
  * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
Index: work/kernel/cgroup.c
===================================================================
--- work.orig/kernel/cgroup.c
+++ work/kernel/cgroup.c
@@ -1788,6 +1788,29 @@ out:
 	return retval;
 }

+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	struct cgroup *cur_cg;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		cur_cg = task_cgroup_from_root(current, root);
+		retval = cgroup_attach_task(cur_cg, tsk);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
 /*
  * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
  * held. May take task_lock of task

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
  2010-05-31 16:00                             ` Michael S. Tsirkin
  2010-06-01  9:34                               ` Tejun Heo
  2010-06-01  9:34                               ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
@ 2010-06-01  9:35                               ` Tejun Heo
  2010-06-01 10:17                                 ` Michael S. Tsirkin
  2010-06-01 17:19                                 ` Sridhar Samudrala
  2 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-06-01  9:35 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Apply the cpumask and cgroup of the initializing task to the created
vhost worker.

Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
path (twice), fixed (twice).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 drivers/vhost/vhost.c |   34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -187,11 +188,29 @@ long vhost_dev_init(struct vhost_dev *de
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
 	struct task_struct *worker;
-	int i;
+	cpumask_var_t mask;
+	int i, ret = -ENOMEM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto out_free_mask;

 	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
-	if (IS_ERR(worker))
-		return PTR_ERR(worker);
+	if (IS_ERR(worker)) {
+		ret = PTR_ERR(worker);
+		goto out_free_mask;
+	}
+
+	ret = sched_getaffinity(current->pid, mask);
+	if (ret)
+		goto out_stop_worker;
+
+	ret = sched_setaffinity(worker->pid, mask);
+	if (ret)
+		goto out_stop_worker;
+
+	ret = cgroup_attach_task_current_cg(worker);
+	if (ret)
+		goto out_stop_worker;

 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
@@ -214,7 +233,14 @@ long vhost_dev_init(struct vhost_dev *de
 	}

 	wake_up_process(worker);	/* avoid contributing to loadavg */
-	return 0;
+	ret = 0;
+	goto out_free_mask;
+
+out_stop_worker:
+	kthread_stop(worker);
+out_free_mask:
+	free_cpumask_var(mask);
+	return ret;
 }

 /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
  2010-06-01  9:35                               ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers Tejun Heo
@ 2010-06-01 10:17                                 ` Michael S. Tsirkin
  2010-06-01 10:56                                   ` Tejun Heo
  2010-06-01 17:19                                 ` Sridhar Samudrala
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-06-01 10:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Tue, Jun 01, 2010 at 11:35:15AM +0200, Tejun Heo wrote:
> Apply the cpumask and cgroup of the initializing task to the created
> vhost worker.
> 
> Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
> path (twice), fixed (twice).
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> Cc: Li Zefan <lizf@cn.fujitsu.com>

Something that I wanted to figure out - what happens if the
CPU mask limits us to a certain CPU that subsequently goes offline?
Will e.g. flush block forever or until that CPU comes back?
Also, does singlethreaded workqueue behave in the same way?

> ---
>  drivers/vhost/vhost.c |   34 ++++++++++++++++++++++++++++++----
>  1 file changed, 30 insertions(+), 4 deletions(-)
> 
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -23,6 +23,7 @@
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/cgroup.h>
> 
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -187,11 +188,29 @@ long vhost_dev_init(struct vhost_dev *de
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
>  	struct task_struct *worker;
> -	int i;
> +	cpumask_var_t mask;
> +	int i, ret = -ENOMEM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto out_free_mask;
> 
>  	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(worker))
> -		return PTR_ERR(worker);
> +	if (IS_ERR(worker)) {
> +		ret = PTR_ERR(worker);
> +		goto out_free_mask;
> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out_stop_worker;
> +
> +	ret = sched_setaffinity(worker->pid, mask);
> +	if (ret)
> +		goto out_stop_worker;
> +
> +	ret = cgroup_attach_task_current_cg(worker);
> +	if (ret)
> +		goto out_stop_worker;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -214,7 +233,14 @@ long vhost_dev_init(struct vhost_dev *de
>  	}
> 
>  	wake_up_process(worker);	/* avoid contributing to loadavg */
> -	return 0;
> +	ret = 0;
> +	goto out_free_mask;
> +
> +out_stop_worker:
> +	kthread_stop(worker);
> +out_free_mask:
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
  2010-06-01 10:17                                 ` Michael S. Tsirkin
@ 2010-06-01 10:56                                   ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-06-01 10:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 06/01/2010 12:17 PM, Michael S. Tsirkin wrote:
> Something that I wanted to figure out - what happens if the
> CPU mask limits us to a certain CPU that subsequently goes offline?

The thread gets unbound during the last steps of cpu offlining.

> Will e.g. flush block forever or until that CPU comes back?
> Also, does singlethreaded workqueue behave in the same way?

So, things will proceed as usual although the thread will lose its
affinity.  Singlethread wqs don't bind their workers (and they
shouldn't! :-).  MT ones explicitly manage workers according to cpu
up/down events.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-05-31 15:22                         ` Michael S. Tsirkin
  2010-05-31 15:45                           ` Tejun Heo
@ 2010-06-01 14:13                           ` Paul E. McKenney
  1 sibling, 0 replies; 115+ messages in thread
From: Paul E. McKenney @ 2010-06-01 14:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Tejun Heo, Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, May 31, 2010 at 06:22:21PM +0300, Michael S. Tsirkin wrote:
> On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> > Replace vhost_workqueue with per-vhost kthread.  Other than callback
> > argument change from struct work_struct * to struct vhost_poll *,
> > there's no visible change to vhost_poll_*() interface.
> 
> I would prefer a substructure vhost_work, even just to make
> the code easier to review and compare to workqueue.c.

Either way this plays out, the rcu_dereference_check() calls will need
to be adjusted to reflect the change.

							Thanx, Paul

> > This conversion is to make each vhost use a dedicated kthread so that
> > resource control via cgroup can be applied.
> > 
> > Partially based on Sridhar Samudrala's patch.
> > 
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> > ---
> > Okay, here is three patch series to convert vhost to use per-vhost
> > kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
> > kthreads.  The conversion is mostly straight forward although flush is
> > slightly tricky.
> > 
> > The problem is that I have no idea how to test this.
> 
> It's a 3 step process:
> 
> 1. 
> Install qemu-kvm under fc13, or build recent one from source,
> get it from here:
> git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git
> 
> 2. install guest under it:
> qemu-img create -f qcow2 disk.qcow2 100G
> Now get some image (e.g. fedora 13 in fc13.iso)
> and install guest:
> qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
> 
> 
> 3. set up networking. I usually simply do host to guest 
> on a special subnet for testing purposes:
> 
> Set up a bridge named mstbr0:
> 
> ifconfig mstbr0 down
> brctl delbr mstbr0
> brctl addbr mstbr0
> brctl setfd mstbr0 0
> ifconfig mstbr0 11.0.0.1
> 
> cat > ifup << EOF
> #!/bin/sh -x
> /sbin/ifconfig msttap0 0.0.0.0 up
> brctl addif mstbr0 msttap0
> EOF
> 
> 
> qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
>  -net nic,model=virtio,netdev=foo -netdev
> tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on
> 
> after you set up the guest, log into it and
> ifconfig eth0 11.0.0.2
> 
> You should now be able to ping guest to host and back.
> Use something like netperf to stress the connection.
> Close qemu with kill -9 and unload module to test flushing code.
> 
> 
> 
> > Index: work/drivers/vhost/vhost.c
> > ===================================================================
> > --- work.orig/drivers/vhost/vhost.c
> > +++ work/drivers/vhost/vhost.c
> 
> ...
> 
> > @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
> >  	vq->log_ctx = NULL;
> >  }
> > 
> > +static int vhost_poller(void *data)
> > +{
> > +	struct vhost_dev *dev = data;
> > +	struct vhost_poll *poll;
> > +
> > +repeat:
> > +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> > +
> > +	if (kthread_should_stop()) {
> > +		__set_current_state(TASK_RUNNING);
> > +		return 0;
> > +	}
> > +
> > +	poll = NULL;
> > +	spin_lock(&dev->poller_lock);
> > +	if (!list_empty(&dev->poll_list)) {
> > +		poll = list_first_entry(&dev->poll_list,
> > +					struct vhost_poll, node);
> > +		list_del_init(&poll->node);
> > +	}
> > +	spin_unlock(&dev->poller_lock);
> > +
> > +	if (poll) {
> > +		__set_current_state(TASK_RUNNING);
> > +		poll->fn(poll);
> > +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> > +		poll->done_seq = poll->queue_seq;
> > +		wake_up_all(&poll->done);
> 
> 
> This seems to add wakeups on data path, which uses spinlocks etc.
> OTOH workqueue.c adds a special barrier
> entry which only does a wakeup when needed.
> Right?
> 
> > +	} else
> > +		schedule();
> > +
> > +	goto repeat;
> > +}
> > +
> >  long vhost_dev_init(struct vhost_dev *dev,
> >  		    struct vhost_virtqueue *vqs, int nvqs)
> >  {
> > +	struct task_struct *poller;
> >  	int i;
> > +
> > +	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> > +	if (IS_ERR(poller))
> > +		return PTR_ERR(poller);
> > +
> >  	dev->vqs = vqs;
> >  	dev->nvqs = nvqs;
> >  	mutex_init(&dev->mutex);
> > @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
> >  	dev->log_file = NULL;
> >  	dev->memory = NULL;
> >  	dev->mm = NULL;
> > +	spin_lock_init(&dev->poller_lock);
> > +	INIT_LIST_HEAD(&dev->poll_list);
> > +	dev->poller = poller;
> > 
> >  	for (i = 0; i < dev->nvqs; ++i) {
> >  		dev->vqs[i].dev = dev;
> > @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
> >  		vhost_vq_reset(dev, dev->vqs + i);
> >  		if (dev->vqs[i].handle_kick)
> >  			vhost_poll_init(&dev->vqs[i].poll,
> > -					dev->vqs[i].handle_kick,
> > -					POLLIN);
> > +					dev->vqs[i].handle_kick, POLLIN, dev);
> >  	}
> >  	return 0;
> >  }
> > @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
> >  	if (dev->mm)
> >  		mmput(dev->mm);
> >  	dev->mm = NULL;
> > +
> > +	kthread_stop(dev->poller);
> >  }
> > 
> >  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> > @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
> >  		vq_err(vq, "Failed to enable notification at %p: %d\n",
> >  		       &vq->used->flags, r);
> >  }
> > -
> > -int vhost_init(void)
> > -{
> > -	vhost_workqueue = create_singlethread_workqueue("vhost");
> > -	if (!vhost_workqueue)
> > -		return -ENOMEM;
> > -	return 0;
> > -}
> > -
> > -void vhost_cleanup(void)
> > -{
> > -	destroy_workqueue(vhost_workqueue);
> 
> I note that destroy_workqueue does a flush, while kthread_stop
> doesn't. Right? Are we sure we don't need to check that nothing is
> left in one of the lists? Maybe add a BUG_ON?
> 
> > -}
> > Index: work/drivers/vhost/vhost.h
> > ===================================================================
> > --- work.orig/drivers/vhost/vhost.h
> > +++ work/drivers/vhost/vhost.h
> > @@ -5,7 +5,6 @@
> >  #include <linux/vhost.h>
> >  #include <linux/mm.h>
> >  #include <linux/mutex.h>
> > -#include <linux/workqueue.h>
> >  #include <linux/poll.h>
> >  #include <linux/file.h>
> >  #include <linux/skbuff.h>
> > @@ -20,19 +19,26 @@ enum {
> >  	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
> >  };
> > 
> > +struct vhost_poll;
> > +typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll);
> > +
> >  /* Poll a file (eventfd or socket) */
> >  /* Note: there's nothing vhost specific about this structure. */
> >  struct vhost_poll {
> > +	vhost_poll_fn_t		  fn;
> >  	poll_table                table;
> >  	wait_queue_head_t        *wqh;
> >  	wait_queue_t              wait;
> > -	/* struct which will handle all actual work. */
> > -	struct work_struct        work;
> > +	struct list_head	  node;
> > +	wait_queue_head_t	  done;
> >  	unsigned long		  mask;
> > +	struct vhost_dev	 *dev;
> > +	int			  queue_seq;
> > +	int			  done_seq;
> >  };
> > 
> > -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> > -		     unsigned long mask);
> > +void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
> > +		     unsigned long mask, struct vhost_dev *dev);
> >  void vhost_poll_start(struct vhost_poll *poll, struct file *file);
> >  void vhost_poll_stop(struct vhost_poll *poll);
> >  void vhost_poll_flush(struct vhost_poll *poll);
> > @@ -63,7 +69,7 @@ struct vhost_virtqueue {
> >  	struct vhost_poll poll;
> > 
> >  	/* The routine to call when the Guest pings us, or timeout. */
> > -	work_func_t handle_kick;
> > +	vhost_poll_fn_t handle_kick;
> > 
> >  	/* Last available index we saw. */
> >  	u16 last_avail_idx;
> > @@ -86,11 +92,11 @@ struct vhost_virtqueue {
> >  	struct iovec hdr[VHOST_NET_MAX_SG];
> >  	size_t hdr_size;
> >  	/* We use a kind of RCU to access private pointer.
> > -	 * All readers access it from workqueue, which makes it possible to
> > -	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
> > +	 * All readers access it from poller, which makes it possible to
> > +	 * flush the vhost_poll instead of synchronize_rcu. Therefore readers do
> >  	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
> > -	 * work item execution acts instead of rcu_read_lock() and the end of
> > -	 * work item execution acts instead of rcu_read_lock().
> > +	 * vhost_poll execution acts instead of rcu_read_lock() and the end of
> > +	 * vhost_poll execution acts instead of rcu_read_unlock().
> >  	 * Writers use virtqueue mutex. */
> >  	void *private_data;
> >  	/* Log write descriptors */
> > @@ -110,6 +116,9 @@ struct vhost_dev {
> >  	int nvqs;
> >  	struct file *log_file;
> >  	struct eventfd_ctx *log_ctx;
> > +	spinlock_t poller_lock;
> > +	struct list_head poll_list;
> > +	struct task_struct *poller;
> >  };
> > 
> >  long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> > @@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi
> >  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
> >  		    unsigned int log_num, u64 len);
> > 
> > -int vhost_init(void);
> > -void vhost_cleanup(void);
> > -
> >  #define vq_err(vq, fmt, ...) do {                                  \
> >  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> >  		if ((vq)->error_ctx)                               \
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
  2010-06-01  9:35                               ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers Tejun Heo
  2010-06-01 10:17                                 ` Michael S. Tsirkin
@ 2010-06-01 17:19                                 ` Sridhar Samudrala
  2010-06-01 23:59                                   ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-06-01 17:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Oleg Nesterov, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Tue, 2010-06-01 at 11:35 +0200, Tejun Heo wrote:
> Apply the cpumask and cgroup of the initializing task to the created
> vhost worker.
> 
> Based on Sridhar Samudrala's patch.  Li Zefan spotted a bug in error
> path (twice), fixed (twice).
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> Cc: Li Zefan <lizf@cn.fujitsu.com>
> ---
>  drivers/vhost/vhost.c |   34 ++++++++++++++++++++++++++++++----
>  1 file changed, 30 insertions(+), 4 deletions(-)
> 
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -23,6 +23,7 @@
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/cgroup.h>
> 
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -187,11 +188,29 @@ long vhost_dev_init(struct vhost_dev *de
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
>  	struct task_struct *worker;
> -	int i;
> +	cpumask_var_t mask;
> +	int i, ret = -ENOMEM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto out_free_mask;

I think this is another bug in the error path. You should simply
do a return instead of a goto here when alloc_cpumask_var() fails.

Thanks
Sridhar
> 
>  	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(worker))
> -		return PTR_ERR(worker);
> +	if (IS_ERR(worker)) {
> +		ret = PTR_ERR(worker);
> +		goto out_free_mask;
> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out_stop_worker;
> +
> +	ret = sched_setaffinity(worker->pid, mask);
> +	if (ret)
> +		goto out_stop_worker;
> +
> +	ret = cgroup_attach_task_current_cg(worker);
> +	if (ret)
> +		goto out_stop_worker;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -214,7 +233,14 @@ long vhost_dev_init(struct vhost_dev *de
>  	}
> 
>  	wake_up_process(worker);	/* avoid contributing to loadavg */
> -	return 0;
> +	ret = 0;
> +	goto out_free_mask;
> +
> +out_stop_worker:
> +	kthread_stop(worker);
> +out_free_mask:
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers
  2010-06-01 17:19                                 ` Sridhar Samudrala
@ 2010-06-01 23:59                                   ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-06-01 23:59 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Oleg Nesterov, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 06/01/2010 07:19 PM, Sridhar Samudrala wrote:
>> -	int i;
>> +	cpumask_var_t mask;
>> +	int i, ret = -ENOMEM;
>> +
>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>> +		goto out_free_mask;
> 
> I think this is another bug in the error path. You should simply
> do a return instead of a goto here when alloc_cpumask_var() fails.

Oh... it's always safe to call free_cpumask_var() after failed
alloc_cpumask_var(), so that part isn't broken.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-06-01  9:34                               ` Tejun Heo
@ 2010-06-02 18:40                                 ` Tejun Heo
  2010-06-02 21:34                                   ` Sridhar Samudrala
  2010-07-22 15:58                                   ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-06-02 18:40 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Replace vhost_workqueue with per-vhost kthread.  Other than callback
argument change from struct work_struct * to struct vhost_work *,
there's no visible change to vhost_poll_*() interface.

This conversion is to make each vhost use a dedicated kthread so that
resource control via cgroup can be applied.

Partially based on Sridhar Samudrala's patch.

* Updated to use sub structure vhost_work instead of directly using
  vhost_poll at Michael's suggestion.

* Added flusher wake_up() optimization at Michael's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
---
Okay, just tested it.  dev->work_lock had to be updated to use irq
operations but other than that it worked just fine.  Copied a large
file using scp and it seems to perform pretty well although I don't
have any reference of comparison.  So, here's the updated version with
the sign off.

Thanks.

 drivers/vhost/net.c   |   56 ++++++++++---------------
 drivers/vhost/vhost.c |  111 ++++++++++++++++++++++++++++++++++++++------------
 drivers/vhost/vhost.h |   38 +++++++++++------
 3 files changed, 134 insertions(+), 71 deletions(-)

Index: work/drivers/vhost/net.c
===================================================================
--- work.orig/drivers/vhost/net.c
+++ work/drivers/vhost/net.c
@@ -294,54 +294,58 @@ static void handle_rx(struct vhost_net *
 	unuse_mm(net->dev.mm);
 }

-static void handle_tx_kick(struct work_struct *work)
+static void handle_tx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_tx(net);
 }

-static void handle_rx_kick(struct work_struct *work)
+static void handle_rx_kick(struct vhost_work *work)
 {
-	struct vhost_virtqueue *vq;
-	struct vhost_net *net;
-	vq = container_of(work, struct vhost_virtqueue, poll.work);
-	net = container_of(vq->dev, struct vhost_net, dev);
+	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+						  poll.work);
+	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
+
 	handle_rx(net);
 }

-static void handle_tx_net(struct work_struct *work)
+static void handle_tx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_TX].work);
 	handle_tx(net);
 }

-static void handle_rx_net(struct work_struct *work)
+static void handle_rx_net(struct vhost_work *work)
 {
-	struct vhost_net *net;
-	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
+	struct vhost_net *net = container_of(work, struct vhost_net,
+					     poll[VHOST_NET_VQ_RX].work);
 	handle_rx(net);
 }

 static int vhost_net_open(struct inode *inode, struct file *f)
 {
 	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_dev *dev;
 	int r;
+
 	if (!n)
 		return -ENOMEM;
+
+	dev = &n->dev;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
-	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
+	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
 		kfree(n);
 		return r;
 	}

-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
 	n->tx_poll_state = VHOST_NET_POLL_DISABLED;

 	f->private_data = n;
@@ -644,25 +648,13 @@ static struct miscdevice vhost_net_misc

 static int vhost_net_init(void)
 {
-	int r = vhost_init();
-	if (r)
-		goto err_init;
-	r = misc_register(&vhost_net_misc);
-	if (r)
-		goto err_reg;
-	return 0;
-err_reg:
-	vhost_cleanup();
-err_init:
-	return r;
-
+	return misc_register(&vhost_net_misc);
 }
 module_init(vhost_net_init);

 static void vhost_net_exit(void)
 {
 	misc_deregister(&vhost_net_misc);
-	vhost_cleanup();
 }
 module_exit(vhost_net_exit);

Index: work/drivers/vhost/vhost.c
===================================================================
--- work.orig/drivers/vhost/vhost.c
+++ work/drivers/vhost/vhost.c
@@ -17,12 +17,12 @@
 #include <linux/mm.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/rcupdate.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/kthread.h>

 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -37,8 +37,6 @@ enum {
 	VHOST_MEMORY_F_LOG = 0x1,
 };

-static struct workqueue_struct *vhost_workqueue;
-
 static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
@@ -52,23 +50,31 @@ static void vhost_poll_func(struct file
 static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
 			     void *key)
 {
-	struct vhost_poll *poll;
-	poll = container_of(wait, struct vhost_poll, wait);
+	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
+
 	if (!((unsigned long)key & poll->mask))
 		return 0;

-	queue_work(vhost_workqueue, &poll->work);
+	vhost_poll_queue(poll);
 	return 0;
 }

 /* Init poll structure */
-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask)
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev)
 {
-	INIT_WORK(&poll->work, func);
+	struct vhost_work *work = &poll->work;
+
 	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
 	init_poll_funcptr(&poll->table, vhost_poll_func);
 	poll->mask = mask;
+	poll->dev = dev;
+
+	INIT_LIST_HEAD(&work->node);
+	work->fn = fn;
+	init_waitqueue_head(&work->done);
+	atomic_set(&work->flushing, 0);
+	work->queue_seq = work->done_seq = 0;
 }

 /* Start polling a file. We add ourselves to file's wait queue. The caller must
@@ -92,12 +98,29 @@ void vhost_poll_stop(struct vhost_poll *
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-	flush_work(&poll->work);
+	struct vhost_work *work = &poll->work;
+	int seq = work->queue_seq;
+
+	atomic_inc(&work->flushing);
+	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
+	wait_event(work->done, seq - work->done_seq <= 0);
+	atomic_dec(&work->flushing);
+	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
 }

 void vhost_poll_queue(struct vhost_poll *poll)
 {
-	queue_work(vhost_workqueue, &poll->work);
+	struct vhost_dev *dev = poll->dev;
+	struct vhost_work *work = &poll->work;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->work_lock, flags);
+	if (list_empty(&work->node)) {
+		list_add_tail(&work->node, &dev->work_list);
+		work->queue_seq++;
+		wake_up_process(dev->worker);
+	}
+	spin_unlock_irqrestore(&dev->work_lock, flags);
 }

 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -125,10 +148,52 @@ static void vhost_vq_reset(struct vhost_
 	vq->log_ctx = NULL;
 }

+static int vhost_worker(void *data)
+{
+	struct vhost_dev *dev = data;
+	struct vhost_work *work;
+
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	work = NULL;
+	spin_lock_irq(&dev->work_lock);
+	if (!list_empty(&dev->work_list)) {
+		work = list_first_entry(&dev->work_list,
+					struct vhost_work, node);
+		list_del_init(&work->node);
+	}
+	spin_unlock_irq(&dev->work_lock);
+
+	if (work) {
+		__set_current_state(TASK_RUNNING);
+		work->fn(work);
+		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
+		work->done_seq = work->queue_seq;
+		smp_mb();	/* mb worker-b1 paired with flush-b0 */
+		if (atomic_read(&work->flushing))
+			wake_up_all(&work->done);
+	} else
+		schedule();
+
+	goto repeat;
+}
+
 long vhost_dev_init(struct vhost_dev *dev,
 		    struct vhost_virtqueue *vqs, int nvqs)
 {
+	struct task_struct *worker;
 	int i;
+
+	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
+	if (IS_ERR(worker))
+		return PTR_ERR(worker);
+
 	dev->vqs = vqs;
 	dev->nvqs = nvqs;
 	mutex_init(&dev->mutex);
@@ -136,6 +201,9 @@ long vhost_dev_init(struct vhost_dev *de
 	dev->log_file = NULL;
 	dev->memory = NULL;
 	dev->mm = NULL;
+	spin_lock_init(&dev->work_lock);
+	INIT_LIST_HEAD(&dev->work_list);
+	dev->worker = worker;

 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].dev = dev;
@@ -143,9 +211,10 @@ long vhost_dev_init(struct vhost_dev *de
 		vhost_vq_reset(dev, dev->vqs + i);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
-					dev->vqs[i].handle_kick,
-					POLLIN);
+					dev->vqs[i].handle_kick, POLLIN, dev);
 	}
+
+	wake_up_process(worker);	/* avoid contributing to loadavg */
 	return 0;
 }

@@ -217,6 +286,9 @@ void vhost_dev_cleanup(struct vhost_dev
 	if (dev->mm)
 		mmput(dev->mm);
 	dev->mm = NULL;
+
+	WARN_ON(!list_empty(&dev->work_list));
+	kthread_stop(dev->worker);
 }

 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
@@ -1113,16 +1185,3 @@ void vhost_disable_notify(struct vhost_v
 		vq_err(vq, "Failed to enable notification at %p: %d\n",
 		       &vq->used->flags, r);
 }
-
-int vhost_init(void)
-{
-	vhost_workqueue = create_singlethread_workqueue("vhost");
-	if (!vhost_workqueue)
-		return -ENOMEM;
-	return 0;
-}
-
-void vhost_cleanup(void)
-{
-	destroy_workqueue(vhost_workqueue);
-}
Index: work/drivers/vhost/vhost.h
===================================================================
--- work.orig/drivers/vhost/vhost.h
+++ work/drivers/vhost/vhost.h
@@ -5,13 +5,13 @@
 #include <linux/vhost.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 #include <linux/skbuff.h>
 #include <linux/uio.h>
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
+#include <asm/atomic.h>

 struct vhost_device;

@@ -20,19 +20,31 @@ enum {
 	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
 };

+struct vhost_work;
+typedef void (*vhost_work_fn_t)(struct vhost_work *work);
+
+struct vhost_work {
+	struct list_head	  node;
+	vhost_work_fn_t		  fn;
+	wait_queue_head_t	  done;
+	atomic_t		  flushing;
+	int			  queue_seq;
+	int			  done_seq;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
 	poll_table                table;
 	wait_queue_head_t        *wqh;
 	wait_queue_t              wait;
-	/* struct which will handle all actual work. */
-	struct work_struct        work;
+	struct vhost_work	  work;
 	unsigned long		  mask;
+	struct vhost_dev	 *dev;
 };

-void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
-		     unsigned long mask);
+void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
+		     unsigned long mask, struct vhost_dev *dev);
 void vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
@@ -63,7 +75,7 @@ struct vhost_virtqueue {
 	struct vhost_poll poll;

 	/* The routine to call when the Guest pings us, or timeout. */
-	work_func_t handle_kick;
+	vhost_work_fn_t handle_kick;

 	/* Last available index we saw. */
 	u16 last_avail_idx;
@@ -86,11 +98,11 @@ struct vhost_virtqueue {
 	struct iovec hdr[VHOST_NET_MAX_SG];
 	size_t hdr_size;
 	/* We use a kind of RCU to access private pointer.
-	 * All readers access it from workqueue, which makes it possible to
-	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
+	 * All readers access it from worker, which makes it possible to
+	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
 	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
-	 * work item execution acts instead of rcu_read_lock() and the end of
-	 * work item execution acts instead of rcu_read_lock().
+	 * vhost_work execution acts instead of rcu_read_lock() and the end of
+	 * vhost_work execution acts instead of rcu_read_unlock().
 	 * Writers use virtqueue mutex. */
 	void *private_data;
 	/* Log write descriptors */
@@ -110,6 +122,9 @@ struct vhost_dev {
 	int nvqs;
 	struct file *log_file;
 	struct eventfd_ctx *log_ctx;
+	spinlock_t work_lock;
+	struct list_head work_list;
+	struct task_struct *worker;
 };

 long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
@@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_vi
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
 		    unsigned int log_num, u64 len);

-int vhost_init(void);
-void vhost_cleanup(void);
-
 #define vq_err(vq, fmt, ...) do {                                  \
 		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
 		if ((vq)->error_ctx)                               \

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-06-02 18:40                                 ` [PATCH UPDATED " Tejun Heo
@ 2010-06-02 21:34                                   ` Sridhar Samudrala
  2010-07-22 15:58                                   ` Michael S. Tsirkin
  1 sibling, 0 replies; 115+ messages in thread
From: Sridhar Samudrala @ 2010-06-02 21:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Oleg Nesterov, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 6/2/2010 11:40 AM, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
>
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
>
> Partially based on Sridhar Samudrala's patch.
>
> * Updated to use sub structure vhost_work instead of directly using
>    vhost_poll at Michael's suggestion.
>
> * Added flusher wake_up() optimization at Michael's suggestion.
>
> Signed-off-by: Tejun Heo<tj@kernel.org>
> Cc: Michael S. Tsirkin<mst@redhat.com>
> Cc: Sridhar Samudrala<samudrala.sridhar@gmail.com>
> ---
> Okay, just tested it.  dev->work_lock had to be updated to use irq
> operations but other than that it worked just fine.  Copied a large
> file using scp and it seems to perform pretty well although I don't
> have any reference of comparison.  So, here's the updated version with
> the sign off.
>    

I tested this with 4 VMs running netperf TCP stream tests from guest to
host, and I am seeing a similar level of scalability in throughput to
what I saw with the multi-thread workqueue patch.
            11200Mb/s - default         (host cpu utilization: 40%)
            21600Mb/s - multi-thread (host cpu utilization: 86%)

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

Thanks
Sridhar
> Thanks.
>
>   drivers/vhost/net.c   |   56 ++++++++++---------------
>   drivers/vhost/vhost.c |  111 ++++++++++++++++++++++++++++++++++++++------------
>   drivers/vhost/vhost.h |   38 +++++++++++------
>   3 files changed, 134 insertions(+), 71 deletions(-)
>
> Index: work/drivers/vhost/net.c
> ===================================================================
> --- work.orig/drivers/vhost/net.c
> +++ work/drivers/vhost/net.c
> @@ -294,54 +294,58 @@ static void handle_rx(struct vhost_net *
>   	unuse_mm(net->dev.mm);
>   }
>
> -static void handle_tx_kick(struct work_struct *work)
> +static void handle_tx_kick(struct vhost_work *work)
>   {
> -	struct vhost_virtqueue *vq;
> -	struct vhost_net *net;
> -	vq = container_of(work, struct vhost_virtqueue, poll.work);
> -	net = container_of(vq->dev, struct vhost_net, dev);
> +	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> +						  poll.work);
> +	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
> +
>   	handle_tx(net);
>   }
>
> -static void handle_rx_kick(struct work_struct *work)
> +static void handle_rx_kick(struct vhost_work *work)
>   {
> -	struct vhost_virtqueue *vq;
> -	struct vhost_net *net;
> -	vq = container_of(work, struct vhost_virtqueue, poll.work);
> -	net = container_of(vq->dev, struct vhost_net, dev);
> +	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> +						  poll.work);
> +	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
> +
>   	handle_rx(net);
>   }
>
> -static void handle_tx_net(struct work_struct *work)
> +static void handle_tx_net(struct vhost_work *work)
>   {
> -	struct vhost_net *net;
> -	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
> +	struct vhost_net *net = container_of(work, struct vhost_net,
> +					     poll[VHOST_NET_VQ_TX].work);
>   	handle_tx(net);
>   }
>
> -static void handle_rx_net(struct work_struct *work)
> +static void handle_rx_net(struct vhost_work *work)
>   {
> -	struct vhost_net *net;
> -	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
> +	struct vhost_net *net = container_of(work, struct vhost_net,
> +					     poll[VHOST_NET_VQ_RX].work);
>   	handle_rx(net);
>   }
>
>   static int vhost_net_open(struct inode *inode, struct file *f)
>   {
>   	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
> +	struct vhost_dev *dev;
>   	int r;
> +
>   	if (!n)
>   		return -ENOMEM;
> +
> +	dev =&n->dev;
>   	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
>   	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
> -	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
> +	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
>   	if (r<  0) {
>   		kfree(n);
>   		return r;
>   	}
>
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> -	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
>   	n->tx_poll_state = VHOST_NET_POLL_DISABLED;
>
>   	f->private_data = n;
> @@ -644,25 +648,13 @@ static struct miscdevice vhost_net_misc
>
>   static int vhost_net_init(void)
>   {
> -	int r = vhost_init();
> -	if (r)
> -		goto err_init;
> -	r = misc_register(&vhost_net_misc);
> -	if (r)
> -		goto err_reg;
> -	return 0;
> -err_reg:
> -	vhost_cleanup();
> -err_init:
> -	return r;
> -
> +	return misc_register(&vhost_net_misc);
>   }
>   module_init(vhost_net_init);
>
>   static void vhost_net_exit(void)
>   {
>   	misc_deregister(&vhost_net_misc);
> -	vhost_cleanup();
>   }
>   module_exit(vhost_net_exit);
>
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -17,12 +17,12 @@
>   #include<linux/mm.h>
>   #include<linux/miscdevice.h>
>   #include<linux/mutex.h>
> -#include<linux/workqueue.h>
>   #include<linux/rcupdate.h>
>   #include<linux/poll.h>
>   #include<linux/file.h>
>   #include<linux/highmem.h>
>   #include<linux/slab.h>
> +#include<linux/kthread.h>
>
>   #include<linux/net.h>
>   #include<linux/if_packet.h>
> @@ -37,8 +37,6 @@ enum {
>   	VHOST_MEMORY_F_LOG = 0x1,
>   };
>
> -static struct workqueue_struct *vhost_workqueue;
> -
>   static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
>   			    poll_table *pt)
>   {
> @@ -52,23 +50,31 @@ static void vhost_poll_func(struct file
>   static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
>   			     void *key)
>   {
> -	struct vhost_poll *poll;
> -	poll = container_of(wait, struct vhost_poll, wait);
> +	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
> +
>   	if (!((unsigned long)key&  poll->mask))
>   		return 0;
>
> -	queue_work(vhost_workqueue,&poll->work);
> +	vhost_poll_queue(poll);
>   	return 0;
>   }
>
>   /* Init poll structure */
> -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask)
> +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
> +		     unsigned long mask, struct vhost_dev *dev)
>   {
> -	INIT_WORK(&poll->work, func);
> +	struct vhost_work *work =&poll->work;
> +
>   	init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
>   	init_poll_funcptr(&poll->table, vhost_poll_func);
>   	poll->mask = mask;
> +	poll->dev = dev;
> +
> +	INIT_LIST_HEAD(&work->node);
> +	work->fn = fn;
> +	init_waitqueue_head(&work->done);
> +	atomic_set(&work->flushing, 0);
> +	work->queue_seq = work->done_seq = 0;
>   }
>
>   /* Start polling a file. We add ourselves to file's wait queue. The caller must
> @@ -92,12 +98,29 @@ void vhost_poll_stop(struct vhost_poll *
>    * locks that are also used by the callback. */
>   void vhost_poll_flush(struct vhost_poll *poll)
>   {
> -	flush_work(&poll->work);
> +	struct vhost_work *work =&poll->work;
> +	int seq = work->queue_seq;
> +
> +	atomic_inc(&work->flushing);
> +	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
> +	wait_event(work->done, seq - work->done_seq<= 0);
> +	atomic_dec(&work->flushing);
> +	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
>   }
>
>   void vhost_poll_queue(struct vhost_poll *poll)
>   {
> -	queue_work(vhost_workqueue,&poll->work);
> +	struct vhost_dev *dev = poll->dev;
> +	struct vhost_work *work =&poll->work;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&dev->work_lock, flags);
> +	if (list_empty(&work->node)) {
> +		list_add_tail(&work->node,&dev->work_list);
> +		work->queue_seq++;
> +		wake_up_process(dev->worker);
> +	}
> +	spin_unlock_irqrestore(&dev->work_lock, flags);
>   }
>
>   static void vhost_vq_reset(struct vhost_dev *dev,
> @@ -125,10 +148,52 @@ static void vhost_vq_reset(struct vhost_
>   	vq->log_ctx = NULL;
>   }
>
> +static int vhost_worker(void *data)
> +{
> +	struct vhost_dev *dev = data;
> +	struct vhost_work *work;
> +
> +repeat:
> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> +
> +	if (kthread_should_stop()) {
> +		__set_current_state(TASK_RUNNING);
> +		return 0;
> +	}
> +
> +	work = NULL;
> +	spin_lock_irq(&dev->work_lock);
> +	if (!list_empty(&dev->work_list)) {
> +		work = list_first_entry(&dev->work_list,
> +					struct vhost_work, node);
> +		list_del_init(&work->node);
> +	}
> +	spin_unlock_irq(&dev->work_lock);
> +
> +	if (work) {
> +		__set_current_state(TASK_RUNNING);
> +		work->fn(work);
> +		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
> +		work->done_seq = work->queue_seq;
> +		smp_mb();	/* mb worker-b1 paired with flush-b0 */
> +		if (atomic_read(&work->flushing))
> +			wake_up_all(&work->done);
> +	} else
> +		schedule();
> +
> +	goto repeat;
> +}
> +
>   long vhost_dev_init(struct vhost_dev *dev,
>   		    struct vhost_virtqueue *vqs, int nvqs)
>   {
> +	struct task_struct *worker;
>   	int i;
> +
> +	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
> +	if (IS_ERR(worker))
> +		return PTR_ERR(worker);
> +
>   	dev->vqs = vqs;
>   	dev->nvqs = nvqs;
>   	mutex_init(&dev->mutex);
> @@ -136,6 +201,9 @@ long vhost_dev_init(struct vhost_dev *de
>   	dev->log_file = NULL;
>   	dev->memory = NULL;
>   	dev->mm = NULL;
> +	spin_lock_init(&dev->work_lock);
> +	INIT_LIST_HEAD(&dev->work_list);
> +	dev->worker = worker;
>
>   	for (i = 0; i<  dev->nvqs; ++i) {
>   		dev->vqs[i].dev = dev;
> @@ -143,9 +211,10 @@ long vhost_dev_init(struct vhost_dev *de
>   		vhost_vq_reset(dev, dev->vqs + i);
>   		if (dev->vqs[i].handle_kick)
>   			vhost_poll_init(&dev->vqs[i].poll,
> -					dev->vqs[i].handle_kick,
> -					POLLIN);
> +					dev->vqs[i].handle_kick, POLLIN, dev);
>   	}
> +
> +	wake_up_process(worker);	/* avoid contributing to loadavg */
>   	return 0;
>   }
>
> @@ -217,6 +286,9 @@ void vhost_dev_cleanup(struct vhost_dev
>   	if (dev->mm)
>   		mmput(dev->mm);
>   	dev->mm = NULL;
> +
> +	WARN_ON(!list_empty(&dev->work_list));
> +	kthread_stop(dev->worker);
>   }
>
>   static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> @@ -1113,16 +1185,3 @@ void vhost_disable_notify(struct vhost_v
>   		vq_err(vq, "Failed to enable notification at %p: %d\n",
>   		&vq->used->flags, r);
>   }
> -
> -int vhost_init(void)
> -{
> -	vhost_workqueue = create_singlethread_workqueue("vhost");
> -	if (!vhost_workqueue)
> -		return -ENOMEM;
> -	return 0;
> -}
> -
> -void vhost_cleanup(void)
> -{
> -	destroy_workqueue(vhost_workqueue);
> -}
> Index: work/drivers/vhost/vhost.h
> ===================================================================
> --- work.orig/drivers/vhost/vhost.h
> +++ work/drivers/vhost/vhost.h
> @@ -5,13 +5,13 @@
>   #include<linux/vhost.h>
>   #include<linux/mm.h>
>   #include<linux/mutex.h>
> -#include<linux/workqueue.h>
>   #include<linux/poll.h>
>   #include<linux/file.h>
>   #include<linux/skbuff.h>
>   #include<linux/uio.h>
>   #include<linux/virtio_config.h>
>   #include<linux/virtio_ring.h>
> +#include<asm/atomic.h>
>
>   struct vhost_device;
>
> @@ -20,19 +20,31 @@ enum {
>   	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
>   };
>
> +struct vhost_work;
> +typedef void (*vhost_work_fn_t)(struct vhost_work *work);
> +
> +struct vhost_work {
> +	struct list_head	  node;
> +	vhost_work_fn_t		  fn;
> +	wait_queue_head_t	  done;
> +	atomic_t		  flushing;
> +	int			  queue_seq;
> +	int			  done_seq;
> +};
> +
>   /* Poll a file (eventfd or socket) */
>   /* Note: there's nothing vhost specific about this structure. */
>   struct vhost_poll {
>   	poll_table                table;
>   	wait_queue_head_t        *wqh;
>   	wait_queue_t              wait;
> -	/* struct which will handle all actual work. */
> -	struct work_struct        work;
> +	struct vhost_work	  work;
>   	unsigned long		  mask;
> +	struct vhost_dev	 *dev;
>   };
>
> -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask);
> +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
> +		     unsigned long mask, struct vhost_dev *dev);
>   void vhost_poll_start(struct vhost_poll *poll, struct file *file);
>   void vhost_poll_stop(struct vhost_poll *poll);
>   void vhost_poll_flush(struct vhost_poll *poll);
> @@ -63,7 +75,7 @@ struct vhost_virtqueue {
>   	struct vhost_poll poll;
>
>   	/* The routine to call when the Guest pings us, or timeout. */
> -	work_func_t handle_kick;
> +	vhost_work_fn_t handle_kick;
>
>   	/* Last available index we saw. */
>   	u16 last_avail_idx;
> @@ -86,11 +98,11 @@ struct vhost_virtqueue {
>   	struct iovec hdr[VHOST_NET_MAX_SG];
>   	size_t hdr_size;
>   	/* We use a kind of RCU to access private pointer.
> -	 * All readers access it from workqueue, which makes it possible to
> -	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
> +	 * All readers access it from worker, which makes it possible to
> +	 * flush the vhost_work instead of synchronize_rcu. Therefore readers do
>   	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
> -	 * work item execution acts instead of rcu_read_lock() and the end of
> -	 * work item execution acts instead of rcu_read_lock().
> +	 * vhost_work execution acts instead of rcu_read_lock() and the end of
> +	 * vhost_work execution acts instead of rcu_read_lock().
>   	 * Writers use virtqueue mutex. */
>   	void *private_data;
>   	/* Log write descriptors */
> @@ -110,6 +122,9 @@ struct vhost_dev {
>   	int nvqs;
>   	struct file *log_file;
>   	struct eventfd_ctx *log_ctx;
> +	spinlock_t work_lock;
> +	struct list_head work_list;
> +	struct task_struct *worker;
>   };
>
>   long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> @@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_vi
>   int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>   		    unsigned int log_num, u64 len);
>
> -int vhost_init(void);
> -void vhost_cleanup(void);
> -
>   #define vq_err(vq, fmt, ...) do {                                  \
>   		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>   		if ((vq)->error_ctx)                               \
>    



^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-05-30 20:25                       ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers Tejun Heo
  2010-05-31  1:11                         ` Li Zefan
@ 2010-06-24  8:11                         ` Michael S. Tsirkin
  2010-06-24 22:45                           ` Sridhar Samudrala
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-06-24  8:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Sun, May 30, 2010 at 10:25:01PM +0200, Tejun Heo wrote:
> Apply the cpumask and cgroup of the initializing task to the created
> vhost poller.
> 
> Based on Sridhar Samudrala's patch.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>


I wanted to apply this, but modpost fails:
ERROR: "sched_setaffinity" [drivers/vhost/vhost_net.ko] undefined!
ERROR: "sched_getaffinity" [drivers/vhost/vhost_net.ko] undefined!

Did you try building as a module?

> ---
>  drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
>  1 file changed, 31 insertions(+), 5 deletions(-)
> 
> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c
> @@ -23,6 +23,7 @@
>  #include <linux/highmem.h>
>  #include <linux/slab.h>
>  #include <linux/kthread.h>
> +#include <linux/cgroup.h>
> 
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -176,12 +177,30 @@ repeat:
>  long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
> -	struct task_struct *poller;
> -	int i;
> +	struct task_struct *poller = NULL;
> +	cpumask_var_t mask;
> +	int i, ret = -ENOMEM;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto out;
> 
>  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> -	if (IS_ERR(poller))
> -		return PTR_ERR(poller);
> +	if (IS_ERR(poller)) {
> +		ret = PTR_ERR(poller);
> +		goto out;
> +	}
> +
> +	ret = sched_getaffinity(current->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = sched_setaffinity(poller->pid, mask);
> +	if (ret)
> +		goto out;
> +
> +	ret = cgroup_attach_task_current_cg(poller);
> +	if (ret)
> +		goto out;
> 
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
> @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
> -	return 0;
> +
> +	wake_up_process(poller);	/* avoid contributing to loadavg */
> +	ret = 0;
> +out:
> +	if (ret)
> +		kthread_stop(poller);
> +	free_cpumask_var(mask);
> +	return ret;
>  }
> 
>  /* Caller should have device mutex */

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers
  2010-06-24  8:11                         ` [PATCH " Michael S. Tsirkin
@ 2010-06-24 22:45                           ` Sridhar Samudrala
  2010-06-25 10:10                             ` [PATCH] sched: export sched_set/getaffinity (was Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers) Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-06-24 22:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Tejun Heo, Oleg Nesterov, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner, Ingo Molnar,
	Andi Kleen

On Thu, 2010-06-24 at 11:11 +0300, Michael S. Tsirkin wrote:
> On Sun, May 30, 2010 at 10:25:01PM +0200, Tejun Heo wrote:
> > Apply the cpumask and cgroup of the initializing task to the created
> > vhost poller.
> > 
> > Based on Sridhar Samudrala's patch.
> > 
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> 
> 
> I wanted to apply this, but modpost fails:
> ERROR: "sched_setaffinity" [drivers/vhost/vhost_net.ko] undefined!
> ERROR: "sched_getaffinity" [drivers/vhost/vhost_net.ko] undefined!
> 
> Did you try building as a module?

In my original implementation, I had these calls in workqueue.c.
Now that these are moved to vhost.c which can be built as a module,
these symbols need to be exported.
The following patch fixes the build issue with vhost as a module.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

diff --git a/kernel/sched.c b/kernel/sched.c
index 3c2a54f..15a0c6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4837,6 +4837,7 @@ out_put_task:
 	put_online_cpus();
 	return retval;
 }
+EXPORT_SYMBOL_GPL(sched_setaffinity);
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr,
unsigned len,
 			     struct cpumask *new_mask)
@@ -4900,6 +4901,7 @@ out_unlock:
 
 	return retval;
 }
+EXPORT_SYMBOL_GPL(sched_getaffinity);
 
 /**
  * sys_sched_getaffinity - get the cpu affinity of a process


> > ---
> >  drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
> >  1 file changed, 31 insertions(+), 5 deletions(-)
> > 
> > Index: work/drivers/vhost/vhost.c
> > ===================================================================
> > --- work.orig/drivers/vhost/vhost.c
> > +++ work/drivers/vhost/vhost.c
> > @@ -23,6 +23,7 @@
> >  #include <linux/highmem.h>
> >  #include <linux/slab.h>
> >  #include <linux/kthread.h>
> > +#include <linux/cgroup.h>
> > 
> >  #include <linux/net.h>
> >  #include <linux/if_packet.h>
> > @@ -176,12 +177,30 @@ repeat:
> >  long vhost_dev_init(struct vhost_dev *dev,
> >  		    struct vhost_virtqueue *vqs, int nvqs)
> >  {
> > -	struct task_struct *poller;
> > -	int i;
> > +	struct task_struct *poller = NULL;
> > +	cpumask_var_t mask;
> > +	int i, ret = -ENOMEM;
> > +
> > +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> > +		goto out;
> > 
> >  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> > -	if (IS_ERR(poller))
> > -		return PTR_ERR(poller);
> > +	if (IS_ERR(poller)) {
> > +		ret = PTR_ERR(poller);
> > +		goto out;
> > +	}
> > +
> > +	ret = sched_getaffinity(current->pid, mask);
> > +	if (ret)
> > +		goto out;
> > +
> > +	ret = sched_setaffinity(poller->pid, mask);
> > +	if (ret)
> > +		goto out;
> > +
> > +	ret = cgroup_attach_task_current_cg(poller);
> > +	if (ret)
> > +		goto out;
> > 
> >  	dev->vqs = vqs;
> >  	dev->nvqs = nvqs;
> > @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
> >  			vhost_poll_init(&dev->vqs[i].poll,
> >  					dev->vqs[i].handle_kick, POLLIN, dev);
> >  	}
> > -	return 0;
> > +
> > +	wake_up_process(poller);	/* avoid contributing to loadavg */
> > +	ret = 0;
> > +out:
> > +	if (ret)
> > +		kthread_stop(poller);
> > +	free_cpumask_var(mask);
> > +	return ret;
> >  }
> > 
> >  /* Caller should have device mutex */
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply related	[flat|nested] 115+ messages in thread

* [PATCH] sched: export sched_set/getaffinity (was Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers)
  2010-06-24 22:45                           ` Sridhar Samudrala
@ 2010-06-25 10:10                             ` Michael S. Tsirkin
  2010-07-01 11:07                               ` [PATCH repost] sched: export sched_set/getaffinity to modules Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-06-25 10:10 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Tejun Heo, Oleg Nesterov, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner, Ingo Molnar,
	Andi Kleen

On Thu, Jun 24, 2010 at 03:45:51PM -0700, Sridhar Samudrala wrote:
> On Thu, 2010-06-24 at 11:11 +0300, Michael S. Tsirkin wrote:
> > On Sun, May 30, 2010 at 10:25:01PM +0200, Tejun Heo wrote:
> > > Apply the cpumask and cgroup of the initializing task to the created
> > > vhost poller.
> > > 
> > > Based on Sridhar Samudrala's patch.
> > > 
> > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> > 
> > 
> > I wanted to apply this, but modpost fails:
> > ERROR: "sched_setaffinity" [drivers/vhost/vhost_net.ko] undefined!
> > ERROR: "sched_getaffinity" [drivers/vhost/vhost_net.ko] undefined!
> > 
> > Did you try building as a module?
> 
> In my original implementation, i had these calls in workqueue.c.
> Now that these are moved to vhost.c which can be built as a module,
> these symbols need to be exported.
> The following patch fixes the build issue with vhost as a module.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Works for me. To simplify dependencies, I'd like to queue this
together with the vhost patches through net-next.
Ack to this?

> diff --git a/kernel/sched.c b/kernel/sched.c
> index 3c2a54f..15a0c6f 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4837,6 +4837,7 @@ out_put_task:
>  	put_online_cpus();
>  	return retval;
>  }
> +EXPORT_SYMBOL_GPL(sched_setaffinity);
>  
>  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr,
> unsigned len,
>  			     struct cpumask *new_mask)
> @@ -4900,6 +4901,7 @@ out_unlock:
>  
>  	return retval;
>  }
> +EXPORT_SYMBOL_GPL(sched_getaffinity);
>  
>  /**
>   * sys_sched_getaffinity - get the cpu affinity of a process
> 
> 
> > > ---
> > >  drivers/vhost/vhost.c |   36 +++++++++++++++++++++++++++++++-----
> > >  1 file changed, 31 insertions(+), 5 deletions(-)
> > > 
> > > Index: work/drivers/vhost/vhost.c
> > > ===================================================================
> > > --- work.orig/drivers/vhost/vhost.c
> > > +++ work/drivers/vhost/vhost.c
> > > @@ -23,6 +23,7 @@
> > >  #include <linux/highmem.h>
> > >  #include <linux/slab.h>
> > >  #include <linux/kthread.h>
> > > +#include <linux/cgroup.h>
> > > 
> > >  #include <linux/net.h>
> > >  #include <linux/if_packet.h>
> > > @@ -176,12 +177,30 @@ repeat:
> > >  long vhost_dev_init(struct vhost_dev *dev,
> > >  		    struct vhost_virtqueue *vqs, int nvqs)
> > >  {
> > > -	struct task_struct *poller;
> > > -	int i;
> > > +	struct task_struct *poller = NULL;
> > > +	cpumask_var_t mask;
> > > +	int i, ret = -ENOMEM;
> > > +
> > > +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> > > +		goto out;
> > > 
> > >  	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> > > -	if (IS_ERR(poller))
> > > -		return PTR_ERR(poller);
> > > +	if (IS_ERR(poller)) {
> > > +		ret = PTR_ERR(poller);
> > > +		goto out;
> > > +	}
> > > +
> > > +	ret = sched_getaffinity(current->pid, mask);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	ret = sched_setaffinity(poller->pid, mask);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	ret = cgroup_attach_task_current_cg(poller);
> > > +	if (ret)
> > > +		goto out;
> > > 
> > >  	dev->vqs = vqs;
> > >  	dev->nvqs = nvqs;
> > > @@ -202,7 +221,14 @@ long vhost_dev_init(struct vhost_dev *de
> > >  			vhost_poll_init(&dev->vqs[i].poll,
> > >  					dev->vqs[i].handle_kick, POLLIN, dev);
> > >  	}
> > > -	return 0;
> > > +
> > > +	wake_up_process(poller);	/* avoid contributing to loadavg */
> > > +	ret = 0;
> > > +out:
> > > +	if (ret)
> > > +		kthread_stop(poller);
> > > +	free_cpumask_var(mask);
> > > +	return ret;
> > >  }
> > > 
> > >  /* Caller should have device mutex */
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 115+ messages in thread

* [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-06-25 10:10                             ` [PATCH] sched: export sched_set/getaffinity (was Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers) Michael S. Tsirkin
@ 2010-07-01 11:07                               ` Michael S. Tsirkin
  2010-07-01 11:19                                 ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 11:07 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Andi Kleen

Author: Sridhar Samudrala <sri@us.ibm.com>

sched: export sched_set/getaffinity to modules

vhost-net driver wants to copy the affinity from the
owner thread to thread it creates. Export
sched_set/get affinity to modules to make this possible
when vhost is built as a module.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

I'm not sure the previous time made it clear what exactly is the
proposed change, so reposting.  Ingo, Peter, could you ack merging the
following through the net-next tree please?

diff --git a/kernel/sched.c b/kernel/sched.c
index d484081..3759391 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4744,6 +4744,7 @@ out_put_task:
 	put_online_cpus();
 	return retval;
 }
+EXPORT_SYMBOL_GPL(sched_setaffinity);
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 			     struct cpumask *new_mask)
@@ -4807,6 +4808,7 @@ out_unlock:
 
 	return retval;
 }
+EXPORT_SYMBOL_GPL(sched_getaffinity);
 
 /**
  * sys_sched_getaffinity - get the cpu affinity of a process

^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 11:07                               ` [PATCH repost] sched: export sched_set/getaffinity to modules Michael S. Tsirkin
@ 2010-07-01 11:19                                 ` Peter Zijlstra
  2010-07-01 11:43                                   ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 11:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 14:07 +0300, Michael S. Tsirkin wrote:
> Author: Sridhar Samudrala <sri@us.ibm.com>
> 
> sched: export sched_set/getaffinity to modules
> 
> vhost-net driver wants to copy the affinity from the
> owner thread to thread it creates. Export
> sched_set/get affinity to modules to make this possible
> when vhost is built as a module.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> ---
> 
> I'm not sure the previous time made it clear what exactly is the
> proposed change, so reposting.  Info, Peter, could you ack merging the
> following through the net-next tree please?
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index d484081..3759391 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4744,6 +4744,7 @@ out_put_task:
>  	put_online_cpus();
>  	return retval;
>  }
> +EXPORT_SYMBOL_GPL(sched_setaffinity);
>  
>  static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
>  			     struct cpumask *new_mask)
> @@ -4807,6 +4808,7 @@ out_unlock:
>  
>  	return retval;
>  }
> +EXPORT_SYMBOL_GPL(sched_getaffinity);
>  
>  /**
>   * sys_sched_getaffinity - get the cpu affinity of a process

Urgh,.. so why again is that a good idea?

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 11:19                                 ` Peter Zijlstra
@ 2010-07-01 11:43                                   ` Peter Zijlstra
  2010-07-01 11:55                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 11:43 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 13:19 +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 14:07 +0300, Michael S. Tsirkin wrote:
> > Author: Sridhar Samudrala <sri@us.ibm.com>
> > 
> > sched: export sched_set/getaffinity to modules
> > 
> > vhost-net driver wants to copy the affinity from the
> > owner thread to thread it creates. Export
> > sched_set/get affinity to modules to make this possible
> > when vhost is built as a module.

> Urgh,.. so why again is that a good idea?

In particular:
 - who sets the affinity of the task? 
 - why can't it set the kernel thread's affinity too?
 - what happens if someone changes the tasks' affinity?

So no, I don't think this is a sensible thing to do at all.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 11:43                                   ` Peter Zijlstra
@ 2010-07-01 11:55                                     ` Michael S. Tsirkin
  2010-07-01 12:23                                       ` Michael S. Tsirkin
  2010-07-01 12:32                                       ` Peter Zijlstra
  0 siblings, 2 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 11:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 01:43:23PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 13:19 +0200, Peter Zijlstra wrote:
> > On Thu, 2010-07-01 at 14:07 +0300, Michael S. Tsirkin wrote:
> > > Author: Sridhar Samudrala <sri@us.ibm.com>
> > > 
> > > sched: export sched_set/getaffinity to modules
> > > 
> > > vhost-net driver wants to copy the affinity from the
> > > owner thread to thread it creates. Export
> > > sched_set/get affinity to modules to make this possible
> > > when vhost is built as a module.
> 
> > Urgh,.. so why again is that a good idea?
> 
> In particular:
>  - who sets the affinity of the task? 

management tools do this when they start qemu.

>  - why can't it set the kernel thread's affinity too?

It can. However: the threads are started internally by the driver
when qemu does an ioctl.  What we want to do is give it a sensible
default affinity. management tool can later tweak it if it wants to.

>  - what happens if someone changes the tasks' affinity?

We would normally create a cgroup including all internal
tasks, making it easy to find and change affinity for
them all if necessary.

> So no, I don't think this is a sensible thing to do at all.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 11:55                                     ` Michael S. Tsirkin
@ 2010-07-01 12:23                                       ` Michael S. Tsirkin
  2010-07-01 12:34                                         ` Peter Zijlstra
  2010-07-01 12:32                                       ` Peter Zijlstra
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 12:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 02:55:07PM +0300, Michael S. Tsirkin wrote:
> On Thu, Jul 01, 2010 at 01:43:23PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-07-01 at 13:19 +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-07-01 at 14:07 +0300, Michael S. Tsirkin wrote:
> > > > Author: Sridhar Samudrala <sri@us.ibm.com>
> > > > 
> > > > sched: export sched_set/getaffinity to modules
> > > > 
> > > > vhost-net driver wants to copy the affinity from the
> > > > owner thread to thread it creates. Export
> > > > sched_set/get affinity to modules to make this possible
> > > > when vhost is built as a module.
> > 
> > > Urgh,.. so why again is that a good idea?
> > 
> > In particular:
> >  - who sets the affinity of the task? 
> 
> management tools do this when they start qemu.
> 
> >  - why can't it set the kernel thread's affinity too?
> 
> It can. However: the threads are started internally by the driver
> when qemu does an ioctl.  What we want to do is give it a sensible
> default affinity. management tool can later tweak it if it wants to.
> 
> >  - what happens if someone changes the tasks' affinity?
> 
> We would normally create a cgroup including all internal
> tasks, making it easy to find and change affinity for
> them all if necessary.
> 
> > So no, I don't think this is a sensible thing to do at all.

The patch using this is here:
http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html

It simply copies the affinity from the parent when thread is created.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 11:55                                     ` Michael S. Tsirkin
  2010-07-01 12:23                                       ` Michael S. Tsirkin
@ 2010-07-01 12:32                                       ` Peter Zijlstra
  2010-07-01 12:50                                         ` Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 12:32 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 14:55 +0300, Michael S. Tsirkin wrote:

> >  - why can't it set the kernel thread's affinity too?
> 
> It can. However: the threads are started internally by the driver
> when qemu does an ioctl.  What we want to do is give it a sensible
> default affinity. management tool can later tweak it if it wants to.

So have that ioctl return the tid of that new fancy thread and then set
its affinity, stuff it in cgroup, whatever you fancy.

> >  - what happens if someone changes the tasks' affinity?
> 
> We would normally create a cgroup including all internal
> tasks, making it easy to find and change affinity for
> them all if necessary. 

And to stuff them in a cgroup you also need the tid, at which point it
might as well set the affinity from userspace, right?

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 12:23                                       ` Michael S. Tsirkin
@ 2010-07-01 12:34                                         ` Peter Zijlstra
  2010-07-01 12:46                                           ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 12:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 15:23 +0300, Michael S. Tsirkin wrote:
> 
> The patch using this is here:
> http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html
> 
> It simply copies the affinity from the parent when thread is created.

Sounds like policy, not something the kernel should do..

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 12:34                                         ` Peter Zijlstra
@ 2010-07-01 12:46                                           ` Peter Zijlstra
  2010-07-01 13:08                                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 12:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 14:34 +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 15:23 +0300, Michael S. Tsirkin wrote:
> > 
> > The patch using this is here:
> > http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html
> > 
> > It simply copies the affinity from the parent when thread is created.
> 
> Sounds like policy, not something the kernel should do..

The alternative would be using clone() instead of thread_create() and
inherit everything from the creating task. Inheriting from kthreadd and
then undoing some aspects just sounds like daft policy that really ought
to be in userspace.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 12:32                                       ` Peter Zijlstra
@ 2010-07-01 12:50                                         ` Michael S. Tsirkin
  2010-07-01 13:07                                           ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 12:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 02:32:43PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 14:55 +0300, Michael S. Tsirkin wrote:
> 
> > >  - why can't it set the kernel thread's affinity too?
> > 
> > It can. However: the threads are started internally by the driver
> > when qemu does an ioctl.  What we want to do is give it a sensible
> > default affinity. management tool can later tweak it if it wants to.
> 
> So have that ioctl return the tid of that new fancy thread and then set
> its affinity, stuff it in cgroup, whatever you fancy.
> 
> > >  - what happens if someone changes the tasks' affinity?
> > 
> > We would normally create a cgroup including all internal
> > tasks, making it easy to find and change affinity for
> > them all if necessary. 
> 
> And to stuff them in a cgroup you also need the tid, at which point it
> might as well set the affinity from userspace, right?

We also put it in a cgroup transparently. I think that it's actually
important to do it on thread creation: if we don't, malicious userspace
can create a large amount of work exceeding the cgroup limits.

And the same applies to the affinity, right? If the qemu process
is limited to a set of CPUs, isn't it important to make
the kernel thread that does work on our behalf limited to the same
set of CPUs?

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 12:50                                         ` Michael S. Tsirkin
@ 2010-07-01 13:07                                           ` Peter Zijlstra
  2010-07-01 13:22                                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 13:07 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 15:50 +0300, Michael S. Tsirkin wrote:
> On Thu, Jul 01, 2010 at 02:32:43PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-07-01 at 14:55 +0300, Michael S. Tsirkin wrote:
> > 
> > > >  - why can't it set the kernel thread's affinity too?
> > > 
> > > It can. However: the threads are started internally by the driver
> > > when qemu does an ioctl.  What we want to do is give it a sensible
> > > default affinity. management tool can later tweak it if it wants to.
> > 
> > So have that ioctl return the tid of that new fancy thread and then set
> > its affinity, stuff it in cgroup, whatever you fancy.
> > 
> > > >  - what happens if someone changes the tasks' affinity?
> > > 
> > > We would normally create a cgroup including all internal
> > > tasks, making it easy to find and change affinity for
> > > them all if necessary. 
> > 
> > And to stuff them in a cgroup you also need the tid, at which point it
> > might as well set the affinity from userspace, right?
> 
> We also put it in a cgroup transparently. I think that it's actually
> important to do it on thread creation: if we don't, malicious userspace
> can create large amount of work exceeding the cgroup limits.
> 
> And the same applies so the affinity, right? If the qemu process
> is limited to a set of CPUs, isn't it important to make
> the kernel thread that does work our behalf limited to the same
> set of CPUs?

I'm not sure we have anything like this, but I wouldn't think so; if a
driver creates a kthread and manages to inject tons of work, it's not
typically limited to whatever limits apply to the task that supplied the
work.

Take the encryption threads for example, those don't run in the context
of whoever provides the data to be encrypted (file,net whatever) and
thus the task responsible could consume heaps more resources than when
it would have to do the encryption itself.

That's how kthreads work.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 12:46                                           ` Peter Zijlstra
@ 2010-07-01 13:08                                             ` Michael S. Tsirkin
  2010-07-01 13:30                                               ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 13:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 02:46:35PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 14:34 +0200, Peter Zijlstra wrote:
> > On Thu, 2010-07-01 at 15:23 +0300, Michael S. Tsirkin wrote:
> > > 
> > > The patch using this is here:
> > > http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html
> > > 
> > > It simply copies the affinity from the parent when thread is created.
> > 
> > Sounds like policy, not something the kernel should do..
> 
> The alternative would be using clone() instead of thread_create() and
> inherit everything from the creating task.
> Inheriting from kthreadd and then undoing some aspects just sounds
> like daft policy that really ought to be in userspace.

Yes, that's basically what this patchset is trying to do:
create a workqueue inheriting everything from the creating task.
Sridhar started with an API to do exactly this:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-05/msg07478.html

Then we switched to raw kthread to avoid stepping on cwq toes.
Maybe it makes sense to add kthread_clone (in addition to
kthread_create) that would do what you suggest?
If yes, any hints on an implementation?


-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:07                                           ` Peter Zijlstra
@ 2010-07-01 13:22                                             ` Michael S. Tsirkin
  0 siblings, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 13:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 03:07:26PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 15:50 +0300, Michael S. Tsirkin wrote:
> > On Thu, Jul 01, 2010 at 02:32:43PM +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-07-01 at 14:55 +0300, Michael S. Tsirkin wrote:
> > > 
> > > > >  - why can't it set the kernel thread's affinity too?
> > > > 
> > > > It can. However: the threads are started internally by the driver
> > > > when qemu does an ioctl.  What we want to do is give it a sensible
> > > > default affinity. management tool can later tweak it if it wants to.
> > > 
> > > So have that ioctl return the tid of that new fancy thread and then set
> > > its affinity, stuff it in cgroup, whatever you fancy.
> > > 
> > > > >  - what happens if someone changes the tasks' affinity?
> > > > 
> > > > We would normally create a cgroup including all internal
> > > > tasks, making it easy to find and change affinity for
> > > > them all if necessary. 
> > > 
> > > And to stuff them in a cgroup you also need the tid, at which point it
> > > might as well set the affinity from userspace, right?
> > 
> > We also put it in a cgroup transparently. I think that it's actually
> > important to do it on thread creation: if we don't, malicious userspace
> > can create large amount of work exceeding the cgroup limits.
> > 
> > And the same applies so the affinity, right? If the qemu process
> > is limited to a set of CPUs, isn't it important to make
> > the kernel thread that does work our behalf limited to the same
> > set of CPUs?
> 
> I'm not sure we have anything like this, but I wouldn't think so, if a
> driver creates a kthread and manages to inject tons of work its not
> typically limited to whatever limits apply to the task that supplied the
> work.
> 
> Take the encryption threads for example, those don't run in the context
> of whoever provides the data to be encrypted (file,net whatever) and
> thus the task responsible could consume heaps more resources than when
> it would have to do the encryption itself.
> 
> That's how kthreads work.

Right. And IMHO ideally all such work would run on the appropriate
CPU and be accounted for. It's just that with virt people seem to
run untrusted applications and expect the damage to be contained.
So we came up with a simple approach that seems to do the
job just right for us.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:08                                             ` Michael S. Tsirkin
@ 2010-07-01 13:30                                               ` Peter Zijlstra
  2010-07-01 13:39                                                 ` Michael S. Tsirkin
  2010-07-01 14:33                                                 ` Oleg Nesterov
  0 siblings, 2 replies; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 13:30 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 16:08 +0300, Michael S. Tsirkin wrote:
> On Thu, Jul 01, 2010 at 02:46:35PM +0200, Peter Zijlstra wrote:
> > On Thu, 2010-07-01 at 14:34 +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-07-01 at 15:23 +0300, Michael S. Tsirkin wrote:
> > > > 
> > > > The patch using this is here:
> > > > http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html
> > > > 
> > > > It simply copies the affinity from the parent when thread is created.
> > > 
> > > Sounds like policy, not something the kernel should do..
> > 
> > The alternative would be using clone() instead of thread_create() and
> > inherit everything from the creating task.
> > Inheriting from kthreadd and then undoing some aspects just sounds
> > like daft policy that really ought to be in userspace.
> 
> Yes, that's basically what this patchset is trying to do:
> create a workqueue inheriting everything from the creating task.
> Sridhar started with an API to do exactly this:
> http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-05/msg07478.html
> 
> Then we switched to raw kthread to avoid stepping on cwq toes.
> Maybe it makes sense to add kthread_clone (in addition to
> kthread_create) that would do what you suggest?
> If yes, any hints on an implementation?

I think that's called kernel_thread() see
kernel/kthread.c:create_kthread().

Doing the whole kthreadd dance and then copying bits and pieces back
sounds very fragile, so yeah, something like that should work.

The other issue to consider is the thread group status of these things,
I think it would be best if these threads were still considered part of
the process that spawned them so that they would die nicely when the
process gets whacked.

At which point one could wonder if the kthread interface makes any
sense, why not let userspace fork tasks and let them call into the
kernel to perform work...

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:30                                               ` Peter Zijlstra
@ 2010-07-01 13:39                                                 ` Michael S. Tsirkin
  2010-07-01 13:57                                                   ` Peter Zijlstra
  2010-07-01 14:27                                                   ` Tejun Heo
  2010-07-01 14:33                                                 ` Oleg Nesterov
  1 sibling, 2 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-01 13:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, Jul 01, 2010 at 03:30:24PM +0200, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 16:08 +0300, Michael S. Tsirkin wrote:
> > On Thu, Jul 01, 2010 at 02:46:35PM +0200, Peter Zijlstra wrote:
> > > On Thu, 2010-07-01 at 14:34 +0200, Peter Zijlstra wrote:
> > > > On Thu, 2010-07-01 at 15:23 +0300, Michael S. Tsirkin wrote:
> > > > > 
> > > > > The patch using this is here:
> > > > > http://www.mail-archive.com/kvm@vger.kernel.org/msg35411.html
> > > > > 
> > > > > It simply copies the affinity from the parent when thread is created.
> > > > 
> > > > Sounds like policy, not something the kernel should do..
> > > 
> > > The alternative would be using clone() instead of thread_create() and
> > > inherit everything from the creating task.
> > > Inheriting from kthreadd and then undoing some aspects just sounds
> > > like daft policy that really ought to be in userspace.
> > 
> > Yes, that's basically what this patchset is trying to do:
> > create a workqueue inheriting everything from the creating task.
> > Sridhar started with an API to do exactly this:
> > http://linux.derkeiler.com/Mailing-Lists/Kernel/2010-05/msg07478.html
> > 
> > Then we switched to raw kthread to avoid stepping on cwq toes.
> > Maybe it makes sense to add kthread_clone (in addition to
> > kthread_create) that would do what you suggest?
> > If yes, any hints on an implementation?
> 
> I think that's called kernel_thread() see
> kernel/kthread.c:create_kthread().
> 
> Doing the whole kthreadd dance and then copying bits and pieces back
> sounds very fragile, so yeah, something like that should work.


Thanks!
Sridhar, Tejun, have the time to look into this approach?

> The other issue to consider is the thread group status of these things,
> I think it would be best if these threads were still considered part of
> the process that spawned them so that they would die nicely when the
> process gets whacked.

The proposed patch kills the thread when the fd is closed,
so I think this already works without making it part of the process.

> At which point one could wonder if the kthread interface makes any
> sense, why not let userspace fork tasks and let them call into the
> kernel to perform work...

One thing I wanted to avoid is letting userspace know
just how many threads are there. We are using a single one
now, but we used to have threads per-cpu, and we might
switch to a thread per virtqueue in the future.
IMO all this should ideally be transparent to userspace.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:39                                                 ` Michael S. Tsirkin
@ 2010-07-01 13:57                                                   ` Peter Zijlstra
  2010-07-01 14:27                                                   ` Tejun Heo
  1 sibling, 0 replies; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 13:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Ingo Molnar, Sridhar Samudrala, Tejun Heo, Oleg Nesterov, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 16:39 +0300, Michael S. Tsirkin wrote:
> 
> The proposed patch kills the thread when the fd is closed,
> so I think this already works without making it part of the process.
> 
OK, fd bound resources are fine.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:39                                                 ` Michael S. Tsirkin
  2010-07-01 13:57                                                   ` Peter Zijlstra
@ 2010-07-01 14:27                                                   ` Tejun Heo
  2010-07-01 14:46                                                     ` Oleg Nesterov
  1 sibling, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-01 14:27 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Peter Zijlstra, Ingo Molnar, Sridhar Samudrala, Oleg Nesterov,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

Hello,

On 07/01/2010 03:39 PM, Michael S. Tsirkin wrote:
>> I think that's called kernel_thread() see
>> kernel/kthread.c:create_kthread().
>>
>> Doing the whole kthreadd dance and then copying bits and pieces back
>> sounds very fragile, so yeah, something like that should work.
> 
> Thanks!
> Sridhar, Tejun, have the time to look into this approach?

All that's necessary is shortcutting indirection through kthreadd.
ie. An exported function which looks like the following,

 struct kthread_clone_or_whatever(int (*threadfn).....)
 {
	struct kthread_create_info create;
	int pid;

	INIT create;

	pid = kernel_thread(kthread, &create, CLONE_FS...);
	if (pid < 0)
		return ERROR;
	wait_for_completion(&create.done);

	if (!IS_ERR(create.result))
		SET NAME;
	return create.result;
 }

It might be a good idea to make the function take extra clone flags
but anyways once created cloned task can be treated the same way as
other kthreads, so nothing else needs to be changed.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 13:30                                               ` Peter Zijlstra
  2010-07-01 13:39                                                 ` Michael S. Tsirkin
@ 2010-07-01 14:33                                                 ` Oleg Nesterov
  1 sibling, 0 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-01 14:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Michael S. Tsirkin, Ingo Molnar, Sridhar Samudrala, Tejun Heo,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 07/01, Peter Zijlstra wrote:
>
> On Thu, 2010-07-01 at 16:08 +0300, Michael S. Tsirkin wrote:
> > Maybe it makes sense to add kthread_clone (in addition to
> > kthread_create) that would do what you suggest?
> > If yes, any hints on an implementation?
>
> I think that's called kernel_thread() see
> kernel/kthread.c:create_kthread().

Well, strictly speaking kernel_thread() doesn't create the kernel thread.
Unless the caller is the kernel thread. And daemonize() is deprecated.
kernel_thread() just forks the CLONE_VM + flags child.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 14:27                                                   ` Tejun Heo
@ 2010-07-01 14:46                                                     ` Oleg Nesterov
  2010-07-01 14:53                                                       ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-01 14:46 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Peter Zijlstra, Ingo Molnar,
	Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner, Andi Kleen

On 07/01, Tejun Heo wrote:
>
> All that's necessary is shortcutting indirection through kthreadd.
> ie. An exported function which looks like the following,
>
>  struct kthread_clone_or_whatever(int (*threadfn).....)
>  {
> 	struct kthread_create_info create;
> 	int pid;
>
> 	INIT create;
>
> 	pid = kernel_thread(kthread, &create, CLONE_FS...);
> 	if (pid < 0)
> 		return ERROR;
> 	wait_for_completion(&create.done);
>
> 	if (!IS_ERR(create.result))
> 		SET NAME;
> 	return create.result;
>  }
>
> It might be a good idea to make the function take extra clone flags
> but anyways once created cloned task can be treated the same way as
> other kthreads, so nothing else needs to be changed.

This makes kthread_stop() work. Otherwise the new thread is just
the CLONE_VM child of the caller, and the caller is the user-mode
task doing ioctl() ?

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 14:46                                                     ` Oleg Nesterov
@ 2010-07-01 14:53                                                       ` Tejun Heo
  2010-07-01 14:55                                                         ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-01 14:53 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Peter Zijlstra, Ingo Molnar,
	Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner, Andi Kleen

Hello,

On 07/01/2010 04:46 PM, Oleg Nesterov wrote:
>> It might be a good idea to make the function take extra clone flags
>> but anyways once created cloned task can be treated the same way as
>> other kthreads, so nothing else needs to be changed.
> 
> This makes kthread_stop() work. Otherwise the new thread is just
> the CLONE_VM child of the caller, and the caller is the user-mode
> task doing ioctl() ?

Hmmm, indeed.  It makes the attribute inheritance work but circumvents
the whole reason there is kthreadd.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 14:53                                                       ` Tejun Heo
@ 2010-07-01 14:55                                                         ` Peter Zijlstra
  2010-07-02 18:01                                                           ` Sridhar Samudrala
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-01 14:55 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Michael S. Tsirkin, Ingo Molnar,
	Sridhar Samudrala, netdev, lkml, kvm, Andrew Morton,
	Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner, Andi Kleen

On Thu, 2010-07-01 at 16:53 +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/01/2010 04:46 PM, Oleg Nesterov wrote:
> >> It might be a good idea to make the function take extra clone flags
> >> but anyways once created cloned task can be treated the same way as
> >> other kthreads, so nothing else needs to be changed.
> > 
> > This makes kthread_stop() work. Otherwise the new thread is just
> > the CLONE_VM child of the caller, and the caller is the user-mode
> > task doing ioctl() ?
> 
> Hmmm, indeed.  It makes the attribute inheritance work but circumvents
> the whole reason there is kthreadd.

I thought the whole reason there was threadd was to avoid the
inheritance? So avoiding the avoiding of inheritance seems like the goal
here, no?

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-01 14:55                                                         ` Peter Zijlstra
@ 2010-07-02 18:01                                                           ` Sridhar Samudrala
  2010-07-02 18:11                                                             ` Peter Zijlstra
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-07-02 18:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Oleg Nesterov, Michael S. Tsirkin, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 7/1/2010 7:55 AM, Peter Zijlstra wrote:
> On Thu, 2010-07-01 at 16:53 +0200, Tejun Heo wrote:
>    
>> Hello,
>>
>> On 07/01/2010 04:46 PM, Oleg Nesterov wrote:
>>      
>>>> It might be a good idea to make the function take extra clone flags
>>>> but anyways once created cloned task can be treated the same way as
>>>> other kthreads, so nothing else needs to be changed.
>>>>          
>>> This makes kthread_stop() work. Otherwise the new thread is just
>>> the CLONE_VM child of the caller, and the caller is the user-mode
>>> task doing ioctl() ?
>>>        
>> Hmmm, indeed.  It makes the attribute inheritance work but circumvents
>> the whole reason there is kthreadd.
>>      
> I thought the whole reason there was threadd was to avoid the
> inheritance? So avoiding the avoiding of inheritance seems like the goal
> here, no?
>    
I think so. Does  it (Tejun's kthread_clone() patch) also  inherit the 
cgroup of the caller? or do we still need the explicit
call to attach the thread to the current task's cgroup?

I am on vacation next week and cannot look into this until Jul 12. Hope 
this will be resolved by then.
If not, i will look into it after i am back.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-02 18:01                                                           ` Sridhar Samudrala
@ 2010-07-02 18:11                                                             ` Peter Zijlstra
  2010-07-02 21:06                                                               ` Oleg Nesterov
  0 siblings, 1 reply; 115+ messages in thread
From: Peter Zijlstra @ 2010-07-02 18:11 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Tejun Heo, Oleg Nesterov, Michael S. Tsirkin, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
>  
>  Does  it (Tejun's kthread_clone() patch) also  inherit the 
> cgroup of the caller?

Of course, its a simple do_fork() which inherits everything just as you
would expect from a similar sys_clone()/sys_fork() call.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-02 18:11                                                             ` Peter Zijlstra
@ 2010-07-02 21:06                                                               ` Oleg Nesterov
  2010-07-04  9:00                                                                 ` Michael S. Tsirkin
  2010-07-26 17:12                                                                 ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-02 21:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Sridhar Samudrala, Tejun Heo, Michael S. Tsirkin, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 07/02, Peter Zijlstra wrote:
>
> On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> >
> >  Does  it (Tejun's kthread_clone() patch) also  inherit the
> > cgroup of the caller?
>
> Of course, its a simple do_fork() which inherits everything just as you
> would expect from a similar sys_clone()/sys_fork() call.

Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
from ioctl(), right?

Then the new thread becomes the natural child of the caller, and it shares
->mm with the parent. And files, dup_fd() without CLONE_FS.

Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
just because the parent gets SIGQUIT or another coredumpable signal.
Or the new thread can receive SIGSTOP via ^Z.

Perhaps this is OK, I do not know. Just to remind that kernel_thread()
is merely clone(CLONE_VM).

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-02 21:06                                                               ` Oleg Nesterov
@ 2010-07-04  9:00                                                                 ` Michael S. Tsirkin
  2010-07-13  6:59                                                                   ` Sridhar Samudrala
  2010-07-26 17:12                                                                 ` Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-04  9:00 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Sridhar Samudrala, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
> On 07/02, Peter Zijlstra wrote:
> >
> > On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> > >
> > >  Does  it (Tejun's kthread_clone() patch) also  inherit the
> > > cgroup of the caller?
> >
> > Of course, its a simple do_fork() which inherits everything just as you
> > would expect from a similar sys_clone()/sys_fork() call.
> 
> Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
> from ioctl(), right?
> 
> Then the new thread becomes the natural child of the caller, and it shares
> ->mm with the parent. And files, dup_fd() without CLONE_FS.
> 
> Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
> TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
> just because the parent gets SIGQUIT or abother coredumpable signal.
> Or the new thread can recieve SIGSTOP via ^Z.
> 
> Perhaps this is OK, I do not know. Just to remind that kernel_thread()
> is merely clone(CLONE_VM).
> 
> Oleg.


Right. Doing this might break things like flush.  The signal and exit
behaviour needs to be examined carefully. I am also unsure whether
using such threads might be more expensive than inheriting kthreadd.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-04  9:00                                                                 ` Michael S. Tsirkin
@ 2010-07-13  6:59                                                                   ` Sridhar Samudrala
  2010-07-13 11:09                                                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-07-13  6:59 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Peter Zijlstra, Tejun Heo, Ingo Molnar, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 7/4/2010 2:00 AM, Michael S. Tsirkin wrote:
> On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
>    
>> On 07/02, Peter Zijlstra wrote:
>>      
>>> On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
>>>        
>>>>   Does  it (Tejun's kthread_clone() patch) also  inherit the
>>>> cgroup of the caller?
>>>>          
>>> Of course, its a simple do_fork() which inherits everything just as you
>>> would expect from a similar sys_clone()/sys_fork() call.
>>>        
>> Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
>> from ioctl(), right?
>>
>> Then the new thread becomes the natural child of the caller, and it shares
>> ->mm with the parent. And files, dup_fd() without CLONE_FS.
>>
>> Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
>> TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
>> just because the parent gets SIGQUIT or abother coredumpable signal.
>> Or the new thread can recieve SIGSTOP via ^Z.
>>
>> Perhaps this is OK, I do not know. Just to remind that kernel_thread()
>> is merely clone(CLONE_VM).
>>
>> Oleg.
>>      
>
> Right. Doing this might break things like flush.  The signal and exit
> behaviour needs to be examined carefully. I am also unsure whether
> using such threads might be more expensive than inheriting kthreadd.
>
>    
Should we just leave it to the userspace to set the cgroup/cpumask after 
qemu starts the guest and
the vhost threads?

Thanks
Sridhar



^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-13  6:59                                                                   ` Sridhar Samudrala
@ 2010-07-13 11:09                                                                     ` Michael S. Tsirkin
  2010-07-14 23:26                                                                       ` Sridhar Samudrala
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-13 11:09 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Oleg Nesterov, Peter Zijlstra, Tejun Heo, Ingo Molnar, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, Jul 12, 2010 at 11:59:08PM -0700, Sridhar Samudrala wrote:
> On 7/4/2010 2:00 AM, Michael S. Tsirkin wrote:
> >On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
> >>On 07/02, Peter Zijlstra wrote:
> >>>On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> >>>>  Does  it (Tejun's kthread_clone() patch) also  inherit the
> >>>>cgroup of the caller?
> >>>Of course, its a simple do_fork() which inherits everything just as you
> >>>would expect from a similar sys_clone()/sys_fork() call.
> >>Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
> >>from ioctl(), right?
> >>
> >>Then the new thread becomes the natural child of the caller, and it shares
> >>->mm with the parent. And files, dup_fd() without CLONE_FS.
> >>
> >>Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
> >>TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
> >>just because the parent gets SIGQUIT or another coredumpable signal.
> >>Or the new thread can receive SIGSTOP via ^Z.
> >>
> >>Perhaps this is OK, I do not know. Just to remind that kernel_thread()
> >>is merely clone(CLONE_VM).
> >>
> >>Oleg.
> >
> >Right. Doing this might break things like flush.  The signal and exit
> >behaviour needs to be examined carefully. I am also unsure whether
> >using such threads might be more expensive than inheriting kthreadd.
> >
> Should we just leave it to the userspace to set the cgroup/cpumask
> after qemu starts the guest and
> the vhost threads?
> 
> Thanks
> Sridhar

Yes but we can't trust userspace to do this. It's important
to do it on thread creation: if we don't, malicious userspace
can create large amount of work exceeding the cgroup limits.

And the same applies to the affinity: if the qemu process
is limited to a set of CPUs, it's important to make
the kernel thread that does work on our behalf limited to the same
set of CPUs.

This is not unique to vhost, it's just that virt scenarios are affected
by this more: people seem to run untrusted applications and expect the
damage to be contained.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-13 11:09                                                                     ` Michael S. Tsirkin
@ 2010-07-14 23:26                                                                       ` Sridhar Samudrala
  2010-07-15  0:05                                                                         ` Oleg Nesterov
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-07-14 23:26 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Peter Zijlstra, Tejun Heo, Ingo Molnar, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Tue, 2010-07-13 at 14:09 +0300, Michael S. Tsirkin wrote: 
> On Mon, Jul 12, 2010 at 11:59:08PM -0700, Sridhar Samudrala wrote:
> > On 7/4/2010 2:00 AM, Michael S. Tsirkin wrote:
> > >On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
> > >>On 07/02, Peter Zijlstra wrote:
> > >>>On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> > >>>>  Does  it (Tejun's kthread_clone() patch) also  inherit the
> > >>>>cgroup of the caller?
> > >>>Of course, its a simple do_fork() which inherits everything just as you
> > >>>would expect from a similar sys_clone()/sys_fork() call.
> > >>Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
> > >>from ioctl(), right?
> > >>
> > >>Then the new thread becomes the natural child of the caller, and it shares
> > >>->mm with the parent. And files, dup_fd() without CLONE_FS.
> > >>
> > >>Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
> > >>TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
> > >>just because the parent gets SIGQUIT or another coredumpable signal.
> > >>Or the new thread can receive SIGSTOP via ^Z.
> > >>
> > >>Perhaps this is OK, I do not know. Just to remind that kernel_thread()
> > >>is merely clone(CLONE_VM).
> > >>
> > >>Oleg.
> > >
> > >Right. Doing this might break things like flush.  The signal and exit
> > >behaviour needs to be examined carefully. I am also unsure whether
> > >using such threads might be more expensive than inheriting kthreadd.
> > >
> > Should we just leave it to the userspace to set the cgroup/cpumask
> > after qemu starts the guest and
> > the vhost threads?
> > 
> > Thanks
> > Sridhar
> 
> Yes but we can't trust userspace to do this. It's important
> to do it on thread creation: if we don't, malicious userspace
> can create large amount of work exceeding the cgroup limits.
> 
> And the same applies to the affinity: if the qemu process
> is limited to a set of CPUs, it's important to make
> the kernel thread that does work on our behalf limited to the same
> set of CPUs.
> 
> This is not unique to vhost, it's just that virt scenarios are affected
> by this more: people seem to run untrusted applications and expect the
> damage to be contained.

OK. So we want to create a thread that is a child of kthreadd, but inherits the cgroup/cpumask
from the caller. How about an exported kthread function kthread_create_in_current_cg() 
that does this?

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a1..e0616f0 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -9,6 +9,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   const char namefmt[], ...)
 	__attribute__((format(printf, 3, 4)));
 
+struct task_struct *kthread_create_in_current_cg(int (*threadfn)(void *data),
+						 void *data, char *name);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c7..ea4e737 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <trace/events/sched.h>
+#include <linux/cgroup.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
@@ -149,6 +150,42 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create);
 
+struct task_struct *kthread_create_in_current_cg(int (*threadfn)(void *data),
+						 void *data, char *name)
+{
+	struct task_struct *worker;
+	cpumask_var_t mask;
+	int ret = -ENOMEM;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		goto out_free_mask;
+
+	worker = kthread_create(threadfn, data, "%s-%d", name, current->pid);
+	if (IS_ERR(worker))
+		goto out_free_mask;
+
+	ret = sched_getaffinity(current->pid, mask);
+	if (ret)
+		goto out_stop_worker;
+
+	ret = sched_setaffinity(worker->pid, mask);
+	if (ret)
+		goto out_stop_worker;
+
+	ret = cgroup_attach_task_current_cg(worker);
+	if (ret)
+		goto out_stop_worker;
+
+	return worker;
+
+out_stop_worker:
+	kthread_stop(worker);
+out_free_mask:
+	free_cpumask_var(mask);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(kthread_create_in_current_cg);
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().


Thanks
Sridhar


^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-14 23:26                                                                       ` Sridhar Samudrala
@ 2010-07-15  0:05                                                                         ` Oleg Nesterov
  2010-07-15  5:29                                                                           ` Sridhar Samudrala
  0 siblings, 1 reply; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-15  0:05 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 07/14, Sridhar Samudrala wrote:
>
> OK. So we want to create a thread that is a child of kthreadd, but inherits the cgroup/cpumask
> from the caller. How about an exported kthread function kthread_create_in_current_cg()
> that does this?

Well. I must admit, this looks a bit strange to me ;)

Instead of exporting sched_xxxaffinity() we export the new function
which calls them. And I don't think this new helper is very useful
in general. May be I am wrong...

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-15  0:05                                                                         ` Oleg Nesterov
@ 2010-07-15  5:29                                                                           ` Sridhar Samudrala
  0 siblings, 0 replies; 115+ messages in thread
From: Sridhar Samudrala @ 2010-07-15  5:29 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 7/14/2010 5:05 PM, Oleg Nesterov wrote:
> On 07/14, Sridhar Samudrala wrote:
>    
>> OK. So we want to create a thread that is a child of kthreadd, but inherits the cgroup/cpumask
>> from the caller. How about an exported kthread function kthread_create_in_current_cg()
>> that does this?
>>      
> Well. I must admit, this looks a bit strange to me ;)
>
> Instead of exporting sched_xxxaffinity() we export the new function
> which calls them. And I don't think this new helper is very useful
> in general. May be I am wrong...
>    
If we agree on exporting sched_xxxaffinity() functions, we don't need 
this new kthread function and we
can do the same in vhost as the original patch did.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-06-02 18:40                                 ` [PATCH UPDATED " Tejun Heo
  2010-06-02 21:34                                   ` Sridhar Samudrala
@ 2010-07-22 15:58                                   ` Michael S. Tsirkin
  2010-07-22 21:21                                     ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-22 15:58 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Wed, Jun 02, 2010 at 08:40:00PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_work *,
> there's no visible change to vhost_poll_*() interface.
> 
> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
> 
> Partially based on Sridhar Samudrala's patch.
> 
> * Updated to use sub structure vhost_work instead of directly using
>   vhost_poll at Michael's suggestion.
> 
> * Added flusher wake_up() optimization at Michael's suggestion.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>

All the tricky barrier pairing made me uncomfortable.  So I came up with
this on top (untested): if we do all operations under the spinlock, we
can get by without barriers and atomics.  And since we need the lock for
list operations anyway, this should have no performance impact.

What do you think?


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0c6b533..7730a30 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -73,7 +73,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 	INIT_LIST_HEAD(&work->node);
 	work->fn = fn;
 	init_waitqueue_head(&work->done);
-	atomic_set(&work->flushing, 0);
+	work->flushing = 0;
 	work->queue_seq = work->done_seq = 0;
 }
 
@@ -99,13 +99,23 @@ void vhost_poll_stop(struct vhost_poll *poll)
 void vhost_poll_flush(struct vhost_poll *poll)
 {
 	struct vhost_work *work = &poll->work;
-	int seq = work->queue_seq;
+	unsigned seq, left;
+	int flushing;
 
-	atomic_inc(&work->flushing);
-	smp_mb__after_atomic_inc();	/* mb flush-b0 paired with worker-b1 */
-	wait_event(work->done, seq - work->done_seq <= 0);
-	atomic_dec(&work->flushing);
-	smp_mb__after_atomic_dec();	/* rmb flush-b1 paired with worker-b0 */
+	spin_lock_irq(&dev->work_lock);
+	seq = work->queue_seq;
+	work->flushing++;
+	spin_unlock_irq(&dev->work_lock);
+	wait_event(work->done, {
+		   spin_lock_irq(&dev->work_lock);
+		   left = work->done_seq - seq;
+		   spin_unlock_irq(&dev->work_lock);
+		   left < UINT_MAX / 2;
+	});
+	spin_lock_irq(&dev->work_lock);
+	flushing = --work->flushing;
+	spin_unlock_irq(&dev->work_lock);
+	BUG_ON(flushing < 0);
 }
 
 void vhost_poll_queue(struct vhost_poll *poll)
@@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 static int vhost_worker(void *data)
 {
 	struct vhost_dev *dev = data;
-	struct vhost_work *work;
+	struct vhost_work *work = NULL;
 
-repeat:
-	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
 
-	if (kthread_should_stop()) {
-		__set_current_state(TASK_RUNNING);
-		return 0;
-	}
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		}
 
-	work = NULL;
-	spin_lock_irq(&dev->work_lock);
-	if (!list_empty(&dev->work_list)) {
-		work = list_first_entry(&dev->work_list,
-					struct vhost_work, node);
-		list_del_init(&work->node);
-	}
-	spin_unlock_irq(&dev->work_lock);
+		spin_lock_irq(&dev->work_lock);
+		if (work) {
+			work->done_seq = work->queue_seq;
+			if (work->flushing)
+				wake_up_all(&work->done);
+		}
+		if (!list_empty(&dev->work_list)) {
+			work = list_first_entry(&dev->work_list,
+						struct vhost_work, node);
+			list_del_init(&work->node);
+		} else
+			work = NULL;
+		spin_unlock_irq(&dev->work_lock);
+
+		if (work) {
+			__set_current_state(TASK_RUNNING);
+			work->fn(work);
+		} else
+			schedule();
 
-	if (work) {
-		__set_current_state(TASK_RUNNING);
-		work->fn(work);
-		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
-		work->done_seq = work->queue_seq;
-		smp_mb();	/* mb worker-b1 paired with flush-b0 */
-		if (atomic_read(&work->flushing))
-			wake_up_all(&work->done);
-	} else
-		schedule();
-
-	goto repeat;
+	}
 }
 
 long vhost_dev_init(struct vhost_dev *dev,
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0e63091..3693327 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -27,9 +27,9 @@ struct vhost_work {
 	struct list_head	  node;
 	vhost_work_fn_t		  fn;
 	wait_queue_head_t	  done;
-	atomic_t		  flushing;
-	int			  queue_seq;
-	int			  done_seq;
+	int			  flushing;
+	unsigned		  queue_seq;
+	unsigned		  done_seq;
 };
 
 /* Poll a file (eventfd or socket) */

^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-22 15:58                                   ` Michael S. Tsirkin
@ 2010-07-22 21:21                                     ` Tejun Heo
  2010-07-24 19:14                                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-22 21:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> All the tricky barrier pairing made me uncomfortable.  So I came up with
> this on top (untested): if we do all operations under the spinlock, we
> can get by without barriers and atomics.  And since we need the lock for
> list operations anyway, this should have no performance impact.
> 
> What do you think?

I've created kthread_worker in wq#for-next tree and already converted
ivtv to use it.  Once this lands in mainline, I think converting vhost
to use it would be better choice.  kthread worker code uses basically
the same logic used in the vhost_workqueue code but is better
organized and documented.  So, I think it would be better to stick
with the original implementation, as otherwise we're likely to just
decrease test coverage without much gain.

  http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439

> @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  static int vhost_worker(void *data)
>  {
>  	struct vhost_dev *dev = data;
> -	struct vhost_work *work;
> +	struct vhost_work *work = NULL;
>  
> -repeat:
> -	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> +	for (;;) {
> +		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
>  
> -	if (kthread_should_stop()) {
> -		__set_current_state(TASK_RUNNING);
> -		return 0;
> -	}
> +		if (kthread_should_stop()) {
> +			__set_current_state(TASK_RUNNING);
> +			return 0;
> +		}
>  
> -	work = NULL;
> -	spin_lock_irq(&dev->work_lock);
> -	if (!list_empty(&dev->work_list)) {
> -		work = list_first_entry(&dev->work_list,
> -					struct vhost_work, node);
> -		list_del_init(&work->node);
> -	}
> -	spin_unlock_irq(&dev->work_lock);
> +		spin_lock_irq(&dev->work_lock);
> +		if (work) {
> +			work->done_seq = work->queue_seq;
> +			if (work->flushing)
> +				wake_up_all(&work->done);

I don't think doing this before executing the function is correct, so
you'll have to release the lock, execute the function, regrab the lock
and then do the flush processing.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-22 21:21                                     ` Tejun Heo
@ 2010-07-24 19:14                                       ` Michael S. Tsirkin
  2010-07-25  7:41                                         ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-24 19:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Thu, Jul 22, 2010 at 11:21:40PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/22/2010 05:58 PM, Michael S. Tsirkin wrote:
> > All the tricky barrier pairing made me uncomfortable.  So I came up with
> > this on top (untested): if we do all operations under the spinlock, we
> > can get by without barriers and atomics.  And since we need the lock for
> > list operations anyway, this should have no performance impact.
> > 
> > What do you think?
> 
> I've created kthread_worker in wq#for-next tree and already converted
> ivtv to use it.  Once this lands in mainline, I think converting vhost
> to use it would be better choice.  kthread worker code uses basically
> the same logic used in the vhost_workqueue code but is better
> organized and documented.  So, I think it would be better to stick
> with the original implementation, as otherwise we're likely to just
> decrease test coverage without much gain.
> 
>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439

Sure, if we keep using workqueue. But I'd like to investigate this
direction a bit more because there's discussion to switching from kthread to
regular threads altogether.

> > @@ -151,37 +161,37 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> >  static int vhost_worker(void *data)
> >  {
> >  	struct vhost_dev *dev = data;
> > -	struct vhost_work *work;
> > +	struct vhost_work *work = NULL;
> >  
> > -repeat:
> > -	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> > +	for (;;) {
> > +		set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> >  
> > -	if (kthread_should_stop()) {
> > -		__set_current_state(TASK_RUNNING);
> > -		return 0;
> > -	}
> > +		if (kthread_should_stop()) {
> > +			__set_current_state(TASK_RUNNING);
> > +			return 0;
> > +		}
> >  
> > -	work = NULL;
> > -	spin_lock_irq(&dev->work_lock);
> > -	if (!list_empty(&dev->work_list)) {
> > -		work = list_first_entry(&dev->work_list,
> > -					struct vhost_work, node);
> > -		list_del_init(&work->node);
> > -	}
> > -	spin_unlock_irq(&dev->work_lock);
> > +		spin_lock_irq(&dev->work_lock);
> > +		if (work) {
> > +			work->done_seq = work->queue_seq;
> > +			if (work->flushing)
> > +				wake_up_all(&work->done);
> 
> I don't think doing this before executing the function is correct,

Well, before I execute the function work is NULL, so this is skipped.
Correct?

> so
> you'll have to release the lock, execute the function, regrab the lock
> and then do the flush processing.
> 
> Thanks.

It's done in the loop, so I thought we can reuse the locking
done for the sake of processing the next work item.
Makes sense?


> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-24 19:14                                       ` Michael S. Tsirkin
@ 2010-07-25  7:41                                         ` Tejun Heo
  2010-07-25 10:04                                           ` Michael S. Tsirkin
  2010-07-26 15:25                                           ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-25  7:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
>> I've created kthread_worker in wq#for-next tree and already converted
>> ivtv to use it.  Once this lands in mainline, I think converting vhost
>> to use it would be better choice.  kthread worker code uses basically
>> the same logic used in the vhost_workqueue code but is better
>> organized and documented.  So, I think it would be better to stick
>> with the original implementation, as otherwise we're likely to just
>> decrease test coverage without much gain.
>>
>>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> 
> Sure, if we keep using workqueue. But I'd like to investigate this
> direction a bit more because there's discussion to switching from kthread to
> regular threads altogether.

Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
simple wrapper around kthread.  It now assumes kthread but changing it
to be useable with any thread shouldn't be too hard.  Wouldn't that be
better?

>> I don't think doing this before executing the function is correct,
> 
> Well, before I execute the function work is NULL, so this is skipped.
> Correct?
>
>> so
>> you'll have to release the lock, execute the function, regrab the lock
>> and then do the flush processing.
>>
>> Thanks.
> 
> It's done in the loop, so I thought we can reuse the locking
> done for the sake of processing the next work item.
> Makes sense?

Yeap, right.  I think it would make much more sense to use common code
when it becomes available but if you think the posted change is
necessary till then, please feel free to go ahead.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-25  7:41                                         ` Tejun Heo
@ 2010-07-25 10:04                                           ` Michael S. Tsirkin
  2010-07-26 15:25                                           ` Michael S. Tsirkin
  1 sibling, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-25 10:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it.  Once this lands in mainline, I think converting vhost
> >> to use it would be better choice.  kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented.  So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> > 
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion to switching from kthread to
> > regular threads altogether.
> 
> Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
> simple wrapper around kthread.  It now assumes kthread but changing it
> to be useable with any thread shouldn't be too hard.  Wouldn't that be
> better?

Yes, of course, when common code becomes available we should
switch to that.

> >> I don't think doing this before executing the function is correct,
> > 
> > Well, before I execute the function work is NULL, so this is skipped.
> > Correct?
> >
> >> so
> >> you'll have to release the lock, execute the function, regrab the lock
> >> and then do the flush processing.
> >>
> >> Thanks.
> > 
> > It's done in the loop, so I thought we can reuse the locking
> > done for the sake of processing the next work item.
> > Makes sense?
> 
> Yeap, right.  I think it would make much more sense to use common code
> when it becomes available but if you think the posted change is
> necessary till then, please feel free to go ahead.
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-25  7:41                                         ` Tejun Heo
  2010-07-25 10:04                                           ` Michael S. Tsirkin
@ 2010-07-26 15:25                                           ` Michael S. Tsirkin
  2010-07-26 15:34                                             ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 15:25 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Sun, Jul 25, 2010 at 09:41:22AM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/24/2010 09:14 PM, Michael S. Tsirkin wrote:
> >> I've created kthread_worker in wq#for-next tree and already converted
> >> ivtv to use it.  Once this lands in mainline, I think converting vhost
> >> to use it would be better choice.  kthread worker code uses basically
> >> the same logic used in the vhost_workqueue code but is better
> >> organized and documented.  So, I think it would be better to stick
> >> with the original implementation, as otherwise we're likely to just
> >> decrease test coverage without much gain.
> >>
> >>   http://git.kernel.org/?p=linux/kernel/git/tj/wq.git;a=commitdiff;h=b56c0d8937e665a27d90517ee7a746d0aa05af46;hp=53c5f5ba42c194cb13dd3083ed425f2c5b1ec439
> > 
> > Sure, if we keep using workqueue. But I'd like to investigate this
> > direction a bit more because there's discussion to switching from kthread to
> > regular threads altogether.
> 
> Hmmm? It doesn't have much to do with workqueue.  kthread_worker is a
> simple wrapper around kthread.  It now assumes kthread but changing it
> to be useable with any thread shouldn't be too hard.  Wouldn't that be
> better?

BTW, kthread_worker would benefit from the optimization I implemented
here as well.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 15:25                                           ` Michael S. Tsirkin
@ 2010-07-26 15:34                                             ` Tejun Heo
  2010-07-26 15:46                                               ` Tejun Heo
  2010-07-26 15:50                                               ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 15:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> BTW, kthread_worker would benefit from the optimization I implemented
> here as well.

Hmmm... I'm not quite sure whether it's an optimization.  I thought
the patch was due to feeling uncomfortable about using barriers?  Is
it an optimization?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 15:34                                             ` Tejun Heo
@ 2010-07-26 15:46                                               ` Tejun Heo
  2010-07-26 15:51                                                 ` Michael S. Tsirkin
  2010-07-26 15:50                                               ` Michael S. Tsirkin
  1 sibling, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 15:46 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 07/26/2010 05:34 PM, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
>> BTW, kthread_worker would benefit from the optimization I implemented
>> here as well.
> 
> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> the patch was due to feeling uncomfortable about using barriers?  Is
> it an optimization?

Yeah, one less smp_mb() in execution path.  The lock dancing in
flush() is ugly but then again mucking with barriers could be harder
to understand.  Care to send a patch against wq#for-next tree?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 15:34                                             ` Tejun Heo
  2010-07-26 15:46                                               ` Tejun Heo
@ 2010-07-26 15:50                                               ` Michael S. Tsirkin
  2010-07-26 16:05                                                 ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 15:50 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 05:34:44PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> > BTW, kthread_worker would benefit from the optimization I implemented
> > here as well.
> 
> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> the patch was due to feeling uncomfortable about using barriers?

Oh yes. But getting rid of barriers is what motivated me originally.

>  Is it an optimization?
> 
> Thanks.

Yes, sure. This removes atomic read and 2 barrier operations on data path.  And
it does not add any new synchronization: instead, we reuse the lock that we
take anyway.  The relevant part is:


+               if (work) {
+                       __set_current_state(TASK_RUNNING);
+                       work->fn(work);
+               } else
+                       schedule();

-       if (work) {
-               __set_current_state(TASK_RUNNING);
-               work->fn(work);
-               smp_wmb();      /* wmb worker-b0 paired with flush-b1 */
-               work->done_seq = work->queue_seq;
-               smp_mb();       /* mb worker-b1 paired with flush-b0 */
-               if (atomic_read(&work->flushing))
-                       wake_up_all(&work->done);
-       } else
-               schedule();
-
-       goto repeat;

Is there a git tree with kthread_worker applied?
I might do this just for fun ...


> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 15:46                                               ` Tejun Heo
@ 2010-07-26 15:51                                                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 15:51 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 05:46:30PM +0200, Tejun Heo wrote:
> On 07/26/2010 05:34 PM, Tejun Heo wrote:
> > Hello,
> > 
> > On 07/26/2010 05:25 PM, Michael S. Tsirkin wrote:
> >> BTW, kthread_worker would benefit from the optimization I implemented
> >> here as well.
> > 
> > Hmmm... I'm not quite sure whether it's an optimization.  I thought
> > the patch was due to feeling uncomfortable about using barriers?  Is
> > it an optimization?
> 
> Yeah, one less smp_mb() in execution path.  The lock dancing in
> flush() is ugly but then again mucking with barriers could be harder
> to understand.  Care to send a patch against wq#for-next tree?
> 
> Thanks.

Sure. Where's that, exactly?

> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 15:50                                               ` Michael S. Tsirkin
@ 2010-07-26 16:05                                                 ` Tejun Heo
  2010-07-26 16:14                                                   ` Tejun Heo
  2010-07-26 16:23                                                   ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 16:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
>> Hmmm... I'm not quite sure whether it's an optimization.  I thought
>> the patch was due to feeling uncomfortable about using barriers?
> 
> Oh yes. But getting rid of barriers is what motivated me originally.

Yeah, getting rid of barriers is always good.  :-)

> Is there a git tree with kthread_worker applied?
> I might do this just for fun ...

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next

For the original implementation, please take a look at commit
b56c0d8937e665a27d90517ee7a746d0aa05af46.

* Can you please keep the outer goto repeat loop?  I just don't like
  outermost for (;;).

* Placing try_to_freeze() could be a bit annoying.  It shouldn't be
  executed when there's a work to flush.

* I think A - B <= 0 test would be more familiar.  At least
  time_before/after() are implemented that way.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:05                                                 ` Tejun Heo
@ 2010-07-26 16:14                                                   ` Tejun Heo
  2010-07-26 16:31                                                     ` Michael S. Tsirkin
                                                                       ` (2 more replies)
  2010-07-26 16:23                                                   ` Michael S. Tsirkin
  1 sibling, 3 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 16:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Just one more thing.

On 07/26/2010 06:05 PM, Tejun Heo wrote:
> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>   executed when there's a work to flush.

* Similar issue exists for kthread_stop().  The kthread shouldn't exit
  while there's a work to flush (please note that kthread_worker
  interface allows detaching / attaching worker kthread during
  operation, so it should remain in consistent state with regard to
  flushing).

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:05                                                 ` Tejun Heo
  2010-07-26 16:14                                                   ` Tejun Heo
@ 2010-07-26 16:23                                                   ` Michael S. Tsirkin
  2010-07-26 19:04                                                     ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 16:23 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 06:05:27PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 05:50 PM, Michael S. Tsirkin wrote:
> >> Hmmm... I'm not quite sure whether it's an optimization.  I thought
> >> the patch was due to feeling uncomfortable about using barriers?
> > 
> > Oh yes. But getting rid of barriers is what motivated me originally.
> 
> Yeah, getting rid of barriers is always good.  :-)
> 
> > Is there a git tree with kthread_worker applied?
> > I might do this just for fun ...
> 
>  git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next
> 
> For the original implementaiton, please take a look at commit
> b56c0d8937e665a27d90517ee7a746d0aa05af46.
> 
> * Can you please keep the outer goto repeat loop?  I just don't like
>   outermost for (;;).

Okay ... can we put the code in a {} scope to make it clear
where the loop starts and ends?

> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>   executed when there's a work to flush.

It currently seems to be executed when there is work to flush.
Is this wrong?

> * I think A - B <= 0 test would be more familiar.  At least
>   time_before/after() are implemented that way.

I am concerned that this overflows a signed integer -
which I seem to remember that C99 disallows.
timer macros are on data path so might be worth the risk there,
but flush is slow path so better be safe?

> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:14                                                   ` Tejun Heo
@ 2010-07-26 16:31                                                     ` Michael S. Tsirkin
  2010-07-26 18:51                                                       ` Tejun Heo
  2010-07-26 16:51                                                     ` Michael S. Tsirkin
  2010-07-26 16:57                                                     ` Michael S. Tsirkin
  2 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 16:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.
> 
> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> > * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >   executed when there's a work to flush.

BTW why is this important?
We could always get another work and flush right after
try_to_freeze, and then flush would block for a long time.


BTW the vhost patch you sent does not do this at all.
I am guessing it is because our thread is not freezable?

> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
>   while there's a work to flush (please note that kthread_worker
>   interface allows detaching / attaching worker kthread during
>   operation, so it should remain in consistent state with regard to
>   flushing).
> 
> Thanks.

Not sure I agree here. Users must synchronise flush and stop calls.
Otherwise a work might get queued after stop is called, and
you won't be able to flush it.


> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:14                                                   ` Tejun Heo
  2010-07-26 16:31                                                     ` Michael S. Tsirkin
@ 2010-07-26 16:51                                                     ` Michael S. Tsirkin
  2010-07-26 19:14                                                       ` Tejun Heo
  2010-07-26 16:57                                                     ` Michael S. Tsirkin
  2 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 16:51 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
> Just one more thing.

I noticed that with vhost, flush_work was getting the worker
pointer as well. Can we live with this API change?

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:14                                                   ` Tejun Heo
  2010-07-26 16:31                                                     ` Michael S. Tsirkin
  2010-07-26 16:51                                                     ` Michael S. Tsirkin
@ 2010-07-26 16:57                                                     ` Michael S. Tsirkin
  2 siblings, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 16:57 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Here's an untested patch forward-ported from vhost
(works fine for vhost).

kthread_worker: replace barriers+atomics with a lock

We can save some cycles and make code simpler by
reusing worker lock for flush, instead of atomics.
flush_kthread_work needs to get worker pointer for
this to work.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 685ea65..19ae9f2 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -58,7 +58,7 @@ struct kthread_work {
 	struct list_head	node;
 	kthread_work_func_t	func;
 	wait_queue_head_t	done;
-	atomic_t		flushing;
+	int			flushing;
 	int			queue_seq;
 	int			done_seq;
 };
@@ -72,7 +72,7 @@ struct kthread_work {
 	.node = LIST_HEAD_INIT((work).node),				\
 	.func = (fn),							\
 	.done = __WAIT_QUEUE_HEAD_INITIALIZER((work).done),		\
-	.flushing = ATOMIC_INIT(0),					\
+	.flushing = 0,							\
 	}
 
 #define DEFINE_KTHREAD_WORKER(worker)					\
@@ -96,7 +96,8 @@ int kthread_worker_fn(void *worker_ptr);
 
 bool queue_kthread_work(struct kthread_worker *worker,
 			struct kthread_work *work);
-void flush_kthread_work(struct kthread_work *work);
+void flush_kthread_work(struct kthread_worker *worker,
+			struct kthread_work *work);
 void flush_kthread_worker(struct kthread_worker *worker);
 
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786..461f58d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -283,10 +283,12 @@ int kthreadd(void *unused)
 int kthread_worker_fn(void *worker_ptr)
 {
 	struct kthread_worker *worker = worker_ptr;
-	struct kthread_work *work;
+	struct kthread_work *work = NULL;
 
+	spin_lock_irq(&worker->lock);
 	WARN_ON(worker->task);
 	worker->task = current;
+	spin_unlock_irq(&worker->lock);
 repeat:
 	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
 
@@ -298,23 +300,23 @@ repeat:
 		return 0;
 	}
 
-	work = NULL;
 	spin_lock_irq(&worker->lock);
+	if (work) {
+		work->done_seq = work->queue_seq;
+		if (work->flushing)
+			wake_up_all(&work->done);
+	}
 	if (!list_empty(&worker->work_list)) {
 		work = list_first_entry(&worker->work_list,
 					struct kthread_work, node);
 		list_del_init(&work->node);
-	}
+	} else
+		work = NULL;
 	spin_unlock_irq(&worker->lock);
 
 	if (work) {
 		__set_current_state(TASK_RUNNING);
 		work->func(work);
-		smp_wmb();	/* wmb worker-b0 paired with flush-b1 */
-		work->done_seq = work->queue_seq;
-		smp_mb();	/* mb worker-b1 paired with flush-b0 */
-		if (atomic_read(&work->flushing))
-			wake_up_all(&work->done);
 	} else if (!freezing(current))
 		schedule();
 
@@ -353,31 +355,33 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
 
 /**
  * flush_kthread_work - flush a kthread_work
+ * @worker: where work might be running
  * @work: work to flush
  *
  * If @work is queued or executing, wait for it to finish execution.
  */
-void flush_kthread_work(struct kthread_work *work)
+void flush_kthread_work(struct kthread_worker *worker,
+			struct kthread_work *work)
 {
-	int seq = work->queue_seq;
+	int seq
 
-	atomic_inc(&work->flushing);
-
-	/*
-	 * mb flush-b0 paired with worker-b1, to make sure either
-	 * worker sees the above increment or we see done_seq update.
-	 */
-	smp_mb__after_atomic_inc();
+	spin_lock_irq(&worker->lock);
+	seq = work->queue_seq;
+	++work->flushing;
+	spin_unlock_irq(&worker->lock);
 
 	/* A - B <= 0 tests whether B is in front of A regardless of overflow */
-	wait_event(work->done, seq - work->done_seq <= 0);
-	atomic_dec(&work->flushing);
-
-	/*
-	 * rmb flush-b1 paired with worker-b0, to make sure our caller
-	 * sees every change made by work->func().
-	 */
-	smp_mb__after_atomic_dec();
+	wait_event(work->done,
+		   ({
+			int done;
+			spin_lock_irq(&worker->lock);
+		    	delta = seq - work->done_seq <= 0;
+			spin_unlock_irq(&worker->lock);
+			done;
+		   });
+	spin_lock_irq(&worker->lock);
+	--work->flushing;
+	spin_unlock_irq(&worker->lock);
 }
 EXPORT_SYMBOL_GPL(flush_kthread_work);
 

^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-02 21:06                                                               ` Oleg Nesterov
  2010-07-04  9:00                                                                 ` Michael S. Tsirkin
@ 2010-07-26 17:12                                                                 ` Michael S. Tsirkin
  2010-07-26 17:51                                                                   ` Sridhar Samudrala
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 17:12 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Sridhar Samudrala, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
> On 07/02, Peter Zijlstra wrote:
> >
> > On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> > >
> > >  Does  it (Tejun's kthread_clone() patch) also  inherit the
> > > cgroup of the caller?
> >
> > Of course, its a simple do_fork() which inherits everything just as you
> > would expect from a similar sys_clone()/sys_fork() call.
> 
> Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
> from ioctl(), right?
> 
> Then the new thread becomes the natural child of the caller, and it shares
> ->mm with the parent. And files, dup_fd() without CLONE_FS.
> 
> Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
> TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
> just because the parent gets SIGQUIT or abother coredumpable signal.
> Or the new thread can recieve SIGSTOP via ^Z.
> 
> Perhaps this is OK, I do not know. Just to remind that kernel_thread()
> is merely clone(CLONE_VM).
> 
> Oleg.

With some machinery to stop it later, yes.
Oleg, how does the below look to you?

Here I explicitly drop the fds so we don't share them.
CLONE_VM takes care of sharing the mm I think.
About signals - for the vhost-net use this is OK as we use
uninterruptible sleep anyway (like the new kthread_worker does).

This code seems to work fine for me so far - any comments?

---

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a1..72c7b17 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -9,6 +9,11 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   const char namefmt[], ...)
 	__attribute__((format(printf, 3, 4)));
 
+struct task_struct *kthread_create_inherit(int (*threadfn)(void *data),
+					   void *data,
+					   const char namefmt[], ...)
+	__attribute__((format(printf, 3, 4)));
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c7..b81588c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -149,6 +149,38 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create);
 
+/* Same as kthread_create, but inherit attributes (cgroups, priority, CPU mask)
+ * from current. */
+struct task_struct *kthread_create_inherit(int (*threadfn)(void *data),
+					   void *data,
+					   const char namefmt[],
+					   ...)
+{
+	struct kthread_create_info create;
+
+	create.threadfn = threadfn;
+	create.data = data;
+	init_completion(&create.done);
+
+	create_kthread(&create);
+	wait_for_completion(&create.done);
+
+	if (!IS_ERR(create.result)) {
+		va_list args;
+
+		/* Don't share files with parent as drivers use release for
+		 * close on exit, etc. */
+		exit_files(create.result);
+
+		va_start(args, namefmt);
+		vsnprintf(create.result->comm, sizeof(create.result->comm),
+			  namefmt, args);
+		va_end(args);
+	}
+	return create.result;
+}
+EXPORT_SYMBOL(kthread_create_inherit);
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().

^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 17:12                                                                 ` Michael S. Tsirkin
@ 2010-07-26 17:51                                                                   ` Sridhar Samudrala
  2010-07-26 18:08                                                                     ` Oleg Nesterov
  0 siblings, 1 reply; 115+ messages in thread
From: Sridhar Samudrala @ 2010-07-26 17:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Peter Zijlstra, Tejun Heo, Ingo Molnar, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, 2010-07-26 at 20:12 +0300, Michael S. Tsirkin wrote:
> On Fri, Jul 02, 2010 at 11:06:37PM +0200, Oleg Nesterov wrote:
> > On 07/02, Peter Zijlstra wrote:
> > >
> > > On Fri, 2010-07-02 at 11:01 -0700, Sridhar Samudrala wrote:
> > > >
> > > >  Does  it (Tejun's kthread_clone() patch) also  inherit the
> > > > cgroup of the caller?
> > >
> > > Of course, its a simple do_fork() which inherits everything just as you
> > > would expect from a similar sys_clone()/sys_fork() call.
> > 
> > Yes. And I'm afraid it can inherit more than we want. IIUC, this is called
> > from ioctl(), right?
> > 
> > Then the new thread becomes the natural child of the caller, and it shares
> > ->mm with the parent. And files, dup_fd() without CLONE_FS.
> > 
> > Signals. Say, if you send SIGKILL to this new thread, it can't sleep in
> > TASK_INTERRUPTIBLE or KILLABLE after that. And this SIGKILL can be sent
> > just because the parent gets SIGQUIT or abother coredumpable signal.
> > Or the new thread can recieve SIGSTOP via ^Z.
> > 
> > Perhaps this is OK, I do not know. Just to remind that kernel_thread()
> > is merely clone(CLONE_VM).
> > 
> > Oleg.
> 
> With some machinery to stop it later, yes.
> Oleg, how does the below look to you?
> 
> Here I explicitly drop the fds so we don't share them.
> CLONE_VM takes care of sharing the mm I think.
> About signals - for the vhost-net use this is OK as we use
> uninterruptible sleep anyway (like the new kthread_worker does).
> 
> This code seems to work fine for me so far - any comments?
> 
> ---
> 
> diff --git a/include/linux/kthread.h b/include/linux/kthread.h
> index aabc8a1..72c7b17 100644
> --- a/include/linux/kthread.h
> +++ b/include/linux/kthread.h
> @@ -9,6 +9,11 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
>  				   const char namefmt[], ...)
>  	__attribute__((format(printf, 3, 4)));
> 
> +struct task_struct *kthread_create_inherit(int (*threadfn)(void *data),
> +					   void *data,
> +					   const char namefmt[], ...)
> +	__attribute__((format(printf, 3, 4)));
> +
>  /**
>   * kthread_run - create and wake a thread.
>   * @threadfn: the function to run until signal_pending(current).
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index 83911c7..b81588c 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -149,6 +149,38 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
>  }
>  EXPORT_SYMBOL(kthread_create);
> 
> +/* Same as kthread_create, but inherit attributes (cgroups, priority, CPU mask)
> + * from current. */
> +struct task_struct *kthread_create_inherit(int (*threadfn)(void *data),
> +					   void *data,
> +					   const char namefmt[],
> +					   ...)
> +{
> +	struct kthread_create_info create;
> +
> +	create.threadfn = threadfn;
> +	create.data = data;
> +	init_completion(&create.done);
> +
> +	create_kthread(&create);
> +	wait_for_completion(&create.done);
> +
> +	if (!IS_ERR(create.result)) {
> +		va_list args;
> +
> +		/* Don't share files with parent as drivers use release for
> +		 * close on exit, etc. */
> +		exit_files(create.result);
> +
> +		va_start(args, namefmt);
> +		vsnprintf(create.result->comm, sizeof(create.result->comm),
> +			  namefmt, args);
> +		va_end(args);
> +	}
> +	return create.result;
> +}
> +EXPORT_SYMBOL(kthread_create_inherit);
> +
>  /**
>   * kthread_bind - bind a just-created kthread to a cpu.
>   * @p: thread created by kthread_create().


I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
flag rather than create_kthread() and then closing the files.
Either version should be fine.

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a1..634eaf7 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -9,6 +9,11 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   const char namefmt[], ...)
 	__attribute__((format(printf, 3, 4)));
 
+struct task_struct *kthread_clone(int (*threadfn)(void *data),
+				  void *data,
+				  const char namefmt[], ...)
+	__attribute__((format(printf, 3, 4)));
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c7..806dae5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -149,6 +149,38 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create);
 
+struct task_struct *kthread_clone(int (*threadfn)(void *data),
+				  void *data,
+				  const char namefmt[],
+				  ...)
+{
+	struct kthread_create_info create;
+	int pid;
+
+	create.threadfn = threadfn;
+	create.data = data;
+	init_completion(&create.done);
+	INIT_LIST_HEAD(&create.list);
+
+	pid = kernel_thread(kthread, &create, CLONE_FS);
+	if (pid < 0) {
+		create.result = ERR_PTR(pid);
+		complete(&create.done);
+	}
+	wait_for_completion(&create.done);
+
+	if (!IS_ERR(create.result)) {
+		va_list args;
+		va_start(args, namefmt);
+		vsnprintf(create.result->comm, sizeof(create.result->comm),
+			  namefmt, args);
+		va_end(args);
+	}
+
+	return create.result;
+}
+EXPORT_SYMBOL(kthread_clone);
+
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
  * @p: thread created by kthread_create().






^ permalink raw reply related	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 17:51                                                                   ` Sridhar Samudrala
@ 2010-07-26 18:08                                                                     ` Oleg Nesterov
  2010-07-26 19:55                                                                       ` Michael S. Tsirkin
                                                                                         ` (3 more replies)
  0 siblings, 4 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-26 18:08 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 07/26, Sridhar Samudrala wrote:
>
> I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> flag rather than create_kthread() and then closing the files.

!CLONE_FILES can't help. copy_files() does dup_fd() in this case.
The child still inherits the files.

> Either version should be fine.

I think neither version is fine ;)

exit_files() is not enough too. How about the signals, reparenting?


I already forgot all details, probably I missed something. But it
seems to me that it is better to just export get/set affinity and
forget about all complications.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:31                                                     ` Michael S. Tsirkin
@ 2010-07-26 18:51                                                       ` Tejun Heo
  2010-07-26 19:57                                                         ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 18:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
>> On 07/26/2010 06:05 PM, Tejun Heo wrote:
>>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>>>   executed when there's a work to flush.
> 
> BTW why is this important?
> We could always get another work and flush right after
> try_to_freeze, and then flush would block for a long time.
> 
> BTW the vhost patch you sent does not do this at all.
> I am guessing it is because our thread is not freezable?

Yeap, I think so.

>> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
>>   while there's a work to flush (please note that kthread_worker
>>   interface allows detaching / attaching worker kthread during
>>   operation, so it should remain in consistent state with regard to
>>   flushing).
> 
> Not sure I agree here. Users must synchronise flush and stop calls.
> Otherwise a work might get queued after stop is called, and
> you won't be able to flush it.

For freeze, it probably is okay but for stop, I think it's better to
keep the semantics straight forward.  It may be okay to do otherwise
but having such oddity in generic interface is nasty and may lead to
surprises which can be pretty difficult to track down later on.  It's
just a bit more of annoyance while writing the generic code, so...

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:23                                                   ` Michael S. Tsirkin
@ 2010-07-26 19:04                                                     ` Tejun Heo
  2010-07-26 20:19                                                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 19:04 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
>> * Can you please keep the outer goto repeat loop?  I just don't like
>>   outermost for (;;).
> 
> Okay ... can we put the code in a {} scope to make it clear
> where does the loop starts and ends?

If we're gonna do that, it would be better to put it inside a loop
construct.  The reason why I don't like it is that loops like that
don't really help read/writeability much while indenting the whole
logic unnecessarily and look more like a result of obsession against
goto rather than any practical reason.  It's just a cosmetic
preference and I might as well be the weirdo here, so if you feel
strong about it, please feel free to put everything in a loop.

>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
>>   executed when there's a work to flush.
> 
> It currently seems to be executed when there is work to flush.
> Is this wrong?

Oh, does it?  As I wrote in the other mail, things like that wouldn't
necessarily break correctness but I think it would be better to avoid
surprises in the generic code if not too difficult.

>> * I think A - B <= 0 test would be more familiar.  At least
>>   time_before/after() are implemented that way.
> 
> I am concerned that this overflows a signed integer -
> which I seem to remeber that C99 disallows.

Really?  Overflows of pointer isn't expected and that's why we have
weird RELOC_HIDE() macro for such calculations but integers not
expected to overflow is a news to me.  Are you sure?  That basically
means time_before/after() aren't safe either.

> timer macros are on data path so might be worth the risk there,
> but flush is slow path so better be safe?

I don't think performance matters much here.  I just think the sign
test is clearer / more familiar for the logic.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 16:51                                                     ` Michael S. Tsirkin
@ 2010-07-26 19:14                                                       ` Tejun Heo
  2010-07-26 19:31                                                         ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 19:14 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 26, 2010 at 06:14:30PM +0200, Tejun Heo wrote:
>> Just one more thing.
> 
> I noticed that with vhost, flush_work was getting the worker
> pointer as well. Can we live with this API change?

Yeah, the flushing mechanism wouldn't work reliably if the work is
queued to a different worker without flushing, so yeah passing in
@worker might actually be better.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 19:14                                                       ` Tejun Heo
@ 2010-07-26 19:31                                                         ` Tejun Heo
  2010-07-26 19:59                                                           ` Michael S. Tsirkin
  2010-07-27 19:19                                                           ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-26 19:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 09:14 PM, Tejun Heo wrote:
> On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
>> I noticed that with vhost, flush_work was getting the worker
>> pointer as well. Can we live with this API change?
> 
> Yeah, the flushing mechanism wouldn't work reliably if the work is
> queued to a different worker without flushing, so yeah passing in
> @worker might actually be better.

Thinking a bit more about it, it kind of sucks that queueing to
another worker from worker->func() breaks flush.  Maybe the right
thing to do there is using atomic_t for done_seq?  It pays a bit more
overhead but maybe that's justifiable to keep the API saner?  It would
be great if it can be fixed somehow even if it means that the work has
to be separately flushed for each worker it has been on before being
destroyed.

Or, if flushing has to be associated with a specific worker anyway,
maybe it would be better to move the sequence counter to
kthread_worker and do it similarly with the original workqueue so that
work can be destroyed once execution starts?  Then, it can at least
remain semantically identical to the original workqueue.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 18:08                                                                     ` Oleg Nesterov
@ 2010-07-26 19:55                                                                       ` Michael S. Tsirkin
  2010-07-26 20:27                                                                       ` Michael S. Tsirkin
                                                                                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 19:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> On 07/26, Sridhar Samudrala wrote:
> >
> > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > flag rather than create_kthread() and then closing the files.
> 
> !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> The child still inherits the files.
> 
> > Either version should be fine.
> 
> I think neither version is fine ;)
> 
> exit_files() is not enough too. How about the signals,

As I said, signals are unimportant as we are using this
thread to base a worker on - it sleeps uninterruptibly.

> reparenting?

That's actually a feature: it lets us find out which process
owns the device using the thread by looking at the parent.

> 
> I already forgot all details, probably I missed something. But it
> seems to me that it is better to just export get/set affinity and
> forget about all complications.
> 
> Oleg.

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 18:51                                                       ` Tejun Heo
@ 2010-07-26 19:57                                                         ` Michael S. Tsirkin
  2010-07-27  8:18                                                           ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 19:57 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 08:51:50PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 06:31 PM, Michael S. Tsirkin wrote:
> >> On 07/26/2010 06:05 PM, Tejun Heo wrote:
> >>> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >>>   executed when there's a work to flush.
> > 
> > BTW why is this important?
> > We could always get another work and flush right after
> > try_to_freeze, and then flush would block for a long time.
> > 
> > BTW the vhost patch you sent does not do this at all.
> > I am guessing it is because our thread is not freezable?
> 
> Yeap, I think so.
> 
> >> * Similar issue exists for kthread_stop().  The kthread shouldn't exit
> >>   while there's a work to flush (please note that kthread_worker
> >>   interface allows detaching / attaching worker kthread during
> >>   operation, so it should remain in consistent state with regard to
> >>   flushing).
> > 
> > Not sure I agree here. Users must synchronise flush and stop calls.
> > Otherwise a work might get queued after stop is called, and
> > you won't be able to flush it.
> 
> For freeze, it probably is okay but for stop, I think it's better to
> keep the semantics straightforward.

What are the semantics then? What do we want stop followed
by queue and flush to do?

>  It may be okay to do otherwise
> but having such oddity in generic interface is nasty and may lead to
> surprises which can be pretty difficult to track down later on.  It's
> just a bit more of annoyance while writing the generic code, so...
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 19:31                                                         ` Tejun Heo
@ 2010-07-26 19:59                                                           ` Michael S. Tsirkin
  2010-07-27 19:19                                                           ` Michael S. Tsirkin
  1 sibling, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 19:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> > 
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
> 
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush.  Maybe the right
> thing to do there is using atomic_t for done_seq?  It pays a bit more
> overhead but maybe that's justifiable to keep the API saner?  It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
> 
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly with the original workqueue so that
> work can be destroyed once execution starts?  Then, it can at least
> remain semantically identical to the original workqueue.
> 
> Thanks.

This last sounds sane: in fact I didn't know there is any difference.

> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 19:04                                                     ` Tejun Heo
@ 2010-07-26 20:19                                                       ` Michael S. Tsirkin
  2010-07-27  8:21                                                         ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 20:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 09:04:17PM +0200, Tejun Heo wrote:
> On 07/26/2010 06:23 PM, Michael S. Tsirkin wrote:
> >> * Can you please keep the outer goto repeat loop?  I just don't like
> >>   outermost for (;;).
> > 
> > Okay ... can we put the code in a {} scope to make it clear
> > where does the loop starts and ends?
> 
> If we're gonna do that, it would be better to put it inside a loop
> construct.  The reason why I don't like it is that loops like that
> don't really help read/writeability much while indenting the whole
> logic unnecessarily and look more like a result of obsession against
> goto rather than any practical reason.  It's just a cosmetic
> preference and I might as well be the weirdo here, so if you feel
> strong about it, please feel free to put everything in a loop.
> 
> >> * Placing try_to_freeze() could be a bit annoying.  It shouldn't be
> >>   executed when there's a work to flush.
> > 
> > It currently seems to be executed when there is work to flush.
> > Is this wrong?
> 
> Oh, does it?  As I wrote in the other mail, things like that wouldn't
> necessarily break correctness but I think it would be better to avoid
> surprises in the generic code if not too difficult.

Let's try to define what do we want to achieve then.
Do you want code that flushes workers not to block
when workers are frozen? How will we handle work
submitted when worker is frozen?


> >> * I think A - B <= 0 test would be more familiar.  At least
> >>   time_before/after() are implemented that way.
> > 
> > I am concerned that this overflows a signed integer -
> > which I seem to remember that C99 disallows.
> 
> Really?  Overflows of pointer isn't expected and that's why we have
> weird RELOC_HIDE() macro for such calculations but integers not
> expected to overflow is a news to me.  Are you sure?  That basically
> means time_before/after() aren't safe either.

As I said, in C99.
However,  the kernel is built with -fno-strict-overflow, so it will work.

> > timer macros are on data path so might be worth the risk there,
> > but flush is slow path so better be safe?
> 
> I don't think performance matters much here.  I just think the sign
> test is clearer / more familiar for the logic.
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 18:08                                                                     ` Oleg Nesterov
  2010-07-26 19:55                                                                       ` Michael S. Tsirkin
@ 2010-07-26 20:27                                                                       ` Michael S. Tsirkin
  2010-07-27  4:55                                                                       ` Michael S. Tsirkin
  2010-07-27 15:41                                                                       ` Michael S. Tsirkin
  3 siblings, 0 replies; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-26 20:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> On 07/26, Sridhar Samudrala wrote:
> >
> > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > flag rather than create_kthread() and then closing the files.
> 
> !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> The child still inherits the files.
> 
> > Either version should be fine.
> 
> I think neither version is fine ;)
> 
> exit_files() is not enough too. How about the signals, reparenting?
> 
> 
> I already forgot all details, probably I missed something. But it
> seems to me that it is better to just export get/set affinity and
> forget about all complications.
> 
> Oleg.

Almost forgot, we need the same for priority.

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 18:08                                                                     ` Oleg Nesterov
  2010-07-26 19:55                                                                       ` Michael S. Tsirkin
  2010-07-26 20:27                                                                       ` Michael S. Tsirkin
@ 2010-07-27  4:55                                                                       ` Michael S. Tsirkin
  2010-08-04 10:45                                                                         ` Peter Zijlstra
  2010-07-27 15:41                                                                       ` Michael S. Tsirkin
  3 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-27  4:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> On 07/26, Sridhar Samudrala wrote:
> >
> > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > flag rather than create_kthread() and then closing the files.
> 
> !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> The child still inherits the files.
> 
> > Either version should be fine.
> 
> I think neither version is fine ;)
> 
> exit_files() is not enough too. How about the signals, reparenting?
> 
> 
> I already forgot all details, probably I missed something. But it
> seems to me that it is better to just export get/set affinity and
> forget about all complications.
> 
> Oleg.


Peter, could you please indicate whether you think this is the way to
go, too?

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 19:57                                                         ` Michael S. Tsirkin
@ 2010-07-27  8:18                                                           ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-27  8:18 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 09:57 PM, Michael S. Tsirkin wrote:
>> For freeze, it probably is okay but for stop, I think it's better to
>> keep the semantics straightforward.
> 
> What are the semantics then? What do we want stop followed
> by queue and flush to do?

One scenario I can think of is the following.

 kthread_worker allows kthreads to be attached and stopped anytime, so
 if the caller stops the current worker while flushing is pending and
 attaches a new worker, the flushing which was pending will never
 happen.

But, in general, it's nasty to allow execution and its completion to
be separated.  Things like that are likely to bite us back in obscure
ways.  I think it would be silly to have such oddity in generic code
when it can be avoided without too much trouble.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 20:19                                                       ` Michael S. Tsirkin
@ 2010-07-27  8:21                                                         ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-27  8:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/26/2010 10:19 PM, Michael S. Tsirkin wrote:
> Let's try to define what do we want to achieve then.  Do you want
> code that flushes workers not to block when workers are frozen? How
> will we handle work submitted when worker is frozen?

As I wrote earlier, it's not necessarily about correctness but rather
avoiding unnecessary surprises and of course flushing can and should
stall if the queue is frozen but let's not separate execution of a
work and its completion with something which can take an indeterminate
amount of time.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-26 18:08                                                                     ` Oleg Nesterov
                                                                                         ` (2 preceding siblings ...)
  2010-07-27  4:55                                                                       ` Michael S. Tsirkin
@ 2010-07-27 15:41                                                                       ` Michael S. Tsirkin
  2010-07-30 14:19                                                                         ` Oleg Nesterov
  3 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-27 15:41 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> On 07/26, Sridhar Samudrala wrote:
> >
> > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > flag rather than create_kthread() and then closing the files.
> 
> !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> The child still inherits the files.
> 
> > Either version should be fine.
> 
> I think neither version is fine ;)
> 
> exit_files() is not enough too. How about the signals, reparenting?
> 
> 
> I already forgot all details, probably I missed something. But it
> seems to me that it is better to just export get/set affinity and
> forget about all complications.
> 
> Oleg.

Oleg, so can I attach your Ack to the patch in question, and merge
it all through net-next?

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-26 19:31                                                         ` Tejun Heo
  2010-07-26 19:59                                                           ` Michael S. Tsirkin
@ 2010-07-27 19:19                                                           ` Michael S. Tsirkin
  2010-07-28  7:48                                                             ` Tejun Heo
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-27 19:19 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Mon, Jul 26, 2010 at 09:31:58PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 07/26/2010 09:14 PM, Tejun Heo wrote:
> > On 07/26/2010 06:51 PM, Michael S. Tsirkin wrote:
> >> I noticed that with vhost, flush_work was getting the worker
> >> pointer as well. Can we live with this API change?
> > 
> > Yeah, the flushing mechanism wouldn't work reliably if the work is
> > queued to a different worker without flushing, so yeah passing in
> > @worker might actually be better.
> 
> Thinking a bit more about it, it kind of sucks that queueing to
> another worker from worker->func() breaks flush.  Maybe the right
> thing to do there is using atomic_t for done_seq?

I don't believe it will help: we might have:

worker1 runs work
work requeues itself queued index = 1
worker1 reads queued index = 1
worker2 runs work
work requeues itself queued index = 2
worker2 runs work
worker2 reads queued index = 2
worker2 writes done index = 2
worker1 writes done index = 1

As you see, done index got moved back.



>  It pays a bit more
> overhead but maybe that's justifiable to keep the API saner?  It would
> be great if it can be fixed somehow even if it means that the work has
> to be separately flushed for each worker it has been on before being
> destroyed.
> 
> Or, if flushing has to be associated with a specific worker anyway,
> maybe it would be better to move the sequence counter to
> kthread_worker and do it similarly with the original workqueue so that
> work can be destroyed once execution starts?  Then, it can at least
> remain semantically identical to the original workqueue.
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-27 19:19                                                           ` Michael S. Tsirkin
@ 2010-07-28  7:48                                                             ` Tejun Heo
  2010-07-28 10:48                                                               ` Michael S. Tsirkin
  0 siblings, 1 reply; 115+ messages in thread
From: Tejun Heo @ 2010-07-28  7:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
>> Thinking a bit more about it, it kind of sucks that queueing to
>> another worker from worker->func() breaks flush.  Maybe the right
>> thing to do there is using atomic_t for done_seq?
> 
> I don't believe it will help: we might have:
> 
> worker1 runs work
> work requeues itself queued index = 1
> worker1 reads queued index = 1
> worker2 runs work
> work requeues itself queued index = 2
> worker2 runs work
> worker2 reads queued index = 2
> worker2 writes done index = 2
> worker1 writes done index = 1
> 
> As you see, done index got moved back.

Yeah, I think the flushing logic should be moved to the worker.  Are
you interested in doing it w/ your change?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-28  7:48                                                             ` Tejun Heo
@ 2010-07-28 10:48                                                               ` Michael S. Tsirkin
  2010-07-28 12:00                                                                 ` Tejun Heo
  0 siblings, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-07-28 10:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

On Wed, Jul 28, 2010 at 09:48:31AM +0200, Tejun Heo wrote:
> On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
> >> Thinking a bit more about it, it kind of sucks that queueing to
> >> another worker from worker->func() breaks flush.  Maybe the right
> >> thing to do there is using atomic_t for done_seq?
> > 
> > I don't believe it will help: we might have:
> > 
> > worker1 runs work
> > work requeues itself queued index = 1
> > worker1 reads queued index = 1
> > worker2 runs work
> > work requeues itself queued index = 2
> > worker2 runs work
> > worker2 reads queued index = 2
> > worker2 writes done index = 2
> > worker1 writes done index = 1
> > 
> > As you see, done index got moved back.
> 
> Yeah, I think the flushing logic should be moved to the worker.
> Are you interested in doing it w/ your change?
> 
> Thanks.

I'm unsure how flush_work operates under these conditions.  E.g. in
workqueue.c, this seems to work by keeping a pointer to current
workqueue in the work.  But what prevents us from destroying the
workqueue when work might not be running?

Is this currently broken if you use multiple workqueues
for the same work? If yes, I propose we do as I did,
making flush_work get worker pointer, and only flushing
on that worker.

> -- 
> tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
  2010-07-28 10:48                                                               ` Michael S. Tsirkin
@ 2010-07-28 12:00                                                                 ` Tejun Heo
  0 siblings, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-28 12:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml, kvm,
	Andrew Morton, Dmitri Vorobiev, Jiri Kosina, Thomas Gleixner,
	Ingo Molnar, Andi Kleen

Hello,

On 07/28/2010 12:48 PM, Michael S. Tsirkin wrote:
> I'm unsure how flush_work operates under these conditions.  E.g. in
> workqueue.c, this seems to work by keeping a pointer to current
> workqueue in the work.  But what prevents us from destroying the
> workqueue when work might not be running?

In cmwq, work points to the gcwq it was on, which keeps track of all
the works in progress, so flushing work which is on a destroyed
workqueue should be fine, but in the original implementation, it would
end up accessing freed memory.

> Is this currently broken if you use multiple workqueues
> for the same work? If yes, I propose we do as I did,
> making flush_work get worker pointer, and only flushing
> on that worker.

The original semantics of workqueue is that flush_work() guarantees
that the work has finished executing on the workqueue it was last
queued on.  Adding @worker to flush_work() is okay, I think.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-27 15:41                                                                       ` Michael S. Tsirkin
@ 2010-07-30 14:19                                                                         ` Oleg Nesterov
  2010-07-30 14:31                                                                           ` Tejun Heo
  2010-08-01  8:50                                                                           ` Michael S. Tsirkin
  0 siblings, 2 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-07-30 14:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

Sorry for the delay, I can't be responsive these days...

On 07/27, Michael S. Tsirkin wrote:
>
> On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> > On 07/26, Sridhar Samudrala wrote:
> > >
> > > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > > flag rather than create_kthread() and then closing the files.
> >
> > !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> > The child still inherits the files.
> >
> > > Either version should be fine.
> >
> > I think neither version is fine ;)
> >
> > exit_files() is not enough too. How about the signals, reparenting?
> >
> >
> > I already forgot all details, probably I missed something. But it
> > seems to me that it is better to just export get/set affinity and
> > forget about all complications.
> >
> > Oleg.
>
> Oleg, so can I attach your Ack to the patch in question, and merge
> it all through net-next?

Well, I do not think you need my ack ;)


But I must admit, I personally dislike this idea. A kernel thread which
is the child of the user-space process, and in fact it is not the "real"
kernel thread. I think this is against the common case. If you do not
care the signals/reparenting, why can't you fork the user-space process
which does all the work via ioctl's ? OK, I do not understand the problem
domain, probably this can't work.

Anyway, the patch looks buggy to me. Starting from

	create_kthread(&create);
	wait_for_completion(&create.done);

At least you should check create_kthread() succeeds, otherwise
wait_for_completion() will hang forever. OTOH, if it succeeds then
wait_for_completion() is not needed. But this is minor.

create_kthread()->kernel_thread() uses CLONE_VM, this means that the
child will share ->mm. And this means that if the parent receives
the coredumping signal it will hang forever in kernel space waiting
until this child exits.

This is just the immediate surprise I can see with this approach,
I am afraid there is something else.

And once again: we are doing these hacks only because we lack a
couple of exports (IIUC). This is, well, a bit strange ;)

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-30 14:19                                                                         ` Oleg Nesterov
@ 2010-07-30 14:31                                                                           ` Tejun Heo
  2010-08-01  8:50                                                                           ` Michael S. Tsirkin
  1 sibling, 0 replies; 115+ messages in thread
From: Tejun Heo @ 2010-07-30 14:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Sridhar Samudrala, Peter Zijlstra,
	Ingo Molnar, netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev,
	Jiri Kosina, Thomas Gleixner, Andi Kleen

Hello,

On 07/30/2010 04:19 PM, Oleg Nesterov wrote:
> But I must admit, I personally dislike this idea. A kernel thread which
> is the child of the user-space process, and in fact it is not the "real"
> kernel thread. I think this is against the common case. If you do not
> care the signals/reparenting, why can't you fork the user-space process
> which does all the work via ioctl's ? OK, I do not understand the problem
> domain, probably this can't work.

Having kernel threads which are children of user process is plain
scary considering the many things parent/children relationship implies
and various bugs and security vulnerabilities in the area.  I can't
pinpoint any problem but I think we really shouldn't be adding
something like this for this specific use case.  If we can get away
with exporting a few symbols, let's go that way.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-30 14:19                                                                         ` Oleg Nesterov
  2010-07-30 14:31                                                                           ` Tejun Heo
@ 2010-08-01  8:50                                                                           ` Michael S. Tsirkin
  2010-08-02 15:02                                                                             ` Oleg Nesterov
  1 sibling, 1 reply; 115+ messages in thread
From: Michael S. Tsirkin @ 2010-08-01  8:50 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Fri, Jul 30, 2010 at 04:19:01PM +0200, Oleg Nesterov wrote:
> Sorry for the delay, I can't be responsive these days...
> 
> On 07/27, Michael S. Tsirkin wrote:
> >
> > On Mon, Jul 26, 2010 at 08:08:34PM +0200, Oleg Nesterov wrote:
> > > On 07/26, Sridhar Samudrala wrote:
> > > >
> > > > I have been testing out a similar patch that uses kernel_thread() without CLONE_FILES
> > > > flag rather than create_kthread() and then closing the files.
> > >
> > > !CLONE_FILES can't help. copy_files() does dup_fd() in this case.
> > > The child still inherits the files.
> > >
> > > > Either version should be fine.
> > >
> > > I think neither version is fine ;)
> > >
> > > exit_files() is not enough too. How about the signals, reparenting?
> > >
> > >
> > > I already forgot all the details, probably I missed something. But it
> > > seems to me that it is better to just export get/set affinity and
> > > forget about all complications.
> > >
> > > Oleg.
> >
> > Oleg, so can I attach your Ack to the patch in question, and merge
> > it all through net-next?
> 
> Well, I do not think you need my ack ;)
> 
> 
> But I must admit, I personally dislike this idea. A kernel thread which
> is the child of the user-space process, and in fact it is not the "real"
> kernel thread. I think this is against the common case. If you do not
> care the signals/reparenting, why can't you fork the user-space process
> which does all the work via ioctl's ? OK, I do not understand the problem
> domain, probably this can't work.
> 
> Anyway, the patch looks buggy to me. Starting from
> 
> 	create_kthread(&create);
> 	wait_for_completion(&create.done);
> 
> At least you should check create_kthread() succeeds, otherwise
> wait_for_completion() will hang forever. OTOH, if it succeeds then
> wait_for_completion() is not needed. But this is minor.
> 
> create_kthread()->kernel_thread() uses CLONE_VM, this means that the
> child will share ->mm. And this means that if the parent receives
> the coredumping signal it will hang forever in kernel space waiting
> until this child exits.
> 
> This is just the immediate surprise I can see with this approach,
> I am afraid there is something else.
> 
> And once again. We are doing these hacks only because we lack a
> couple of exports (IIUC). This is, well, a bit strange ;)
> 
> Oleg.


Oleg, I mean Ack the exporting of get/set affinity.

Thanks!

-- 
MST

^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-08-01  8:50                                                                           ` Michael S. Tsirkin
@ 2010-08-02 15:02                                                                             ` Oleg Nesterov
  0 siblings, 0 replies; 115+ messages in thread
From: Oleg Nesterov @ 2010-08-02 15:02 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, Peter Zijlstra, Tejun Heo, Ingo Molnar,
	netdev, lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On 08/01, Michael S. Tsirkin wrote:
>
> Oleg, I mean Ack the exporting of get/set affinity.

Ah, I misunderstood.

Yes, I believe the exporting is the lesser evil. Please feel free
to add my ack.

Oleg.


^ permalink raw reply	[flat|nested] 115+ messages in thread

* Re: [PATCH repost] sched: export sched_set/getaffinity to modules
  2010-07-27  4:55                                                                       ` Michael S. Tsirkin
@ 2010-08-04 10:45                                                                         ` Peter Zijlstra
  0 siblings, 0 replies; 115+ messages in thread
From: Peter Zijlstra @ 2010-08-04 10:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, Tejun Heo, Ingo Molnar, netdev,
	lkml, kvm, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Andi Kleen

On Tue, 2010-07-27 at 07:55 +0300, Michael S. Tsirkin wrote:

> Peter, could you please indicate whether you think this is the way to
> go, too?

I really dislike it, as you indicated, you now want priority too..

It seems the problem is that we normally don't consider work done by
kernel threads for user processes part of that process.

I'm not sure what work you're doing, but I'm pretty sure there's similar
things already in the kernel -- think about the work done by encryption
threads for encrypted sockets and stuff.

If you want proper containment of work caused by a process, I'd suggest
you start by looking at curing the general problem, instead of special
casing this one case.

^ permalink raw reply	[flat|nested] 115+ messages in thread

end of thread, other threads:[~2010-08-04 10:46 UTC | newest]

Thread overview: 115+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-05-19  0:04 [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
2010-05-27  9:14 ` Michael S. Tsirkin
2010-05-27 12:44   ` Oleg Nesterov
2010-05-27 13:12     ` Michael S. Tsirkin
2010-05-27 13:48       ` Oleg Nesterov
2010-05-27 16:15       ` Tejun Heo
2010-05-27 16:39         ` Michael S. Tsirkin
2010-05-27 16:56           ` Tejun Heo
2010-05-27 17:32             ` Michael S. Tsirkin
2010-05-27 21:20               ` Tejun Heo
2010-05-28 15:08                 ` Michael S. Tsirkin
2010-05-28 15:54                   ` Tejun Heo
2010-05-30 11:29                     ` Michael S. Tsirkin
2010-05-30 20:24                       ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Tejun Heo
2010-05-31 14:39                         ` Oleg Nesterov
2010-05-31 15:07                           ` Tejun Heo
2010-05-31 15:31                             ` Oleg Nesterov
2010-05-31 15:38                               ` Tejun Heo
2010-05-31 15:22                         ` Michael S. Tsirkin
2010-05-31 15:45                           ` Tejun Heo
2010-05-31 16:00                             ` Michael S. Tsirkin
2010-06-01  9:34                               ` Tejun Heo
2010-06-02 18:40                                 ` [PATCH UPDATED " Tejun Heo
2010-06-02 21:34                                   ` Sridhar Samudrala
2010-07-22 15:58                                   ` Michael S. Tsirkin
2010-07-22 21:21                                     ` Tejun Heo
2010-07-24 19:14                                       ` Michael S. Tsirkin
2010-07-25  7:41                                         ` Tejun Heo
2010-07-25 10:04                                           ` Michael S. Tsirkin
2010-07-26 15:25                                           ` Michael S. Tsirkin
2010-07-26 15:34                                             ` Tejun Heo
2010-07-26 15:46                                               ` Tejun Heo
2010-07-26 15:51                                                 ` Michael S. Tsirkin
2010-07-26 15:50                                               ` Michael S. Tsirkin
2010-07-26 16:05                                                 ` Tejun Heo
2010-07-26 16:14                                                   ` Tejun Heo
2010-07-26 16:31                                                     ` Michael S. Tsirkin
2010-07-26 18:51                                                       ` Tejun Heo
2010-07-26 19:57                                                         ` Michael S. Tsirkin
2010-07-27  8:18                                                           ` Tejun Heo
2010-07-26 16:51                                                     ` Michael S. Tsirkin
2010-07-26 19:14                                                       ` Tejun Heo
2010-07-26 19:31                                                         ` Tejun Heo
2010-07-26 19:59                                                           ` Michael S. Tsirkin
2010-07-27 19:19                                                           ` Michael S. Tsirkin
2010-07-28  7:48                                                             ` Tejun Heo
2010-07-28 10:48                                                               ` Michael S. Tsirkin
2010-07-28 12:00                                                                 ` Tejun Heo
2010-07-26 16:57                                                     ` Michael S. Tsirkin
2010-07-26 16:23                                                   ` Michael S. Tsirkin
2010-07-26 19:04                                                     ` Tejun Heo
2010-07-26 20:19                                                       ` Michael S. Tsirkin
2010-07-27  8:21                                                         ` Tejun Heo
2010-06-01  9:34                               ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
2010-06-01  9:35                               ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost workers Tejun Heo
2010-06-01 10:17                                 ` Michael S. Tsirkin
2010-06-01 10:56                                   ` Tejun Heo
2010-06-01 17:19                                 ` Sridhar Samudrala
2010-06-01 23:59                                   ` Tejun Heo
2010-06-01 14:13                           ` [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread Paul E. McKenney
2010-05-30 20:24                       ` [PATCH 2/3] cgroups: Add an API to attach a task to current task's cgroup Tejun Heo
2010-05-31  1:07                         ` Li Zefan
2010-05-31  7:00                           ` Tejun Heo
2010-05-30 20:25                       ` [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers Tejun Heo
2010-05-31  1:11                         ` Li Zefan
2010-05-31  6:58                           ` [PATCH UPDATED " Tejun Heo
2010-05-31  7:48                             ` Li Zefan
2010-05-31 10:20                               ` [PATCH UPDATED2 " Tejun Heo
2010-06-24  8:11                         ` [PATCH " Michael S. Tsirkin
2010-06-24 22:45                           ` Sridhar Samudrala
2010-06-25 10:10                             ` [PATCH] sched: export sched_set/getaffinity (was Re: [PATCH 3/3] vhost: apply cpumask and cgroup to vhost pollers) Michael S. Tsirkin
2010-07-01 11:07                               ` [PATCH repost] sched: export sched_set/getaffinity to modules Michael S. Tsirkin
2010-07-01 11:19                                 ` Peter Zijlstra
2010-07-01 11:43                                   ` Peter Zijlstra
2010-07-01 11:55                                     ` Michael S. Tsirkin
2010-07-01 12:23                                       ` Michael S. Tsirkin
2010-07-01 12:34                                         ` Peter Zijlstra
2010-07-01 12:46                                           ` Peter Zijlstra
2010-07-01 13:08                                             ` Michael S. Tsirkin
2010-07-01 13:30                                               ` Peter Zijlstra
2010-07-01 13:39                                                 ` Michael S. Tsirkin
2010-07-01 13:57                                                   ` Peter Zijlstra
2010-07-01 14:27                                                   ` Tejun Heo
2010-07-01 14:46                                                     ` Oleg Nesterov
2010-07-01 14:53                                                       ` Tejun Heo
2010-07-01 14:55                                                         ` Peter Zijlstra
2010-07-02 18:01                                                           ` Sridhar Samudrala
2010-07-02 18:11                                                             ` Peter Zijlstra
2010-07-02 21:06                                                               ` Oleg Nesterov
2010-07-04  9:00                                                                 ` Michael S. Tsirkin
2010-07-13  6:59                                                                   ` Sridhar Samudrala
2010-07-13 11:09                                                                     ` Michael S. Tsirkin
2010-07-14 23:26                                                                       ` Sridhar Samudrala
2010-07-15  0:05                                                                         ` Oleg Nesterov
2010-07-15  5:29                                                                           ` Sridhar Samudrala
2010-07-26 17:12                                                                 ` Michael S. Tsirkin
2010-07-26 17:51                                                                   ` Sridhar Samudrala
2010-07-26 18:08                                                                     ` Oleg Nesterov
2010-07-26 19:55                                                                       ` Michael S. Tsirkin
2010-07-26 20:27                                                                       ` Michael S. Tsirkin
2010-07-27  4:55                                                                       ` Michael S. Tsirkin
2010-08-04 10:45                                                                         ` Peter Zijlstra
2010-07-27 15:41                                                                       ` Michael S. Tsirkin
2010-07-30 14:19                                                                         ` Oleg Nesterov
2010-07-30 14:31                                                                           ` Tejun Heo
2010-08-01  8:50                                                                           ` Michael S. Tsirkin
2010-08-02 15:02                                                                             ` Oleg Nesterov
2010-07-01 14:33                                                 ` Oleg Nesterov
2010-07-01 12:32                                       ` Peter Zijlstra
2010-07-01 12:50                                         ` Michael S. Tsirkin
2010-07-01 13:07                                           ` Peter Zijlstra
2010-07-01 13:22                                             ` Michael S. Tsirkin
2010-05-27 16:24     ` [PATCH 2/3] workqueue: Add an API to create a singlethread workqueue attached to the current task's cgroup Sridhar Samudrala
2010-05-27 16:41       ` Michael S. Tsirkin
2010-05-27 17:30       ` Oleg Nesterov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.