linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] ipc: use a work queue to free_ipc
@ 2020-02-17 18:36 Giuseppe Scrivano
  2020-02-17 18:56 ` Paul E. McKenney
  2020-02-21 19:37 ` Eric W. Biederman
  0 siblings, 2 replies; 5+ messages in thread
From: Giuseppe Scrivano @ 2020-02-17 18:36 UTC (permalink / raw)
  To: linux-kernel; +Cc: rcu, ebiederm, paulmck, viro

it avoids blocking on synchronize_rcu() in kern_umount().

the code:

#define _GNU_SOURCE
#include <sched.h>
#include <error.h>
#include <errno.h>
#include <stdlib.h>
int main()
{
  int i;
  /* Repeatedly replace the caller's IPC namespace.  Each successful
   * unshare() drops the reference on the previous namespace, which is
   * what exercises the namespace-teardown path (put_ipc_ns ->
   * kern_unmount -> synchronize_rcu) this patch is benchmarking. */
  for (i  = 0; i < 1000; i++)
    if (unshare (CLONE_NEWIPC) < 0)
      error (EXIT_FAILURE, errno, "unshare");
}

gets from:

	Command being timed: "./ipc-namespace"
	User time (seconds): 0.00
	System time (seconds): 0.06
	Percent of CPU this job got: 0%
	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05

to:

	Command being timed: "./ipc-namespace"
	User time (seconds): 0.00
	System time (seconds): 0.02
	Percent of CPU this job got: 96%
	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
---
v2:
- comment added in free_ipc_ns()

v1: https://lkml.org/lkml/2020/2/11/692

 include/linux/ipc_namespace.h |  2 ++
 ipc/namespace.c               | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index c309f43bde45..a06a78c67f19 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -68,6 +68,8 @@ struct ipc_namespace {
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
 
+	struct llist_node mnt_llist;
+
 	struct ns_common ns;
 } __randomize_layout;
 
diff --git a/ipc/namespace.c b/ipc/namespace.c
index b3ca1476ca51..7b9922244891 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
 
 static void free_ipc_ns(struct ipc_namespace *ns)
 {
+	/* mq_put_mnt() waits for a grace period as kern_unmount()
+	 * uses synchronize_rcu().
+	 */
+	mq_put_mnt(ns);
 	sem_exit_ns(ns);
 	msg_exit_ns(ns);
 	shm_exit_ns(ns);
@@ -127,6 +131,17 @@ static void free_ipc_ns(struct ipc_namespace *ns)
 	kfree(ns);
 }
 
+static LLIST_HEAD(free_ipc_list);
+static void free_ipc(struct work_struct *unused)
+{
+	struct llist_node *node = llist_del_all(&free_ipc_list);
+	struct ipc_namespace *n, *t;
+
+	llist_for_each_entry_safe(n, t, node, mnt_llist)
+		free_ipc_ns(n);
+}
+static DECLARE_WORK(free_ipc_work, free_ipc);
+
 /*
  * put_ipc_ns - drop a reference to an ipc namespace.
  * @ns: the namespace to put
@@ -148,8 +163,9 @@ void put_ipc_ns(struct ipc_namespace *ns)
 	if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
 		mq_clear_sbinfo(ns);
 		spin_unlock(&mq_lock);
-		mq_put_mnt(ns);
-		free_ipc_ns(ns);
+
+		if (llist_add(&ns->mnt_llist, &free_ipc_list))
+			schedule_work(&free_ipc_work);
 	}
 }
 
-- 
2.24.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] ipc: use a work queue to free_ipc
  2020-02-17 18:36 [PATCH v2] ipc: use a work queue to free_ipc Giuseppe Scrivano
@ 2020-02-17 18:56 ` Paul E. McKenney
  2020-02-21 19:37 ` Eric W. Biederman
  1 sibling, 0 replies; 5+ messages in thread
From: Paul E. McKenney @ 2020-02-17 18:56 UTC (permalink / raw)
  To: Giuseppe Scrivano; +Cc: linux-kernel, rcu, ebiederm, viro

On Mon, Feb 17, 2020 at 07:36:27PM +0100, Giuseppe Scrivano wrote:
> it avoids blocking on synchronize_rcu() in kern_umount().
> 
> the code:
> 
> \#define _GNU_SOURCE
> \#include <sched.h>
> \#include <error.h>
> \#include <errno.h>
> \#include <stdlib.h>
> int main()
> {
>   int i;
>   for (i  = 0; i < 1000; i++)
>     if (unshare (CLONE_NEWIPC) < 0)
>       error (EXIT_FAILURE, errno, "unshare");
> }
> 
> gets from:
> 
> 	Command being timed: "./ipc-namespace"
> 	User time (seconds): 0.00
> 	System time (seconds): 0.06
> 	Percent of CPU this job got: 0%
> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05
> 
> to:
> 
> 	Command being timed: "./ipc-namespace"
> 	User time (seconds): 0.00
> 	System time (seconds): 0.02
> 	Percent of CPU this job got: 96%
> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03
> 
> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
> ---
> v2:
> - comment added in free_ipc_ns()

Much better, thank you!

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>

> v1: https://lkml.org/lkml/2020/2/11/692
> 
>  include/linux/ipc_namespace.h |  2 ++
>  ipc/namespace.c               | 20 ++++++++++++++++++--
>  2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index c309f43bde45..a06a78c67f19 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -68,6 +68,8 @@ struct ipc_namespace {
>  	struct user_namespace *user_ns;
>  	struct ucounts *ucounts;
>  
> +	struct llist_node mnt_llist;
> +
>  	struct ns_common ns;
>  } __randomize_layout;
>  
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index b3ca1476ca51..7b9922244891 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
>  
>  static void free_ipc_ns(struct ipc_namespace *ns)
>  {
> +	/* mq_put_mnt() waits for a grace period as kern_unmount()
> +	 * uses synchronize_rcu().
> +	 */
> +	mq_put_mnt(ns);
>  	sem_exit_ns(ns);
>  	msg_exit_ns(ns);
>  	shm_exit_ns(ns);
> @@ -127,6 +131,17 @@ static void free_ipc_ns(struct ipc_namespace *ns)
>  	kfree(ns);
>  }
>  
> +static LLIST_HEAD(free_ipc_list);
> +static void free_ipc(struct work_struct *unused)
> +{
> +	struct llist_node *node = llist_del_all(&free_ipc_list);
> +	struct ipc_namespace *n, *t;
> +
> +	llist_for_each_entry_safe(n, t, node, mnt_llist)
> +		free_ipc_ns(n);
> +}
> +static DECLARE_WORK(free_ipc_work, free_ipc);
> +
>  /*
>   * put_ipc_ns - drop a reference to an ipc namespace.
>   * @ns: the namespace to put
> @@ -148,8 +163,9 @@ void put_ipc_ns(struct ipc_namespace *ns)
>  	if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
>  		mq_clear_sbinfo(ns);
>  		spin_unlock(&mq_lock);
> -		mq_put_mnt(ns);
> -		free_ipc_ns(ns);
> +
> +		if (llist_add(&ns->mnt_llist, &free_ipc_list))
> +			schedule_work(&free_ipc_work);
>  	}
>  }
>  
> -- 
> 2.24.1
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] ipc: use a work queue to free_ipc
  2020-02-17 18:36 [PATCH v2] ipc: use a work queue to free_ipc Giuseppe Scrivano
  2020-02-17 18:56 ` Paul E. McKenney
@ 2020-02-21 19:37 ` Eric W. Biederman
  2020-02-23 19:01   ` Giuseppe Scrivano
  1 sibling, 1 reply; 5+ messages in thread
From: Eric W. Biederman @ 2020-02-21 19:37 UTC (permalink / raw)
  To: Giuseppe Scrivano; +Cc: linux-kernel, rcu, paulmck, viro

Giuseppe Scrivano <gscrivan@redhat.com> writes:

> it avoids blocking on synchronize_rcu() in kern_umount().
>
> the code:
>
> \#define _GNU_SOURCE
> \#include <sched.h>
> \#include <error.h>
> \#include <errno.h>
> \#include <stdlib.h>
> int main()
> {
>   int i;
>   for (i  = 0; i < 1000; i++)
>     if (unshare (CLONE_NEWIPC) < 0)
>       error (EXIT_FAILURE, errno, "unshare");
> }
>
> gets from:
>
> 	Command being timed: "./ipc-namespace"
> 	User time (seconds): 0.00
> 	System time (seconds): 0.06
> 	Percent of CPU this job got: 0%
> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05
>
> to:
>
> 	Command being timed: "./ipc-namespace"
> 	User time (seconds): 0.00
> 	System time (seconds): 0.02
> 	Percent of CPU this job got: 96%
> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03

I have a question.  You create 1000 namespaces in a single process
and then free them.  So I expect that single process is busy waiting
for that kern_umount 1000 times, and waiting for 1000 synchronize_rcu's.

Does this ever show up in a real world work-load?

Is the cost of a single synchronize_rcu a problem?

The code you are working to avoid is this.

void kern_unmount(struct vfsmount *mnt)
{
	/* release long term mount so mount point can be released */
	if (!IS_ERR_OR_NULL(mnt)) {
		real_mount(mnt)->mnt_ns = NULL;
		synchronize_rcu();	/* yecchhh... */
		mntput(mnt);
	}
}

Which makes me wonder if perhaps there might be a simpler solution
involving just that code.  But I do realize such a solution
would require analyzing all of the code after kern_unmount
to see if any of it depends upon the synchronize_rcu.


In summary, I see no correctness problems with your code.
Code that runs faster is always nice.  In this case I just
see the cost being shifted somewhere else not eliminated.
I also see a slight increase in complexity.

So I am wondering if this was an exercise to speed up a toy
benchmark or if this is an effort to speed of real world code.

At the very least some version of the motivation needs to be
recorded so that the next time someone comes in and reworks
the code they can look in the history and figure out what
they need to do to avoid introducing a regression.

Eric

> Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
> ---
> v2:
> - comment added in free_ipc_ns()
>
> v1: https://lkml.org/lkml/2020/2/11/692
>
>  include/linux/ipc_namespace.h |  2 ++
>  ipc/namespace.c               | 20 ++++++++++++++++++--
>  2 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
> index c309f43bde45..a06a78c67f19 100644
> --- a/include/linux/ipc_namespace.h
> +++ b/include/linux/ipc_namespace.h
> @@ -68,6 +68,8 @@ struct ipc_namespace {
>  	struct user_namespace *user_ns;
>  	struct ucounts *ucounts;
>  
> +	struct llist_node mnt_llist;
> +
>  	struct ns_common ns;
>  } __randomize_layout;
>  
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index b3ca1476ca51..7b9922244891 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
>  
>  static void free_ipc_ns(struct ipc_namespace *ns)
>  {
> +	/* mq_put_mnt() waits for a grace period as kern_unmount()
> +	 * uses synchronize_rcu().
> +	 */
> +	mq_put_mnt(ns);
>  	sem_exit_ns(ns);
>  	msg_exit_ns(ns);
>  	shm_exit_ns(ns);
> @@ -127,6 +131,17 @@ static void free_ipc_ns(struct ipc_namespace *ns)
>  	kfree(ns);
>  }
>  
> +static LLIST_HEAD(free_ipc_list);
> +static void free_ipc(struct work_struct *unused)
> +{
> +	struct llist_node *node = llist_del_all(&free_ipc_list);
> +	struct ipc_namespace *n, *t;
> +
> +	llist_for_each_entry_safe(n, t, node, mnt_llist)
> +		free_ipc_ns(n);
> +}
> +static DECLARE_WORK(free_ipc_work, free_ipc);
> +
>  /*
>   * put_ipc_ns - drop a reference to an ipc namespace.
>   * @ns: the namespace to put
> @@ -148,8 +163,9 @@ void put_ipc_ns(struct ipc_namespace *ns)
>  	if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
>  		mq_clear_sbinfo(ns);
>  		spin_unlock(&mq_lock);
> -		mq_put_mnt(ns);
> -		free_ipc_ns(ns);
> +
> +		if (llist_add(&ns->mnt_llist, &free_ipc_list))
> +			schedule_work(&free_ipc_work);
>  	}
>  }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] ipc: use a work queue to free_ipc
  2020-02-21 19:37 ` Eric W. Biederman
@ 2020-02-23 19:01   ` Giuseppe Scrivano
  2020-02-24 16:10     ` Eric W. Biederman
  0 siblings, 1 reply; 5+ messages in thread
From: Giuseppe Scrivano @ 2020-02-23 19:01 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: linux-kernel, rcu, paulmck, viro

ebiederm@xmission.com (Eric W. Biederman) writes:

> Giuseppe Scrivano <gscrivan@redhat.com> writes:
>
>> it avoids blocking on synchronize_rcu() in kern_umount().
>>
>> the code:
>>
>> \#define _GNU_SOURCE
>> \#include <sched.h>
>> \#include <error.h>
>> \#include <errno.h>
>> \#include <stdlib.h>
>> int main()
>> {
>>   int i;
>>   for (i  = 0; i < 1000; i++)
>>     if (unshare (CLONE_NEWIPC) < 0)
>>       error (EXIT_FAILURE, errno, "unshare");
>> }
>>
>> gets from:
>>
>> 	Command being timed: "./ipc-namespace"
>> 	User time (seconds): 0.00
>> 	System time (seconds): 0.06
>> 	Percent of CPU this job got: 0%
>> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05
>>
>> to:
>>
>> 	Command being timed: "./ipc-namespace"
>> 	User time (seconds): 0.00
>> 	System time (seconds): 0.02
>> 	Percent of CPU this job got: 96%
>> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03
>
> I have a question.  You create 1000 namespaces in a single process
> and then free them.  So I expect that single process is busy waiting
> for that kern_umount 1000 times, and waiting for 1000 synchronize_rcu's.
>
> Does this ever show up in a real world work-load?
>
> Is the cost of a single synchronize_rcu a problem?

yes exactly, creating 1000 namespaces is not a real world use case (at
least in my experience) but I've used it only to show the impact of the
patch.

The cost of the single synchronize_rcu is the issue.

Most containers run in their own IPC namespace, so this is a constant
cost for each container.


> The code you are working to avoid is this.
>
> void kern_unmount(struct vfsmount *mnt)
> {
> 	/* release long term mount so mount point can be released */
> 	if (!IS_ERR_OR_NULL(mnt)) {
> 		real_mount(mnt)->mnt_ns = NULL;
> 		synchronize_rcu();	/* yecchhh... */
> 		mntput(mnt);
> 	}
> }
>
> Which makes me wonder if perhaps there might be a simpler solution
> involving just that code.  But I do realize such a solution
> would require analyzing all of the code after kern_unmount
> to see if any of it depends upon the synchronize_rcu.
>
>
> In summary, I see no correctness problems with your code.
> Code that runs faster is always nice.  In this case I just
> see the cost being shifted somewhere else not eliminated.
> I also see a slight increase in complexity.
>
> So I am wondering if this was an exercise to speed up a toy
> benchmark or if this is an effort to speed of real world code.

I've seen the issue while profiling real world work loads.


> At the very least some version of the motivation needs to be
> recorded so that the next time someone comes in and reworks
> the code they can look in the history and figure out what
> they need to do to avoid introducing a regression.

Is it enough in the git commit message or should it be an inline
comment?

Thanks,
Giuseppe


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] ipc: use a work queue to free_ipc
  2020-02-23 19:01   ` Giuseppe Scrivano
@ 2020-02-24 16:10     ` Eric W. Biederman
  0 siblings, 0 replies; 5+ messages in thread
From: Eric W. Biederman @ 2020-02-24 16:10 UTC (permalink / raw)
  To: Giuseppe Scrivano; +Cc: linux-kernel, rcu, paulmck, viro

Giuseppe Scrivano <gscrivan@redhat.com> writes:

> ebiederm@xmission.com (Eric W. Biederman) writes:
>
>> Giuseppe Scrivano <gscrivan@redhat.com> writes:
>>
>>> it avoids blocking on synchronize_rcu() in kern_umount().
>>>
>>> the code:
>>>
>>> \#define _GNU_SOURCE
>>> \#include <sched.h>
>>> \#include <error.h>
>>> \#include <errno.h>
>>> \#include <stdlib.h>
>>> int main()
>>> {
>>>   int i;
>>>   for (i  = 0; i < 1000; i++)
>>>     if (unshare (CLONE_NEWIPC) < 0)
>>>       error (EXIT_FAILURE, errno, "unshare");
>>> }
>>>
>>> gets from:
>>>
>>> 	Command being timed: "./ipc-namespace"
>>> 	User time (seconds): 0.00
>>> 	System time (seconds): 0.06
>>> 	Percent of CPU this job got: 0%
>>> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05
>>>
>>> to:
>>>
>>> 	Command being timed: "./ipc-namespace"
>>> 	User time (seconds): 0.00
>>> 	System time (seconds): 0.02
>>> 	Percent of CPU this job got: 96%
>>> 	Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03
>>
>> I have a question.  You create 1000 namespaces in a single process
>> and then free them.  So I expect that single process is busy waiting
>> for that kern_umount 1000 times, and waiting for 1000 synchronize_rcu's.
>>
>> Does this ever show up in a real world work-load?
>>
>> Is the cost of a single synchronize_rcu a problem?
>
> yes exactly, creating 1000 namespaces is not a real world use case (at
> least in my experience) but I've used it only to show the impact of the
> patch.

I know running 1000 containers is a real use case, and I would not be
surprised if there are configurations that go higher.

> The cost of the single synchronize_rcu is the issue.
>
> Most containers run in their own IPC namespace, so this is a constant
> cost for each container.

Agreed.

>> The code you are working to avoid is this.
>>
>> void kern_unmount(struct vfsmount *mnt)
>> {
>> 	/* release long term mount so mount point can be released */
>> 	if (!IS_ERR_OR_NULL(mnt)) {
>> 		real_mount(mnt)->mnt_ns = NULL;
>> 		synchronize_rcu();	/* yecchhh... */
>> 		mntput(mnt);
>> 	}
>> }
>>
>> Which makes me wonder if perhaps there might be a simpler solution
>> involving just that code.  But I do realize such a solution
>> would require analyzing all of the code after kern_unmount
>> to see if any of it depends upon the synchronize_rcu.
>>
>>
>> In summary, I see no correctness problems with your code.
>> Code that runs faster is always nice.  In this case I just
>> see the cost being shifted somewhere else not eliminated.
>> I also see a slight increase in complexity.
>>
>> So I am wondering if this was an exercise to speed up a toy
>> benchmark or if this is an effort to speed of real world code.
>
> I've seen the issue while profiling real world work loads.

So the question is how to remove this delay.

>> At the very least some version of the motivation needs to be
>> recorded so that the next time someone comes in and reworks
>> the code they can look in the history and figure out what
>> they need to do to avoid introducing a regression.
>
> Is it enough in the git commit message or should it be an inline
> comment?

The git commit message should be enough to record the motivation.

A comment in the code that about the work queue that says something
like "used to avoid the cost of synchronize_rcu in kern_unmount" would
also be nice.

Eric

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-02-24 16:12 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-17 18:36 [PATCH v2] ipc: use a work queue to free_ipc Giuseppe Scrivano
2020-02-17 18:56 ` Paul E. McKenney
2020-02-21 19:37 ` Eric W. Biederman
2020-02-23 19:01   ` Giuseppe Scrivano
2020-02-24 16:10     ` Eric W. Biederman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).