linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock.
@ 2022-05-16 18:28 Hao Luo
  2022-05-16 18:41 ` Tejun Heo
  0 siblings, 1 reply; 5+ messages in thread
From: Hao Luo @ 2022-05-16 18:28 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Tejun Heo; +Cc: linux-kernel, Hao Luo

Previously, kernfs_pr_cont_buf was protected by piggybacking on
rename_lock, which means that pr_cont() had to be called with
rename_lock held. This can cause circular lock dependencies.

If there is an OOM, we have the following call hierarchy:

 -> cpuset_print_current_mems_allowed()
   -> pr_cont_cgroup_name()
     -> pr_cont_kernfs_name()

pr_cont_kernfs_name() will grab rename_lock and call printk. So we have
the following lock dependencies:

 kernfs_rename_lock -> console_sem

Sometimes printk does a wakeup before releasing console_sem, which
gives the dependency chain:

 console_sem -> p->pi_lock -> rq->lock

Now, imagine one wants to read cgroup_name under rq->lock, for example,
printing cgroup_name in a tracepoint in the scheduler code. They will
be holding rq->lock and take rename_lock:

 rq->lock -> kernfs_rename_lock

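Together with the two chains above, this closes the cycle
kernfs_rename_lock -> console_sem -> p->pi_lock -> rq->lock ->
kernfs_rename_lock, so they can deadlock.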

One way to prevent this circular lock dependency is to separate the
protection of pr_cont_buf from rename_lock. In principle, rename_lock
only protects the integrity of the cgroup name while it is copied into
buf. Once pr_cont_buf has its content, rename_lock can be dropped. So
it's safe to drop rename_lock after kernfs_name_locked() (and
kernfs_path_from_node_locked()) and rely on a dedicated pr_cont_lock
to protect pr_cont_buf.

Signed-off-by: Hao Luo <haoluo@google.com>
---
 fs/kernfs/dir.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e205fde7163a..966d24562f0f 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -18,7 +18,8 @@
 #include "kernfs-internal.h"
 
 static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by pr_cont_lock */
 static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
 
 #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -229,12 +230,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
 
-	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+	kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
 	pr_cont("%s", kernfs_pr_cont_buf);
 
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
 }
 
 /**
@@ -248,10 +249,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
 	unsigned long flags;
 	int sz;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
 
-	sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
-					  sizeof(kernfs_pr_cont_buf));
+	sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+				   sizeof(kernfs_pr_cont_buf));
 	if (sz < 0) {
 		pr_cont("(error)");
 		goto out;
@@ -265,7 +266,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
 	pr_cont("%s", kernfs_pr_cont_buf);
 
 out:
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
 }
 
 /**
@@ -823,13 +824,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
 
 	lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
 
-	/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
-	spin_lock_irq(&kernfs_rename_lock);
+	spin_lock_irq(&kernfs_pr_cont_lock);
 
 	len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
 
 	if (len >= sizeof(kernfs_pr_cont_buf)) {
-		spin_unlock_irq(&kernfs_rename_lock);
+		spin_unlock_irq(&kernfs_pr_cont_lock);
 		return NULL;
 	}
 
@@ -841,7 +841,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
 		parent = kernfs_find_ns(parent, name, ns);
 	}
 
-	spin_unlock_irq(&kernfs_rename_lock);
+	spin_unlock_irq(&kernfs_pr_cont_lock);
 
 	return parent;
 }
-- 
2.36.1.124.g0e6072fb45-goog


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock.
  2022-05-16 18:28 [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock Hao Luo
@ 2022-05-16 18:41 ` Tejun Heo
  2022-05-16 19:08   ` Hao Luo
  0 siblings, 1 reply; 5+ messages in thread
From: Tejun Heo @ 2022-05-16 18:41 UTC (permalink / raw)
  To: Hao Luo; +Cc: Greg Kroah-Hartman, linux-kernel

On Mon, May 16, 2022 at 11:28:59AM -0700, Hao Luo wrote:
> Previously the protection of kernfs_pr_cont_buf was piggy backed by
> rename_lock, which means that pr_cont() needs to be protected under
> rename_lock. This can cause potential circular lock dependencies.
> 
> If there is an OOM, we have the following call hierarchy:
> 
>  -> cpuset_print_current_mems_allowed()
>    -> pr_cont_cgroup_name()
>      -> pr_cont_kernfs_name()
> 
> pr_cont_kernfs_name() will grab rename_lock and call printk. So we have
> the following lock dependencies:
> 
>  kernfs_rename_lock -> console_sem
> 
> Sometimes, printk does a wakeup before releasing console_sem, which has
> the dependence chain:
> 
>  console_sem -> p->pi_lock -> rq->lock
> 
> Now, imagine one wants to read cgroup_name under rq->lock, for example,
> printing cgroup_name in a tracepoint in the scheduler code. They will
> be holding rq->lock and take rename_lock:
> 
>  rq->lock -> kernfs_rename_lock
> 
> Now they will deadlock.
> 
> A prevention to this circular lock dependency is to separate the
> protection of pr_cont_buf from rename_lock. In principle, rename_lock
> is to protect the integrity of cgroup name when copying to buf. Once
> pr_cont_buf has got its content, rename_lock can be dropped. So it's
> safe to drop rename_lock after kernfs_name_locked (and
> kernfs_path_from_node_locked) and rely on a dedicated pr_cont_lock
> to protect pr_cont_buf.
> 
> Signed-off-by: Hao Luo <haoluo@google.com>

Can you please add a comment explaining why the lock is separate? Other than
that:

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock.
  2022-05-16 18:41 ` Tejun Heo
@ 2022-05-16 19:08   ` Hao Luo
  0 siblings, 0 replies; 5+ messages in thread
From: Hao Luo @ 2022-05-16 19:08 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Greg Kroah-Hartman, linux-kernel

On Mon, May 16, 2022 at 11:41 AM Tejun Heo <tj@kernel.org> wrote:
>
> Can you please add a comment explaining why the lock is separate? Other than
> that:
>
> Acked-by: Tejun Heo <tj@kernel.org>
>
> Thanks.

Sure, will do and resend.

>
> --
> tejun

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock.
  2022-05-16 19:09 Hao Luo
@ 2022-05-16 19:34 ` Greg Kroah-Hartman
  0 siblings, 0 replies; 5+ messages in thread
From: Greg Kroah-Hartman @ 2022-05-16 19:34 UTC (permalink / raw)
  To: Hao Luo; +Cc: Tejun Heo, linux-kernel

On Mon, May 16, 2022 at 12:09:51PM -0700, Hao Luo wrote:
> Previously the protection of kernfs_pr_cont_buf was piggy backed by
> rename_lock, which means that pr_cont() needs to be protected under
> rename_lock. This can cause potential circular lock dependencies.
> 
> If there is an OOM, we have the following call hierarchy:
> 
>  -> cpuset_print_current_mems_allowed()
>    -> pr_cont_cgroup_name()
>      -> pr_cont_kernfs_name()
> 
> pr_cont_kernfs_name() will grab rename_lock and call printk. So we have
> the following lock dependencies:
> 
>  kernfs_rename_lock -> console_sem
> 
> Sometimes, printk does a wakeup before releasing console_sem, which has
> the dependence chain:
> 
>  console_sem -> p->pi_lock -> rq->lock
> 
> Now, imagine one wants to read cgroup_name under rq->lock, for example,
> printing cgroup_name in a tracepoint in the scheduler code. They will
> be holding rq->lock and take rename_lock:
> 
>  rq->lock -> kernfs_rename_lock
> 
> Now they will deadlock.
> 
> A prevention to this circular lock dependency is to separate the
> protection of pr_cont_buf from rename_lock. In principle, rename_lock
> is to protect the integrity of cgroup name when copying to buf. Once
> pr_cont_buf has got its content, rename_lock can be dropped. So it's
> safe to drop rename_lock after kernfs_name_locked (and
> kernfs_path_from_node_locked) and rely on a dedicated pr_cont_lock
> to protect pr_cont_buf.
> 
> Acked-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Hao Luo <haoluo@google.com>
> ---
>  fs/kernfs/dir.c | 31 +++++++++++++++++++------------
>  1 file changed, 19 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index e205fde7163a..6eca72cfa1f2 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -18,7 +18,15 @@
>  #include "kernfs-internal.h"
>  
>  static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
> -static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
> +/*
> + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
> + * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
> + * will perform wakeups when releasing console_sem. Holding rename_lock
> + * will introduce deadlock if the scheduler reads the kernfs_name in the
> + * wakeup path.
> + */
> +static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
> +static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by pr_cont_lock */
>  static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
>  
>  #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
> @@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
>  {
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&kernfs_rename_lock, flags);
> +	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
>  
> -	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
> +	kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
>  	pr_cont("%s", kernfs_pr_cont_buf);
>  
> -	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
> +	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
>  }
>  
>  /**
> @@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
>  	unsigned long flags;
>  	int sz;
>  
> -	spin_lock_irqsave(&kernfs_rename_lock, flags);
> +	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
>  
> -	sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
> -					  sizeof(kernfs_pr_cont_buf));
> +	sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
> +				   sizeof(kernfs_pr_cont_buf));
>  	if (sz < 0) {
>  		pr_cont("(error)");
>  		goto out;
> @@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
>  	pr_cont("%s", kernfs_pr_cont_buf);
>  
>  out:
> -	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
> +	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
>  }
>  
>  /**
> @@ -823,13 +831,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
>  
>  	lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
>  
> -	/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
> -	spin_lock_irq(&kernfs_rename_lock);
> +	spin_lock_irq(&kernfs_pr_cont_lock);
>  
>  	len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
>  
>  	if (len >= sizeof(kernfs_pr_cont_buf)) {
> -		spin_unlock_irq(&kernfs_rename_lock);
> +		spin_unlock_irq(&kernfs_pr_cont_lock);
>  		return NULL;
>  	}
>  
> @@ -841,7 +848,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
>  		parent = kernfs_find_ns(parent, name, ns);
>  	}
>  
> -	spin_unlock_irq(&kernfs_rename_lock);
> +	spin_unlock_irq(&kernfs_pr_cont_lock);
>  
>  	return parent;
>  }
> -- 
> 2.36.1.124.g0e6072fb45-goog
> 

Hi,

This is the friendly patch-bot of Greg Kroah-Hartman.  You have sent him
a patch that has triggered this response.  He used to manually respond
to these common problems, but in order to save his sanity (he kept
writing the same thing over and over, yet to different people), I was
created.  Hopefully you will not take offence and will fix the problem
in your patch and resubmit it so that it can be accepted into the Linux
kernel tree.

You are receiving this message because of the following common error(s)
as indicated below:

- This looks like a new version of a previously submitted patch, but you
  did not list below the --- line any changes from the previous version.
  Please read the section entitled "The canonical patch format" in the
  kernel file, Documentation/SubmittingPatches for what needs to be done
  here to properly describe this.

If you wish to discuss this problem further, or you have questions about
how to resolve this issue, please feel free to respond to this email and
Greg will reply once he has dug out from the pending patches received
from other developers.

thanks,

greg k-h's patch email bot

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock.
@ 2022-05-16 19:09 Hao Luo
  2022-05-16 19:34 ` Greg Kroah-Hartman
  0 siblings, 1 reply; 5+ messages in thread
From: Hao Luo @ 2022-05-16 19:09 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Tejun Heo; +Cc: linux-kernel, Hao Luo

Previously, kernfs_pr_cont_buf was protected by piggybacking on
rename_lock, which means that pr_cont() had to be called with
rename_lock held. This can cause circular lock dependencies.

If there is an OOM, we have the following call hierarchy:

 -> cpuset_print_current_mems_allowed()
   -> pr_cont_cgroup_name()
     -> pr_cont_kernfs_name()

pr_cont_kernfs_name() will grab rename_lock and call printk. So we have
the following lock dependencies:

 kernfs_rename_lock -> console_sem

Sometimes printk does a wakeup before releasing console_sem, which
gives the dependency chain:

 console_sem -> p->pi_lock -> rq->lock

Now, imagine one wants to read cgroup_name under rq->lock, for example,
printing cgroup_name in a tracepoint in the scheduler code. They will
be holding rq->lock and take rename_lock:

 rq->lock -> kernfs_rename_lock

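Together with the two chains above, this closes the cycle
kernfs_rename_lock -> console_sem -> p->pi_lock -> rq->lock ->
kernfs_rename_lock, so they can deadlock.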

One way to prevent this circular lock dependency is to separate the
protection of pr_cont_buf from rename_lock. In principle, rename_lock
only protects the integrity of the cgroup name while it is copied into
buf. Once pr_cont_buf has its content, rename_lock can be dropped. So
it's safe to drop rename_lock after kernfs_name_locked() (and
kernfs_path_from_node_locked()) and rely on a dedicated pr_cont_lock
to protect pr_cont_buf.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Hao Luo <haoluo@google.com>
---
 fs/kernfs/dir.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e205fde7163a..6eca72cfa1f2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -18,7 +18,15 @@
 #include "kernfs-internal.h"
 
 static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
+/*
+ * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
+ * call pr_cont() while holding rename_lock, because pr_cont() sometimes
+ * performs wakeups when releasing console_sem. Holding rename_lock would
+ * then introduce a deadlock if the scheduler reads the kernfs_name in the
+ * wakeup path.
+ */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by pr_cont_lock */
 static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
 
 #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -229,12 +237,12 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
 
-	kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+	kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
 	pr_cont("%s", kernfs_pr_cont_buf);
 
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
 }
 
 /**
@@ -248,10 +256,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
 	unsigned long flags;
 	int sz;
 
-	spin_lock_irqsave(&kernfs_rename_lock, flags);
+	spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
 
-	sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
-					  sizeof(kernfs_pr_cont_buf));
+	sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+				   sizeof(kernfs_pr_cont_buf));
 	if (sz < 0) {
 		pr_cont("(error)");
 		goto out;
@@ -265,7 +273,7 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
 	pr_cont("%s", kernfs_pr_cont_buf);
 
 out:
-	spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+	spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
 }
 
 /**
@@ -823,13 +831,12 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
 
 	lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
 
-	/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
-	spin_lock_irq(&kernfs_rename_lock);
+	spin_lock_irq(&kernfs_pr_cont_lock);
 
 	len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
 
 	if (len >= sizeof(kernfs_pr_cont_buf)) {
-		spin_unlock_irq(&kernfs_rename_lock);
+		spin_unlock_irq(&kernfs_pr_cont_lock);
 		return NULL;
 	}
 
@@ -841,7 +848,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
 		parent = kernfs_find_ns(parent, name, ns);
 	}
 
-	spin_unlock_irq(&kernfs_rename_lock);
+	spin_unlock_irq(&kernfs_pr_cont_lock);
 
 	return parent;
 }
-- 
2.36.1.124.g0e6072fb45-goog


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2022-05-16 19:34 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-16 18:28 [PATCH] kernfs: Separate kernfs_pr_cont_buf and rename_lock Hao Luo
2022-05-16 18:41 ` Tejun Heo
2022-05-16 19:08   ` Hao Luo
2022-05-16 19:09 Hao Luo
2022-05-16 19:34 ` Greg Kroah-Hartman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).