* [PATCH v2] mm: mmap_lock: fix use-after-free race and css ref leak in tracepoints
@ 2020-12-01 20:32 Axel Rasmussen
  2020-12-01 21:28 ` Steven Rostedt
From: Axel Rasmussen @ 2020-12-01 20:32 UTC (permalink / raw)
  To: Andrew Morton, Chinwen Chang, Daniel Jordan, David Rientjes,
	Davidlohr Bueso, Ingo Molnar, Jann Horn, Laurent Dufour,
	Michel Lespinasse, Stephen Rothwell, Steven Rostedt,
	Vlastimil Babka
  Cc: Yafang Shao, davem, dsahern, gregkh, kuba, liuhangbin, tj,
	linux-kernel, linux-mm, Axel Rasmussen

syzbot reported[1] a use-after-free introduced in 0f818c4bc1f3. The bug
is that an ongoing trace event might race with the tracepoint being
disabled (and therefore the _unreg() callback being called). Consider
this ordering:

T1: trace event fires, get_mm_memcg_path() is called
T1: get_memcg_path_buf() returns a buffer pointer
T2: trace_mmap_lock_unreg() is called, buffers are freed
T1: cgroup_path() is called with the now-freed buffer

The solution in this commit is to switch from the spinlock to a mutex plus
RCU. With the RCU API we can first stop new buffers from being handed out,
then wait for existing users to finish, and *then* free the buffers. (A
mutex rather than a spinlock is needed because synchronize_rcu() can sleep.)
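
In outline, the teardown follows this pattern (a simplified sketch of the
idea, where "buf" stands in for one per-cpu memcg_path_buf slot; it is not
the literal hunk below):

  mutex_lock(&reg_lock);
  old = rcu_dereference_protected(buf, lockdep_is_held(&reg_lock));
  rcu_assign_pointer(buf, NULL);   /* 1. stop handing out the buffer      */
  synchronize_rcu();               /* 2. wait for in-flight users to exit */
  kfree(old);                      /* 3. only now is freeing safe         */
  mutex_unlock(&reg_lock);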

I have a simple reproducer program which spins up two pools of threads,
doing the following in a tight loop:

  Pool 1:
  mmap(NULL, 4096, PROT_READ | PROT_WRITE,
       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
  munmap()

  Pool 2:
  echo 1 > /sys/kernel/debug/tracing/events/mmap_lock/enable
  echo 0 > /sys/kernel/debug/tracing/events/mmap_lock/enable

This triggers the use-after-free very quickly. With this patch applied, I
let it run for an hour without hitting any BUGs.
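
A minimal sketch of such a reproducer (not the original program; the pool
sizes and the lack of error handling here are simplifications):

  #include <fcntl.h>
  #include <pthread.h>
  #include <sys/mman.h>
  #include <unistd.h>

  static void *map_thread(void *arg)
  {
  	for (;;) {
  		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
  			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  		if (p != MAP_FAILED)
  			munmap(p, 4096);
  	}
  	return NULL;
  }

  static void *toggle_thread(void *arg)
  {
  	int fd = open("/sys/kernel/debug/tracing/events/mmap_lock/enable",
  		      O_WRONLY);

  	for (;;) {
  		write(fd, "1", 1);
  		write(fd, "0", 1);
  	}
  	return NULL;
  }

  int main(void)
  {
  	pthread_t t;
  	int i;

  	for (i = 0; i < 4; i++)	/* pool sizes are arbitrary */
  		pthread_create(&t, NULL, map_thread, NULL);
  	for (i = 0; i < 4; i++)
  		pthread_create(&t, NULL, toggle_thread, NULL);
  	pause();
  	return 0;
  }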

While fixing this, I also noticed and fixed a css ref leak. Previously
we called get_mem_cgroup_from_mm(), but we never called css_put() to
release that reference. get_mm_memcg_path() now does this properly.
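
In other words, the pairing is simply (sketch):

  struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); /* takes a css ref */

  /* ... read memcg->css.cgroup ... */

  css_put(&memcg->css);                                  /* drops it again  */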

[1]: https://syzkaller.appspot.com/bug?extid=19e6dd9943972fa1c58a

Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
 mm/mmap_lock.c | 104 +++++++++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 38 deletions(-)

diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 12af8f1b8a14..5a3349bf1501 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -6,9 +6,10 @@
 #include <linux/cgroup.h>
 #include <linux/memcontrol.h>
 #include <linux/mmap_lock.h>
+#include <linux/mutex.h>
 #include <linux/percpu.h>
+#include <linux/rcupdate.h>
 #include <linux/smp.h>
-#include <linux/spinlock.h>
 #include <linux/trace_events.h>
 
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
@@ -23,8 +24,8 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
  * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
  * been made.
  */
-static DEFINE_SPINLOCK(reg_lock);
-static int reg_refcount;
+static DEFINE_MUTEX(reg_lock);
+static int reg_refcount; /* Protected by reg_lock. */
 
 /*
  * Size of the buffer for memcg path names. Ignoring stack trace support,
@@ -38,99 +39,126 @@ static int reg_refcount;
  */
 #define CONTEXT_COUNT 4
 
-DEFINE_PER_CPU(char *, memcg_path_buf);
-DEFINE_PER_CPU(int, memcg_path_buf_idx);
+static DEFINE_PER_CPU(char __rcu *, memcg_path_buf);
+static DEFINE_PER_CPU(int, memcg_path_buf_idx);
+
+/* Called with reg_lock held. */
+static void free_memcg_path_bufs(void)
+{
+	int cpu;
+	char *old;
+
+	for_each_possible_cpu(cpu) {
+		old = rcu_dereference_protected(per_cpu(memcg_path_buf, cpu),
+						lockdep_is_held(&reg_lock));
+		if (old == NULL)
+			break;
+		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
+		/* Wait for inflight memcg_path_buf users to finish. */
+		synchronize_rcu();
+		kfree(old);
+	}
+}
 
 int trace_mmap_lock_reg(void)
 {
-	unsigned long flags;
 	int cpu;
+	char *new;
 
-	spin_lock_irqsave(&reg_lock, flags);
+	mutex_lock(&reg_lock);
 
+	/* If the refcount is going 0->1, proceed with allocating buffers. */
 	if (reg_refcount++)
 		goto out;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(memcg_path_buf, cpu) = NULL;
-	}
-	for_each_possible_cpu(cpu) {
-		per_cpu(memcg_path_buf, cpu) = kmalloc(
-			MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_NOWAIT);
-		if (per_cpu(memcg_path_buf, cpu) == NULL)
+		new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
+		if (new == NULL)
 			goto out_fail;
-		per_cpu(memcg_path_buf_idx, cpu) = 0;
+		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new);
+		/* Don't need to wait for inflights, they'd have gotten NULL. */
 	}
 
 out:
-	spin_unlock_irqrestore(&reg_lock, flags);
+	mutex_unlock(&reg_lock);
 	return 0;
 
 out_fail:
-	for_each_possible_cpu(cpu) {
-		if (per_cpu(memcg_path_buf, cpu) != NULL)
-			kfree(per_cpu(memcg_path_buf, cpu));
-		else
-			break;
-	}
+	free_memcg_path_bufs();
 
+	/* Since we failed, undo the earlier ref increment. */
 	--reg_refcount;
 
-	spin_unlock_irqrestore(&reg_lock, flags);
+	mutex_unlock(&reg_lock);
 	return -ENOMEM;
 }
 
 void trace_mmap_lock_unreg(void)
 {
-	unsigned long flags;
-	int cpu;
-
-	spin_lock_irqsave(&reg_lock, flags);
+	mutex_lock(&reg_lock);
 
+	/* If the refcount is going 1->0, proceed with freeing buffers. */
 	if (--reg_refcount)
 		goto out;
 
-	for_each_possible_cpu(cpu) {
-		kfree(per_cpu(memcg_path_buf, cpu));
-	}
+	free_memcg_path_bufs();
 
 out:
-	spin_unlock_irqrestore(&reg_lock, flags);
+	mutex_unlock(&reg_lock);
 }
 
 static inline char *get_memcg_path_buf(void)
 {
+	char *buf;
 	int idx;
 
+	rcu_read_lock();
+	buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf));
+	if (buf == NULL)
+		return NULL;
 	idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) -
 	      MEMCG_PATH_BUF_SIZE;
-	return &this_cpu_read(memcg_path_buf)[idx];
+	return &buf[idx];
 }
 
 static inline void put_memcg_path_buf(void)
 {
 	this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE);
+	rcu_read_unlock();
 }
 
 /*
  * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, NULL is returned.
+ * pointer to it. If the path cannot be determined, or no buffer was available
+ * (because the trace event is being unregistered), NULL is returned.
  *
  * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
  * disabled by the caller before calling us, and re-enabled only after the
  * caller is done with the pointer.
+ *
+ * The caller must call put_memcg_path_buf() once the buffer is no longer
+ * needed. This must be done while preemption is still disabled.
  */
 static const char *get_mm_memcg_path(struct mm_struct *mm)
 {
+	char *buf = NULL;
 	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
 
-	if (memcg != NULL && likely(memcg->css.cgroup != NULL)) {
-		char *buf = get_memcg_path_buf();
+	if (memcg == NULL)
+		goto out;
+	if (unlikely(memcg->css.cgroup == NULL))
+		goto out_put;
 
-		cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
-		return buf;
-	}
-	return NULL;
+	buf = get_memcg_path_buf();
+	if (buf == NULL)
+		goto out_put;
+
+	cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+
+out_put:
+	css_put(&memcg->css);
+out:
+	return buf;
 }
 
 #define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
-- 
2.29.2.454.gaff20da3a2-goog



* Re: [PATCH v2] mm: mmap_lock: fix use-after-free race and css ref leak in tracepoints
  2020-12-01 20:32 [PATCH v2] mm: mmap_lock: fix use-after-free race and css ref leak in tracepoints Axel Rasmussen
@ 2020-12-01 21:28 ` Steven Rostedt
  2020-12-01 21:31   ` Steven Rostedt
From: Steven Rostedt @ 2020-12-01 21:28 UTC (permalink / raw)
  To: Axel Rasmussen
  Cc: Andrew Morton, Chinwen Chang, Daniel Jordan, David Rientjes,
	Davidlohr Bueso, Ingo Molnar, Jann Horn, Laurent Dufour,
	Michel Lespinasse, Stephen Rothwell, Vlastimil Babka,
	Yafang Shao, davem, dsahern, gregkh, kuba, liuhangbin, tj,
	linux-kernel, linux-mm

On Tue,  1 Dec 2020 12:32:49 -0800
Axel Rasmussen <axelrasmussen@google.com> wrote:

> +/* Called with reg_lock held. */

The above comment is redundant, as the lockdep_is_held() below also
suggests that it is ;-)

> +static void free_memcg_path_bufs(void)
> +{
> +	int cpu;
> +	char *old;
> +
> +	for_each_possible_cpu(cpu) {
> +		old = rcu_dereference_protected(per_cpu(memcg_path_buf, cpu),
> +						lockdep_is_held(&reg_lock));
> +		if (old == NULL)
> +			break;

Hmm, what if the system's topology has gaps in the possible CPU numbers
(I believe this is possible on some systems)?

> +		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
> +		/* Wait for inflight memcg_path_buf users to finish. */
> +		synchronize_rcu();

Please break this up into two loops. You will need to have another array
that is created in trace_mmap_lock_reg() function:

static char **path_holders;

trace_mmap_lock_reg()
{
[..]
	path_holders = kmalloc(num_possible_cpus() * sizeof(*path_holders),
			       GFP_KERNEL);
[..]
}

Then this function can be:

static void free_memcg_path_bufs(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		path_holders[cpu] = rcu_dereference_protected(per_cpu(memcg_path_buf, cpu),
						lockdep_is_held(&reg_lock));
		rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
	}

	/* Wait for inflight memcg_path_buf users to finish. */
	synchronize_rcu();

	for_each_possible_cpu(cpu) {
		kfree(path_holders[cpu]);
	}

	kfree(path_holders);
	path_holders = NULL;
}

Otherwise, if you have a machine with 128 possible CPUs, doing 128
synchronize_rcu()s is going to be expensive!

> +		kfree(old);
> +	}
> +}
>  


>  static inline char *get_memcg_path_buf(void)
>  {
> +	char *buf;
>  	int idx;
>  
> +	rcu_read_lock();

The caller of get_mm_memcg_path() has preemption disabled, which is also
now an RCU lock. So the rcu_read_lock() is somewhat redundant.
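
For illustration, a sketch of that point (not code from the patch;
rcu_dereference_sched() is the accessor one would reach for when relying
solely on preemption being disabled):

/*
 * With preemption disabled, this region already counts as an RCU
 * read-side critical section, so the synchronize_rcu() in the unreg
 * path waits for it even without an explicit rcu_read_lock() pair.
 */
preempt_disable();
buf = rcu_dereference_sched(*this_cpu_ptr(&memcg_path_buf));
/* ... use buf ... */
preempt_enable();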

Oh, and looking at the original patch:

+                                      memcg_path != NULL ? memcg_path : "",   \

The above could be shortened to:

					memcg_path ? : "",

gcc has a trick with "? :": if there's nothing in between the "?" and the
":", it will use the tested expression itself as the result when that
expression is non-zero / non-NULL.
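
For instance (illustration only, with "path" as a throwaway local):

const char *path;

path = memcg_path != NULL ? memcg_path : "";	/* long form */
path = memcg_path ? : "";			/* equivalent; memcg_path is
						   evaluated only once */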

-- Steve

> +	buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf));
> +	if (buf == NULL)
> +		return NULL;
>  	idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) -
>  	      MEMCG_PATH_BUF_SIZE;
> -	return &this_cpu_read(memcg_path_buf)[idx];
> +	return &buf[idx];
>  }


* Re: [PATCH v2] mm: mmap_lock: fix use-after-free race and css ref leak in tracepoints
  2020-12-01 21:28 ` Steven Rostedt
@ 2020-12-01 21:31   ` Steven Rostedt
From: Steven Rostedt @ 2020-12-01 21:31 UTC (permalink / raw)
  To: Axel Rasmussen
  Cc: Andrew Morton, Chinwen Chang, Daniel Jordan, David Rientjes,
	Davidlohr Bueso, Ingo Molnar, Jann Horn, Laurent Dufour,
	Michel Lespinasse, Stephen Rothwell, Vlastimil Babka,
	Yafang Shao, davem, dsahern, gregkh, kuba, liuhangbin, tj,
	linux-kernel, linux-mm

On Tue, 1 Dec 2020 16:28:47 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue,  1 Dec 2020 12:32:49 -0800
> Axel Rasmussen <axelrasmussen@google.com> wrote:
> 
> > +/* Called with reg_lock held. */  
> 
> The above comment is redundant, as the lockdep_is_held() below also suggests
> that it is ;-)
>


> 
> >  static inline char *get_memcg_path_buf(void)
> >  {
> > +	char *buf;
> >  	int idx;
> >  
> > +	rcu_read_lock();  
> 
> The caller of get_mm_memcg_path() has preemption disabled, which is also
> now an RCU lock. So the rcu_read_lock() is somewhat redundant.


BTW, both of these review comments are just FYI. You can keep the code
comment and keep the rcu_read_lock(). I was just pointing out that they are
redundant; keeping them may make the code a bit more robust.

-- Steve

