* [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code
@ 2022-02-23 15:36 Suren Baghdasaryan
2022-02-23 15:36 ` [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation Suren Baghdasaryan
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2022-02-23 15:36 UTC (permalink / raw)
To: akpm
Cc: ccross, sumit.semwal, mhocko, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team, surenb
Avoid mixing strings and their anon_vma_name referenced pointers
by using struct anon_vma_name whenever possible. This simplifies
the code and allows easier sharing of anon_vma_name structures when
they represent the same name.
Suggested-by: Matthew Wilcox <willy@infradead.org>
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
changes in v2:
- renamed vma_anon_name to anon_vma_name, dup_vma_anon_name to
dup_anon_vma_name and free_vma_anon_name to free_anon_vma_name,
per Michal Hocko
- moved anon_vma_name_get and anon_vma_name_put close together,
per Michal Hocko
- Updated the comment in free_anon_vma_name, per Michal Hocko
- Removed extra check in anon_vma_name_alloc, per Michal Hocko
- Removed extra variable usage in madvise_vma_behavior, per Michal Hocko
- Cleaned whitespace noise in madvise_set_anon_name, per Michal Hocko
fs/proc/task_mmu.c | 6 +--
fs/userfaultfd.c | 6 +--
include/linux/mm.h | 7 ++--
include/linux/mm_inline.h | 87 ++++++++++++++++++++++++++-------------
include/linux/mm_types.h | 5 ++-
kernel/fork.c | 4 +-
kernel/sys.c | 19 +++++----
mm/madvise.c | 87 +++++++++++++--------------------------
mm/mempolicy.c | 2 +-
mm/mlock.c | 2 +-
mm/mmap.c | 12 +++---
mm/mprotect.c | 2 +-
12 files changed, 125 insertions(+), 114 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6e97ed775074..2c48b1eaaa9c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
- const char *anon_name;
+ struct anon_vma_name *anon_name;
if (!mm) {
name = "[vdso]";
@@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- anon_name = vma_anon_name(vma);
+ anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name);
+ seq_printf(m, "[anon:%s]", anon_name->name);
}
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e26b10132d47..8e03b3d3f5fa 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev)
vma = prev;
else
@@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 213cc569b192..5744a3fc4716 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
+ struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
@@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name);
+ unsigned long len_in,
+ struct anon_vma_name *anon_name);
#else
static inline int
madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name) {
+ unsigned long len_in, struct anon_vma_name *anon_name) {
return 0;
}
#endif
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index b725839dfe71..4bad32507570 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -140,50 +140,81 @@ static __always_inline void del_page_from_lru_list(struct page *page,
#ifdef CONFIG_ANON_VMA_NAME
/*
- * mmap_lock should be read-locked when calling vma_anon_name() and while using
- * the returned pointer.
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
*/
-extern const char *vma_anon_name(struct vm_area_struct *vma);
+extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
+extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
+extern void anon_vma_name_free(struct kref *kref);
-/*
- * mmap_lock should be read-locked for orig_vma->vm_mm.
- * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
- * isolated.
- */
-extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma);
+/* mmap_lock should be read-locked */
+static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
+{
+ if (anon_name)
+ kref_get(&anon_name->kref);
+}
-/*
- * mmap_lock should be write-locked or vma should have been isolated under
- * write-locked mmap_lock protection.
- */
-extern void free_vma_anon_name(struct vm_area_struct *vma);
+static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
+{
+ if (anon_name)
+ kref_put(&anon_name->kref, anon_vma_name_free);
+}
-/* mmap_lock should be read-locked */
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
- const char *name)
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
+{
+ struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
+
+ if (anon_name) {
+ anon_vma_name_get(anon_name);
+ new_vma->anon_name = anon_name;
+ }
+}
+
+static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
- const char *vma_name = vma_anon_name(vma);
+ /*
+ * Not using anon_vma_name because it generates a warning if vma->vm_mm
+ * is not held, which might be the case here.
+ */
+ if (!vma->vm_file)
+ anon_vma_name_put(vma->anon_name);
+}
- /* either both NULL, or pointers to same string */
- if (vma_name == name)
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+ struct anon_vma_name *anon_name2)
+{
+ if (anon_name1 == anon_name2)
return true;
- return name && vma_name && !strcmp(name, vma_name);
+ return anon_name1 && anon_name2 &&
+ !strcmp(anon_name1->name, anon_name2->name);
}
+
#else /* CONFIG_ANON_VMA_NAME */
-static inline const char *vma_anon_name(struct vm_area_struct *vma)
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
-static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma) {}
-static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
- const char *name)
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ return NULL;
+}
+
+static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
+static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma) {}
+static inline void free_anon_vma_name(struct vm_area_struct *vma) {}
+
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+ struct anon_vma_name *anon_name2)
{
return true;
}
+
#endif /* CONFIG_ANON_VMA_NAME */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5140e5feb486..0f549870da6a 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -416,7 +416,10 @@ struct vm_area_struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
- /* Serialized by mmap_sem. */
+ /*
+ * Serialized by mmap_sem. Never use directly because it is
+ * valid only when vm_file is NULL. Use anon_vma_name instead.
+ */
struct anon_vma_name *anon_name;
};
diff --git a/kernel/fork.c b/kernel/fork.c
index a024bf6254df..f1e89007f228 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
- dup_vma_anon_name(orig, new);
+ dup_anon_vma_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
- free_vma_anon_name(vma);
+ free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 97dc9e5d6bf9..5b0e172c4d47 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
@@ -2286,15 +2287,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
{
struct mm_struct *mm = current->mm;
const char __user *uname;
- char *name, *pch;
+ struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
uname = (const char __user *)arg;
if (uname) {
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ char *name, *pch;
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -2304,15 +2306,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return -EINVAL;
}
}
- } else {
- /* Reset the name */
- name = NULL;
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+
}
mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, name);
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
mmap_write_unlock(mm);
- kfree(name);
+ anon_vma_name_put(anon_name);
break;
default:
error = -EINVAL;
diff --git a/mm/madvise.c b/mm/madvise.c
index 5604064df464..081b1cded21e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,7 @@ static int madvise_need_mmap_write(int behavior)
}
#ifdef CONFIG_ANON_VMA_NAME
-static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
size_t count;
@@ -81,78 +81,49 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name)
return anon_name;
}
-static void vma_anon_name_free(struct kref *kref)
+void anon_vma_name_free(struct kref *kref)
{
struct anon_vma_name *anon_name =
container_of(kref, struct anon_vma_name, kref);
kfree(anon_name);
}
-static inline bool has_vma_anon_name(struct vm_area_struct *vma)
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- return !vma->vm_file && vma->anon_name;
-}
-
-const char *vma_anon_name(struct vm_area_struct *vma)
-{
- if (!has_vma_anon_name(vma))
- return NULL;
-
mmap_assert_locked(vma->vm_mm);
- return vma->anon_name->name;
-}
-
-void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma)
-{
- if (!has_vma_anon_name(orig_vma))
- return;
-
- kref_get(&orig_vma->anon_name->kref);
- new_vma->anon_name = orig_vma->anon_name;
-}
-
-void free_vma_anon_name(struct vm_area_struct *vma)
-{
- struct anon_vma_name *anon_name;
-
- if (!has_vma_anon_name(vma))
- return;
+ if (vma->vm_file)
+ return NULL;
- anon_name = vma->anon_name;
- vma->anon_name = NULL;
- kref_put(&anon_name->kref, vma_anon_name_free);
+ return vma->anon_name;
}
/* mmap_lock should be write-locked */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- const char *anon_name;
+ struct anon_vma_name *orig_name = anon_vma_name(vma);
- if (!name) {
- free_vma_anon_name(vma);
+ if (!anon_name) {
+ vma->anon_name = NULL;
+ anon_vma_name_put(orig_name);
return 0;
}
- anon_name = vma_anon_name(vma);
- if (anon_name) {
- /* Same name, nothing to do here */
- if (!strcmp(name, anon_name))
- return 0;
+ if (anon_vma_name_eq(orig_name, anon_name))
+ return 0;
- free_vma_anon_name(vma);
- }
- vma->anon_name = anon_vma_name_alloc(name);
- if (!vma->anon_name)
- return -ENOMEM;
+ anon_vma_name_get(anon_name);
+ vma->anon_name = anon_name;
+ anon_vma_name_put(orig_name);
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- if (name)
+ if (anon_name)
return -EINVAL;
return 0;
@@ -165,13 +136,13 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
- const char *name)
+ struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
- if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
return 0;
}
@@ -179,7 +150,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, name);
+ vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@@ -209,7 +180,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
- error = replace_vma_anon_name(vma, name);
+ error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
@@ -1041,7 +1012,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
}
error = madvise_update_vma(vma, prev, start, end, new_flags,
- vma_anon_name(vma));
+ anon_vma_name(vma));
out:
/*
@@ -1225,7 +1196,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long name)
+ unsigned long anon_name)
{
int error;
@@ -1234,7 +1205,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (const char *)name);
+ (struct anon_vma_name *)anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1246,7 +1217,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
}
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name)
+ unsigned long len_in, struct anon_vma_name *anon_name)
{
unsigned long end;
unsigned long len;
@@ -1266,7 +1237,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)name,
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 028e8dd82b44..69284d3b5e53 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f584eddd305..25934e7db3e1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51ed..ad6a1fffee91 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1031,7 +1031,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
- if (!is_same_vma_anon_name(vma, anon_name))
+ if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
return 0;
return 1;
}
@@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -3255,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5ca3fbcb1495..2887644fd150 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
--
2.35.1.473.g83b2b277ed-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation
2022-02-23 15:36 [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Suren Baghdasaryan
@ 2022-02-23 15:36 ` Suren Baghdasaryan
2022-02-24 8:42 ` Michal Hocko
2022-02-23 15:36 ` [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed Suren Baghdasaryan
2022-02-24 8:36 ` [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Michal Hocko
2 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2022-02-23 15:36 UTC (permalink / raw)
To: akpm
Cc: ccross, sumit.semwal, mhocko, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team, surenb
The anon_vma_name refcount in a deep process chain with many vmas could grow really high.
With default sysctl_max_map_count (64k) and default pid_max (32k)
the max number of vmas in the system is 2147450880 and the
refcounter has headroom of 1073774592 before it reaches
REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that
an anonymous name refcounter will overflow with these defaults.
Currently the max for pid_max is PID_MAX_LIMIT (4194304) and
for sysctl_max_map_count it's INT_MAX (2147483647). In this
configuration anon_vma_name refcount overflow becomes
theoretically possible (that still requires heavy sharing of
that anon_vma_name between processes).
The kref refcounting interface used in the anon_vma_name structure will
detect a counter overflow when it reaches REFCOUNT_SATURATED value
but will only generate a warning about broken refcounter.
To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
sharing when the refcount reaches REFCOUNT_MAX (2147483647), which
still leaves INT_MAX/2 (1073741823) values before the counter reaches
REFCOUNT_SATURATED. This should provide enough headroom for raising
the refcounts temporarily.
Suggested-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
changes in v2:
- Updated description to include calculation details, per Michal Hocko
include/linux/mm_inline.h | 18 ++++++++++++++----
mm/madvise.c | 3 +--
2 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 4bad32507570..f82085ff8a6b 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
kref_put(&anon_name->kref, anon_vma_name_free);
}
+static inline
+struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
+{
+ /* Prevent anon_name refcount saturation early on */
+ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
+ anon_vma_name_get(anon_name);
+ return anon_name;
+
+ }
+ return anon_vma_name_alloc(anon_name->name);
+}
+
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma)
{
struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
- if (anon_name) {
- anon_vma_name_get(anon_name);
- new_vma->anon_name = anon_name;
- }
+ if (anon_name)
+ new_vma->anon_name = anon_vma_name_reuse(anon_name);
}
static inline void free_anon_vma_name(struct vm_area_struct *vma)
diff --git a/mm/madvise.c b/mm/madvise.c
index 081b1cded21e..1f2693dccf7b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -113,8 +113,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
if (anon_vma_name_eq(orig_name, anon_name))
return 0;
- anon_vma_name_get(anon_name);
- vma->anon_name = anon_name;
+ vma->anon_name = anon_vma_name_reuse(anon_name);
anon_vma_name_put(orig_name);
return 0;
--
2.35.1.473.g83b2b277ed-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed
2022-02-23 15:36 [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Suren Baghdasaryan
2022-02-23 15:36 ` [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation Suren Baghdasaryan
@ 2022-02-23 15:36 ` Suren Baghdasaryan
2022-02-24 8:45 ` Michal Hocko
2022-02-24 8:36 ` [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Michal Hocko
2 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2022-02-23 15:36 UTC (permalink / raw)
To: akpm
Cc: ccross, sumit.semwal, mhocko, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team, surenb,
syzbot+aa7b3d4b35f9dc46a366
When adjacent vmas are being merged it can result in the vma that was
originally passed to madvise_update_vma being destroyed. In the current
implementation, the name parameter passed to madvise_update_vma points
directly to vma->anon_name and it is used after the call to
vma_merge. In the cases when vma_merge merges the original vma and
destroys it, this will result in a use-after-free bug as shown below:
madvise_vma_behavior(vma)
madvise_update_vma(vma, ..., anon_name == vma->anon_name)
vma_merge(vma)
__vma_adjust(vma) <-- merges vma with adjacent one
vm_area_free(vma) <-- frees the original vma
replace_vma_anon_name(anon_name) <-- UAF of vma->anon_name
Fix this by raising the name refcount and stabilizing it.
Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: syzbot+aa7b3d4b35f9dc46a366@syzkaller.appspotmail.com
---
changes in v5:
- Updated description, per Michal Hocko
mm/madvise.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f2693dccf7b..38d0f515d548 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -131,6 +131,8 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
+ * Caller should ensure anon_name stability by raising its refcount even when
+ * anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
@@ -945,6 +947,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
unsigned long behavior)
{
int error;
+ struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
@@ -1010,8 +1013,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
+ anon_name = anon_vma_name(vma);
+ anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_vma_name(vma));
+ anon_name);
+ anon_vma_name_put(anon_name);
out:
/*
--
2.35.1.473.g83b2b277ed-goog
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code
2022-02-23 15:36 [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Suren Baghdasaryan
2022-02-23 15:36 ` [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation Suren Baghdasaryan
2022-02-23 15:36 ` [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed Suren Baghdasaryan
@ 2022-02-24 8:36 ` Michal Hocko
2022-02-24 23:22 ` Suren Baghdasaryan
2 siblings, 1 reply; 7+ messages in thread
From: Michal Hocko @ 2022-02-24 8:36 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: akpm, ccross, sumit.semwal, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team
On Wed 23-02-22 07:36:11, Suren Baghdasaryan wrote:
> Avoid mixing strings and their anon_vma_name referenced pointers
> by using struct anon_vma_name whenever possible. This simplifies
> the code and allows easier sharing of anon_vma_name structures when
> they represent the same name.
>
> Suggested-by: Matthew Wilcox <willy@infradead.org>
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
LGTM
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks and one minor nit below
[...]
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> {
> - const char *vma_name = vma_anon_name(vma);
> + /*
> + * Not using anon_vma_name because it generates a warning if vma->vm_mm
> + * is not held, which might be the case here.
s@vma->vm_mm@mmap_lock@
> + */
> + if (!vma->vm_file)
> + anon_vma_name_put(vma->anon_name);
> +}
>
> - /* either both NULL, or pointers to same string */
> - if (vma_name == name)
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation
2022-02-23 15:36 ` [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation Suren Baghdasaryan
@ 2022-02-24 8:42 ` Michal Hocko
0 siblings, 0 replies; 7+ messages in thread
From: Michal Hocko @ 2022-02-24 8:42 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: akpm, ccross, sumit.semwal, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team
On Wed 23-02-22 07:36:12, Suren Baghdasaryan wrote:
> A deep process chain with many vmas could grow really high.
> With default sysctl_max_map_count (64k) and default pid_max (32k)
> the max number of vmas in the system is 2147450880 and the
> refcounter has headroom of 1073774592 before it reaches
> REFCOUNT_SATURATED (3221225472). Therefore it's unlikely that
> an anonymous name refcounter will overflow with these defaults.
> Currently the max for pid_max is PID_MAX_LIMIT (4194304) and
> for sysctl_max_map_count it's INT_MAX (2147483647). In this
> configuration anon_vma_name refcount overflow becomes
> theoretically possible (that still require heavy sharing of
> that anon_vma_name between processes).
> kref refcounting interface used in anon_vma_name structure will
> detect a counter overflow when it reaches REFCOUNT_SATURATED value
> but will only generate a warning about broken refcounter.
If I am reading the refcounter code properly the "overflow" will simply
make the ref counter frozen and the object will never be freed. A
determined attacker could leak memory like that but it would be a rather
expensive and inefficient way to do so. Still good to have it covered.
> To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
> sharing when the refcount reaches REFCOUNT_MAX (2147483647), which
> still leaves INT_MAX/2 (1073741823) values before the counter reaches
> REFCOUNT_SATURATED. This should provide enough headroom for raising
> the refcounts temporarily.
I am not sure this is the intended way refcounter users should avoid
overflows but I do not see any other interface that would be usable. Maybe
somebody else can come up with a better suggestion but this approach
makes sense to me.
>
> Suggested-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!
> ---
> changes in v2:
> - Updated description to include calculation details, per Michal Hocko
>
> include/linux/mm_inline.h | 18 ++++++++++++++----
> mm/madvise.c | 3 +--
> 2 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 4bad32507570..f82085ff8a6b 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -161,15 +161,25 @@ static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
> kref_put(&anon_name->kref, anon_vma_name_free);
> }
>
> +static inline
> +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
> +{
> + /* Prevent anon_name refcount saturation early on */
> + if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
> + anon_vma_name_get(anon_name);
> + return anon_name;
> +
> + }
> + return anon_vma_name_alloc(anon_name->name);
> +}
> +
> static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> struct vm_area_struct *new_vma)
> {
> struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
>
> - if (anon_name) {
> - anon_vma_name_get(anon_name);
> - new_vma->anon_name = anon_name;
> - }
> + if (anon_name)
> + new_vma->anon_name = anon_vma_name_reuse(anon_name);
> }
>
> static inline void free_anon_vma_name(struct vm_area_struct *vma)
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 081b1cded21e..1f2693dccf7b 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -113,8 +113,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
> if (anon_vma_name_eq(orig_name, anon_name))
> return 0;
>
> - anon_vma_name_get(anon_name);
> - vma->anon_name = anon_name;
> + vma->anon_name = anon_vma_name_reuse(anon_name);
> anon_vma_name_put(orig_name);
>
> return 0;
> --
> 2.35.1.473.g83b2b277ed-goog
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed
2022-02-23 15:36 ` [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed Suren Baghdasaryan
@ 2022-02-24 8:45 ` Michal Hocko
0 siblings, 0 replies; 7+ messages in thread
From: Michal Hocko @ 2022-02-24 8:45 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: akpm, ccross, sumit.semwal, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team,
syzbot+aa7b3d4b35f9dc46a366
On Wed 23-02-22 07:36:13, Suren Baghdasaryan wrote:
> When adjacent vmas are being merged it can result in the vma that was
> originally passed to madvise_update_vma being destroyed. In the current
> implementation, the name parameter passed to madvise_update_vma points
> directly to vma->anon_name and it is used after the call to
> vma_merge. In the cases when vma_merge merges the original vma and
> destroys it, this will result in use-after-free bug as shown below:
Not that I want to nitpick on the wording here, but destruction of
the original vma will not result in UAF automatically. For that it would
need to hold anon_vma_name with the last reference. So I would
reformulate:
"In cases when vma_merge merges the original vma and destroys it, it
might result in UAF. For that the original vma would have to hold the
anon_vma_name with the last reference. The following vma would need to
contain a different anon_vma_name object with the same string."
> madvise_vma_behavior(vma)
> madvise_update_vma(vma, ..., anon_name == vma->anon_name)
> vma_merge(vma)
> __vma_adjust(vma) <-- merges vma with adjacent one
> vm_area_free(vma) <-- frees the original vma
> replace_anon_vma_name(anon_name) <-- UAF of vma->anon_name
>
> Fix this by raising the name refcount and stabilizing it.
>
> Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> Reported-by: syzbot+aa7b3d4b35f9dc46a366@syzkaller.appspotmail.com
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!
> ---
> changes in v5:
> - Updated description, per Michal Hocko
>
> mm/madvise.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 1f2693dccf7b..38d0f515d548 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -131,6 +131,8 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
> /*
> * Update the vm_flags on region of a vma, splitting it or merging it as
> * necessary. Must be called with mmap_sem held for writing;
> + * Caller should ensure anon_name stability by raising its refcount even when
> + * anon_name belongs to a valid vma because this function might free that vma.
> */
> static int madvise_update_vma(struct vm_area_struct *vma,
> struct vm_area_struct **prev, unsigned long start,
> @@ -945,6 +947,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> unsigned long behavior)
> {
> int error;
> + struct anon_vma_name *anon_name;
> unsigned long new_flags = vma->vm_flags;
>
> switch (behavior) {
> @@ -1010,8 +1013,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> break;
> }
>
> + anon_name = anon_vma_name(vma);
> + anon_vma_name_get(anon_name);
> error = madvise_update_vma(vma, prev, start, end, new_flags,
> - anon_vma_name(vma));
> + anon_name);
> + anon_vma_name_put(anon_name);
>
> out:
> /*
> --
> 2.35.1.473.g83b2b277ed-goog
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code
2022-02-24 8:36 ` [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Michal Hocko
@ 2022-02-24 23:22 ` Suren Baghdasaryan
0 siblings, 0 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2022-02-24 23:22 UTC (permalink / raw)
To: Michal Hocko
Cc: akpm, ccross, sumit.semwal, dave.hansen, keescook, willy,
kirill.shutemov, vbabka, hannes, ebiederm, brauner, legion,
ran.xiaokai, sashal, chris.hyser, dave, pcc, caoxiaofeng, david,
gorcunov, linux-mm, linux-kernel, kernel-team
On Thu, Feb 24, 2022 at 12:36 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Wed 23-02-22 07:36:11, Suren Baghdasaryan wrote:
> > Avoid mixing strings and their anon_vma_name referenced pointers
> > by using struct anon_vma_name whenever possible. This simplifies
> > the code and allows easier sharing of anon_vma_name structures when
> > they represent the same name.
> >
> > Suggested-by: Matthew Wilcox <willy@infradead.org>
> > Suggested-by: Michal Hocko <mhocko@suse.com>
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> LGTM
> Acked-by: Michal Hocko <mhocko@suse.com>
>
> Thanks and one minor nit below
Addressed all the nits in the next version:
https://lore.kernel.org/all/20220224231834.1481408-1-surenb@google.com/
Thanks!
>
> [...]
> > +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> > {
> > - const char *vma_name = vma_anon_name(vma);
> > + /*
> > + * Not using anon_vma_name because it generates a warning if vma->vm_mm
> > + * is not held, which might be the case here.
>
> s@vma->vm_mm@mmap_lock@
>
> > + */
> > + if (!vma->vm_file)
> > + anon_vma_name_put(vma->anon_name);
> > +}
> >
> > - /* either both NULL, or pointers to same string */
> > - if (vma_name == name)
>
> --
> Michal Hocko
> SUSE Labs
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2022-02-24 23:23 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-23 15:36 [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Suren Baghdasaryan
2022-02-23 15:36 ` [PATCH v2 2/3] mm: prevent vm_area_struct::anon_name refcount saturation Suren Baghdasaryan
2022-02-24 8:42 ` Michal Hocko
2022-02-23 15:36 ` [PATCH v5 3/3] mm: fix use-after-free when anon vma name is used after vma is freed Suren Baghdasaryan
2022-02-24 8:45 ` Michal Hocko
2022-02-24 8:36 ` [PATCH v2 1/3] mm: refactor vm_area_struct::anon_vma_name usage code Michal Hocko
2022-02-24 23:22 ` Suren Baghdasaryan
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.