From: Muhammad Usama Anjum <usama.anjum@collabora.com>
To: Alexander Viro <viro@zeniv.linux.org.uk>,
	Eric Biederman <ebiederm@xmission.com>,
	Kees Cook <keescook@chromium.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Masami Hiramatsu <mhiramat@kernel.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>,
	kernel@collabora.com, peterx@redhat.com, david@redhat.com,
	Cyrill Gorcunov <gorcunov@gmail.com>,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH RFC] mm: implement granular soft-dirty vma support
Date: Tue, 20 Dec 2022 21:26:05 +0500	[thread overview]
Message-ID: <20221220162606.1595355-1-usama.anjum@collabora.com> (raw)

The VM_SOFTDIRTY flag marks a whole VMA as soft-dirty. Sometimes a
soft-dirty and a non-soft-dirty VMA get merged, which makes the
non-soft-dirty region soft-dirty as well. This creates problems: the
whole VMA region then reports as soft-dirty while in reality it
contains no soft-dirty page. This could be solved by not merging VMAs
with different soft-dirty flags, but that has the potential to cause
regressions as the number of VMAs can increase drastically.
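
To make the user-visible effect concrete, here is a minimal userspace
sketch (not part of this patch; the helper name is made up) of how a
soft-dirty tracker reads the state affected here: one 64-bit
/proc/<pid>/pagemap entry per page, with bit 55 being the soft-dirty
bit as documented in Documentation/admin-guide/mm/soft-dirty.rst:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* Return 1 if the page backing addr is reported soft-dirty, 0 if not,
 * -1 on error. */
static int page_soft_dirty(void *addr)
{
	uint64_t pme;
	long psize = sysconf(_SC_PAGESIZE);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return -1;
	/* One 64-bit entry per virtual page; bit 55 is soft-dirty. */
	if (pread(fd, &pme, sizeof(pme),
		  ((uintptr_t)addr / psize) * sizeof(pme)) != sizeof(pme)) {
		close(fd);
		return -1;
	}
	close(fd);
	return (pme >> 55) & 1;
}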

This patch introduces tracking of non-soft-dirty regions at the VMA
level. The implementation stays clean because only the non-soft-dirty
regions are tracked; there is no overhead for applications that never
use the soft-dirty feature. The actual non-soft-dirty tracking in a
VMA starts when the soft-dirty state is cleared from the user side.
From then on the region list is maintained alongside the VMA
manipulations such as split, merge and expand.
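
With this scheme the user-side trigger stays the same: clearing
soft-dirty through /proc/<pid>/clear_refs. A minimal sketch (assumed
helper, not part of this patch) of that action, which is what starts
the per-VMA non-soft-dirty tracking described above (value "4" per
Documentation/admin-guide/mm/soft-dirty.rst):

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Clear a task's soft-dirty bits so that later writes can be detected
 * through pagemap. */
static int clear_soft_dirty(pid_t pid)
{
	char path[64];
	int fd;
	ssize_t ret;

	snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, "4", 1);
	close(fd);
	return ret == 1 ? 0 : -1;
}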

The sd flag is still shown in the smaps file. Only the "softdirty"
name no longer shows up in the mmflags trace format strings, since its
corresponding VM_SOFTDIRTY flag has been removed, which seems
justified.
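
As a quick way to observe that, a sketch (not from this patch) that
scans /proc/self/smaps and reports whether any VmFlags line still
carries the "sd" mnemonic; VmFlags entries are space-separated
two-letter codes:

#include <stdio.h>
#include <string.h>

/* Return 1 if any mapping reports "sd" in its VmFlags line, 0 if none
 * does, -1 on error. */
static int smaps_has_sd(void)
{
	char line[512];
	int found = 0;
	FILE *f = fopen("/proc/self/smaps", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "VmFlags:", 8) && strstr(line, " sd"))
			found = 1;
	}
	fclose(f);
	return found;
}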

Overall, this patch should make soft-dirty tracking correct at the
VMA level. There should be no impact on the user's view, other than
that the user is no longer affected by the kernel's internal activity
(VMA merging/splitting).

Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
---
Cyrill has pointed to this kind of solution [1].

[1] https://lore.kernel.org/all/Y4W0axw0ZgORtfkt@grain
---
 fs/exec.c                      |   2 +-
 fs/proc/task_mmu.c             |  52 ++++++++++-----
 include/linux/mm.h             |   7 +-
 include/linux/mm_inline.h      | 117 +++++++++++++++++++++++++++++++++
 include/linux/mm_types.h       |   7 ++
 include/trace/events/mmflags.h |   7 --
 kernel/fork.c                  |   4 ++
 mm/gup.c                       |   6 +-
 mm/huge_memory.c               |   8 +--
 mm/internal.h                  |  22 ++++++-
 mm/mmap.c                      |  45 +++++--------
 mm/mprotect.c                  |   4 +-
 12 files changed, 214 insertions(+), 67 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index ab913243a367..30616601567c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	err = insert_vm_struct(mm, vma);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f444da25e2e2..bdb4eaf21986 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -683,9 +683,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_DONTDUMP)]	= "dd",
 #ifdef CONFIG_ARM64_BTI
 		[ilog2(VM_ARM64_BTI)]	= "bt",
-#endif
-#ifdef CONFIG_MEM_SOFT_DIRTY
-		[ilog2(VM_SOFTDIRTY)]	= "sd",
 #endif
 		[ilog2(VM_MIXEDMAP)]	= "mm",
 		[ilog2(VM_HUGEPAGE)]	= "hg",
@@ -723,6 +720,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 			seq_putc(m, ' ');
 		}
 	}
+
+	/* non_sd_reg isn't initialized for vsyscall vma */
+	if (is_nsdr_list_initialized(&vma->non_sd_reg) && is_vma_sd(vma))
+		seq_puts(m, "sd");
 	seq_putc(m, '\n');
 }
 
@@ -1299,10 +1300,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			mas_for_each(&mas, vma, ULONG_MAX) {
-				if (!(vma->vm_flags & VM_SOFTDIRTY))
-					continue;
-				vma->vm_flags &= ~VM_SOFTDIRTY;
-				vma_set_page_prot(vma);
+				struct non_sd_reg *r;
+
+				if (list_empty(&vma->non_sd_reg)) {
+					r = kmalloc(sizeof(struct non_sd_reg), GFP_KERNEL);
+					list_add(&r->nsdr_head, &vma->non_sd_reg);
+				} else if (list_is_singular(&vma->non_sd_reg)) {
+					r = list_first_entry(&vma->non_sd_reg,
+							     struct non_sd_reg, nsdr_head);
+				} else {
+					while (!list_is_singular(&vma->non_sd_reg)) {
+						r = list_first_entry(&vma->non_sd_reg,
+								     struct non_sd_reg, nsdr_head);
+						list_del(&r->nsdr_head);
+						kfree(r);
+					}
+				}
+				r->start = vma->vm_start;
+				r->end = vma->vm_end;
 			}
 
 			inc_tlb_flush_pending(mm);
@@ -1397,10 +1412,13 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 		if (!vma)
 			break;
 
-		/* Addresses in the VMA. */
-		if (vma->vm_flags & VM_SOFTDIRTY)
-			pme = make_pme(0, PM_SOFT_DIRTY);
 		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+			/* Addresses in the VMA. */
+			if (is_vma_addr_sd(vma, addr))
+				pme = make_pme(0, PM_SOFT_DIRTY);
+			else
+				pme = make_pme(0, 0);
+
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
 				goto out;
@@ -1458,7 +1476,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		flags |= PM_FILE;
 	if (page && !migration && page_mapcount(page) == 1)
 		flags |= PM_MMAP_EXCLUSIVE;
-	if (vma->vm_flags & VM_SOFTDIRTY)
+	if (is_vma_addr_sd(vma, addr))
 		flags |= PM_SOFT_DIRTY;
 
 	return make_pme(frame, flags);
@@ -1481,7 +1499,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 		struct page *page = NULL;
 
-		if (vma->vm_flags & VM_SOFTDIRTY)
+		if (is_vma_addr_sd(vma, addr))
 			flags |= PM_SOFT_DIRTY;
 
 		if (pmd_present(pmd)) {
@@ -1578,9 +1596,6 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 	int err = 0;
 	pte_t pte;
 
-	if (vma->vm_flags & VM_SOFTDIRTY)
-		flags |= PM_SOFT_DIRTY;
-
 	pte = huge_ptep_get(ptep);
 	if (pte_present(pte)) {
 		struct page *page = pte_page(pte);
@@ -1603,7 +1618,12 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 	}
 
 	for (; addr != end; addr += PAGE_SIZE) {
-		pagemap_entry_t pme = make_pme(frame, flags);
+		pagemap_entry_t pme;
+
+		if (is_vma_addr_sd(vma, addr))
+			pme = make_pme(frame, flags | PM_SOFT_DIRTY);
+		else
+			pme = make_pme(frame, flags);
 
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 517225fce10f..a2f11aa96d90 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -287,12 +287,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
-#else
-# define VM_SOFTDIRTY	0
-#endif
-
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
@@ -609,6 +603,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_mm = mm;
 	vma->vm_ops = &dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
+	INIT_LIST_HEAD(&vma->non_sd_reg);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e8ed225d8f7c..d515c2a9b10a 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -578,4 +578,121 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
 #endif
 }
 
+static inline int nsdr_clone(struct vm_area_struct *new_vma, struct vm_area_struct *vma)
+{
+	struct non_sd_reg *r, *reg;
+
+	list_for_each_entry(r, &vma->non_sd_reg, nsdr_head) {
+		reg = kmalloc(sizeof(struct non_sd_reg), GFP_KERNEL);
+		if (!reg)
+			return -ENOMEM;
+		reg->start = r->start;
+		reg->end = r->end;
+		list_add_tail(&reg->nsdr_head, &new_vma->non_sd_reg);
+	}
+	return 0;
+}
+
+static inline void nsdr_clear(struct vm_area_struct *vma)
+{
+	struct non_sd_reg *r, *r_tmp;
+
+	list_for_each_entry_safe(r, r_tmp, &vma->non_sd_reg, nsdr_head) {
+		list_del(&r->nsdr_head);
+		kfree(r);
+	}
+}
+
+static inline bool is_vma_range_sd(struct vm_area_struct *vma, unsigned long addr,
+				   unsigned long end)
+{
+	struct non_sd_reg *r;
+
+	list_for_each_entry(r, &vma->non_sd_reg, nsdr_head) {
+		if (addr >= r->start && end <= r->end)
+			return false;
+	}
+	return true;
+}
+
+static inline bool is_vma_addr_sd(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct non_sd_reg *r;
+
+	list_for_each_entry(r, &vma->non_sd_reg, nsdr_head) {
+		if (addr >= r->start && addr < r->end)
+			return false;
+	}
+	return true;
+}
+
+static inline bool is_vma_sd(struct vm_area_struct *vma)
+{
+	struct non_sd_reg *r;
+
+	if (list_is_singular(&vma->non_sd_reg)) {
+		r = list_first_entry(&vma->non_sd_reg, struct non_sd_reg, nsdr_head);
+		if (r->start == vma->vm_start && r->end == vma->vm_end)
+			return false;
+	}
+	return true;
+}
+
+static inline bool is_nsdr_list_initialized(struct list_head *list)
+{
+	return (list->next != NULL);
+}
+
+static inline void nsdr_move(struct vm_area_struct *vma_to, struct vm_area_struct *vma_from)
+{
+	struct non_sd_reg *r, *r_tmp;
+
+	list_for_each_entry_safe(r, r_tmp, &vma_from->non_sd_reg, nsdr_head)
+		list_move_tail(&r->nsdr_head, &vma_to->non_sd_reg);
+}
+
+static inline int nsdr_adjust_new_first(struct vm_area_struct *new, struct vm_area_struct *vma)
+{
+	struct non_sd_reg *r, *r_tmp, *reg;
+	unsigned long mid = vma->vm_start;
+
+	list_for_each_entry_safe(r, r_tmp, &vma->non_sd_reg, nsdr_head) {
+		if (r->start < mid && r->end > mid) {
+			reg = kmalloc(sizeof(struct non_sd_reg), GFP_KERNEL);
+			if (!reg)
+				return -ENOMEM;
+			reg->start = r->start;
+			reg->end = mid;
+			list_add_tail(&reg->nsdr_head, &new->non_sd_reg);
+
+			r->start = mid;
+		} else if (r->end <= mid) {
+			list_move_tail(&r->nsdr_head, &new->non_sd_reg);
+		}
+	}
+	return 0;
+}
+
+static inline int nsdr_adjust_new_after(struct vm_area_struct *new, struct vm_area_struct *vma)
+{
+	struct non_sd_reg *r, *r_tmp, *reg;
+	unsigned long mid = vma->vm_end;
+
+	list_for_each_entry_safe(r, r_tmp, &vma->non_sd_reg, nsdr_head) {
+		if (r->start < mid && r->end > mid) {
+			reg = kmalloc(sizeof(struct non_sd_reg), GFP_KERNEL);
+			if (!reg)
+				return -ENOMEM;
+			reg->start = mid;
+			reg->end = r->end;
+			list_add_tail(&reg->nsdr_head, &new->non_sd_reg);
+
+			r->end = vma->vm_end;
+		} else if (r->start >= mid) {
+			list_move_tail(&r->nsdr_head, &new->non_sd_reg);
+		}
+	}
+	return 0;
+}
+
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3b8475007734..c4e2dfc75d26 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -595,8 +595,15 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+	struct list_head non_sd_reg;
 } __randomize_layout;
 
+struct non_sd_reg {
+	struct list_head nsdr_head;
+	unsigned long start;
+	unsigned long end;
+};
+
 struct kioctx_table;
 struct mm_struct {
 	struct {
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 412b5a46374c..75cd0da07673 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -151,12 +151,6 @@ IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
 #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1,	"arch_1"	}
 #endif
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name },
-#else
-#define IF_HAVE_VM_SOFTDIRTY(flag,name)
-#endif
-
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
 # define IF_HAVE_UFFD_MINOR(flag, name) {flag, name},
 #else
@@ -191,7 +185,6 @@ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR,	"uffd_minor"	)		\
 	__VM_ARCH_SPECIFIC_1				,		\
 	{VM_WIPEONFORK,			"wipeonfork"	},		\
 	{VM_DONTDUMP,			"dontdump"	},		\
-IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY,	"softdirty"	)		\
 	{VM_MIXEDMAP,			"mixedmap"	},		\
 	{VM_HUGEPAGE,			"hugepage"	},		\
 	{VM_NOHUGEPAGE,			"nohugepage"	},		\
diff --git a/kernel/fork.c b/kernel/fork.c
index 9f7fe3541897..3f8d2c30efbf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		INIT_LIST_HEAD(&new->non_sd_reg);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -481,6 +482,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 
 void vm_area_free(struct vm_area_struct *vma)
 {
+	nsdr_clear(vma);
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
@@ -650,6 +652,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		retval = dup_userfaultfd(tmp, &uf);
 		if (retval)
 			goto fail_nomem_anon_vma_fork;
+		if (nsdr_clone(tmp, mpnt))
+			goto fail_nomem_anon_vma_fork;
 		if (tmp->vm_flags & VM_WIPEONFORK) {
 			/*
 			 * VM_WIPEONFORK gets a clean slate in the child.
diff --git a/mm/gup.c b/mm/gup.c
index c2c2c6dbcfcf..aec7a0ea98b7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -496,7 +496,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
 static inline bool can_follow_write_pte(pte_t pte, struct page *page,
 					struct vm_area_struct *vma,
-					unsigned int flags)
+					unsigned int flags, unsigned long addr)
 {
 	/* If the pte is writable, we can write to the page. */
 	if (pte_write(pte))
@@ -526,7 +526,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
 		return false;
 
 	/* ... and a write-fault isn't required for other reasons. */
-	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+	if (vma_addr_soft_dirty_enabled(vma, addr, PAGE_SIZE) && !pte_soft_dirty(pte))
 		return false;
 	return !userfaultfd_pte_wp(vma, pte);
 }
@@ -562,7 +562,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	 * have to worry about pte_devmap() because they are never anon.
 	 */
 	if ((flags & FOLL_WRITE) &&
-	    !can_follow_write_pte(pte, page, vma, flags)) {
+	    !can_follow_write_pte(pte, page, vma, flags, address)) {
 		page = NULL;
 		goto out;
 	}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa..a6f74e5535bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1405,7 +1405,7 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
 		return false;
 
 	/* Do we need write faults for softdirty tracking? */
-	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+	if (vma_addr_soft_dirty_enabled(vma, addr, HPAGE_SIZE) && !pmd_soft_dirty(pmd))
 		return false;
 
 	/* Do we need write faults for uffd-wp tracking? */
@@ -1425,7 +1425,7 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
 /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
 static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
 					struct vm_area_struct *vma,
-					unsigned int flags)
+					unsigned int flags, unsigned long addr)
 {
 	/* If the pmd is writable, we can write to the page. */
 	if (pmd_write(pmd))
@@ -1455,7 +1455,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
 		return false;
 
 	/* ... and a write-fault isn't required for other reasons. */
-	if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+	if (vma_addr_soft_dirty_enabled(vma, addr, PAGE_SIZE) && !pmd_soft_dirty(pmd))
 		return false;
 	return !userfaultfd_huge_pmd_wp(vma, pmd);
 }
@@ -1475,7 +1475,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
 
 	if ((flags & FOLL_WRITE) &&
-	    !can_follow_write_pmd(*pmd, page, vma, flags))
+	    !can_follow_write_pmd(*pmd, page, vma, flags, addr))
 		return NULL;
 
 	/* Avoid dumping huge zero page */
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..35f682ce7b17 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/mm_inline.h>
 
 struct folio_batch;
 
@@ -830,6 +831,25 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
 
 extern bool mirrored_kernelcore;
 
+static inline bool vma_addr_soft_dirty_enabled(struct vm_area_struct *vma,
+					       unsigned long addr, unsigned long end)
+{
+	/*
+	 * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
+	 * enablements, because when without soft-dirty being compiled in,
+	 * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
+	 * will be constantly true.
+	 */
+	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+		return false;
+
+	/*
+	 * Soft-dirty is kind of special: its tracking is enabled when the
+	 * vma flags not set.
+	 */
+	return !(is_vma_range_sd(vma, addr, end));
+}
+
 static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
 {
 	/*
@@ -845,7 +865,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
 	 * Soft-dirty is kind of special: its tracking is enabled when the
 	 * vma flags not set.
 	 */
-	return !(vma->vm_flags & VM_SOFTDIRTY);
+	return !(is_vma_sd(vma));
 }
 
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 5d54cde108c0..687fbc19ee8e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -587,6 +587,7 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
 	}
 
 	if (remove_next) {
+		nsdr_move(vma, next);
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
 			fput(file);
@@ -868,15 +869,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 				struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 				struct anon_vma_name *anon_name)
 {
-	/*
-	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
-	 * match the flags but dirty bit -- the caller should mark
-	 * merged VMA as dirty. If dirty bit won't be excluded from
-	 * comparison, we increase pressure on the memory system forcing
-	 * the kernel to generate new VMAs when old one could be
-	 * extended instead.
-	 */
-	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
+	if (vma->vm_flags ^ vm_flags)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
@@ -1050,6 +1043,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 		err = __vma_adjust(prev, prev->vm_start,
 					next->vm_end, prev->vm_pgoff, NULL,
 					prev);
+		if (err)
+			return NULL;
+		err = nsdr_adjust_new_first(prev, next);
 		res = prev;
 	} else if (merge_prev) {			/* cases 2, 5, 7 */
 		err = __vma_adjust(prev, prev->vm_start,
@@ -1092,7 +1088,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
 	return a->vm_end == b->vm_start &&
 		mpol_equal(vma_policy(a), vma_policy(b)) &&
 		a->vm_file == b->vm_file &&
-		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+		!((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS)) &&
 		b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
 }
 
@@ -2236,8 +2232,14 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
 	/* Success. */
-	if (!err)
-		return 0;
+	if (!err) {
+		if (new_below)
+			err = nsdr_adjust_new_first(new, vma);
+		else
+			err = nsdr_adjust_new_after(new, vma);
+		if (!err)
+			return err;
+	}
 
 	/* Avoid vm accounting in close() operation */
 	new->vm_start = new->vm_end;
@@ -2728,17 +2730,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (file)
 		uprobe_mmap(vma);
 
-	/*
-	 * New (or expanded) vma always get soft dirty status.
-	 * Otherwise user-space soft-dirty page tracker won't
-	 * be able to distinguish situation when vma area unmapped,
-	 * then new mapped in-place (which must be aimed as
-	 * a completely new data area).
-	 */
-	vma->vm_flags |= VM_SOFTDIRTY;
-
-	vma_set_page_prot(vma);
-
 	validate_mm(mm);
 	return addr;
 
@@ -2947,7 +2938,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	 */
 	if (vma &&
 	    (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
-	    ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+	    (vma->vm_flags == flags)) {
 		mas_set_range(mas, vma->vm_start, addr + len - 1);
 		if (mas_preallocate(mas, vma, GFP_KERNEL))
 			goto unacct_fail;
@@ -2958,7 +2949,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 			anon_vma_interval_tree_pre_update_vma(vma);
 		}
 		vma->vm_end = addr + len;
-		vma->vm_flags |= VM_SOFTDIRTY;
 		mas_store_prealloc(mas, vma);
 
 		if (vma->anon_vma) {
@@ -2991,7 +2981,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 	mm->data_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
-	vma->vm_flags |= VM_SOFTDIRTY;
 	validate_mm(mm);
 	return 0;
 
@@ -3224,6 +3213,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		new_vma->vm_pgoff = pgoff;
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
+		if (nsdr_clone(new_vma, vma))
+			goto out_free_vma;
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
@@ -3395,7 +3386,7 @@ static struct vm_area_struct *__install_special_mapping(
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
 	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce6119456d54..562d017aa9ff 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -52,7 +52,7 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
 		return false;
 
 	/* Do we need write faults for softdirty tracking? */
-	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+	if (vma_addr_soft_dirty_enabled(vma, addr, addr + PAGE_SIZE) && !pte_soft_dirty(pte))
 		return false;
 
 	/* Do we need write faults for uffd-wp tracking? */
@@ -610,7 +610,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
 	if (*pprev) {
 		vma = *pprev;
-		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
+		VM_WARN_ON(vma->vm_flags ^ newflags);
 		goto success;
 	}
 
-- 
2.30.2

