Linux-man Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap().
@ 2020-02-21 17:42 Brian Geffon
  2020-02-21 17:42 ` [PATCH v7 2/2] selftest: Add MREMAP_DONTUNMAP selftest Brian Geffon
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Brian Geffon @ 2020-02-21 17:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michael S . Tsirkin, Brian Geffon, Arnd Bergmann, linux-kernel,
	linux-mm, linux-api, Andy Lutomirski, Will Deacon,
	Andrea Arcangeli, Sonny Rao, Minchan Kim, Joel Fernandes,
	Yu Zhao, Jesse Barnes, Florian Weimer, Kirill A . Shutemov,
	mtk.manpages, linux-man, Lokesh Gidra

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. The remap operation
will be performed as it would have been normally by moving over the
page tables to the new mapping. The old vma will have any locked
flags cleared, have no pagetables, and any userfaultfds that were
watching that range will continue watching it.

For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
a VMA you MUST use the MREMAP_MAYMOVE flag, it's not possible to resize
a VMA while also moving with MREMAP_DONTUNMAP so old_len must always
be equal to the new_len otherwise it will return -EINVAL.

We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."

  v6 -> v7:
    - Don't allow resizing VMA as part of MREMAP_DONTUNMAP.
      There is no clear use case at the moment and it can be added
      later as it simplifies the implementation for now.

  v5 -> v6:
    - Code cleanup suggested by Kirill.

  v4 -> v5:
    - Correct commit message to more accurately reflect the behavior.
    - Clear VM_LOCKED and VM_LOCKEDONFAULT on the old vma.
           
Signed-off-by: Brian Geffon <bgeffon@google.com>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Tested-by: Lokesh Gidra <lokeshgidra@google.com>
---
 include/uapi/linux/mman.h |  5 ++-
 mm/mremap.c               | 90 ++++++++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
 #include <asm/mman.h>
 #include <asm-generic/hugetlb_encode.h>
 
-#define MREMAP_MAYMOVE	1
-#define MREMAP_FIXED	2
+#define MREMAP_MAYMOVE		1
+#define MREMAP_FIXED		2
+#define MREMAP_DONTUNMAP	4
 
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29fbe3f..8b7bf3845e50 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
 		unsigned long new_len, unsigned long new_addr,
-		bool *locked, struct vm_userfaultfd_ctx *uf,
-		struct list_head *uf_unmap)
+		bool *locked, unsigned long flags,
+		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -408,11 +408,32 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (unlikely(vma->vm_flags & VM_PFNMAP))
 		untrack_pfn_moved(vma);
 
+	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+		if (vm_flags & VM_ACCOUNT) {
+			/* Always put back VM_ACCOUNT since we won't unmap */
+			vma->vm_flags |= VM_ACCOUNT;
+
+			vm_acct_memory(vma_pages(new_vma));
+		}
+
+		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
+		vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+		/* Because we won't unmap we don't need to touch locked_vm */
+		goto out;
+	}
+
 	if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		*locked = true;
+	}
+out:
 	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -422,16 +443,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 			vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += new_len >> PAGE_SHIFT;
-		*locked = true;
-	}
-
 	return new_addr;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+	unsigned long old_len, unsigned long new_len, unsigned long flags,
+	unsigned long *p)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = find_vma(mm, addr);
@@ -453,6 +470,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+			vma->vm_flags & VM_SHARED))
+		return ERR_PTR(-EINVAL);
+
 	if (is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
 
@@ -497,7 +518,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		unsigned long new_addr, unsigned long new_len, bool *locked,
-		struct vm_userfaultfd_ctx *uf,
+		unsigned long flags, struct vm_userfaultfd_ctx *uf,
 		struct list_head *uf_unmap_early,
 		struct list_head *uf_unmap)
 {
@@ -505,7 +526,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
-	unsigned long map_flags;
+	unsigned long map_flags = 0;
 
 	if (offset_in_page(new_addr))
 		goto out;
@@ -534,9 +555,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
 		return -ENOMEM;
 
-	ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
-	if (ret)
-		goto out;
+	if (flags & MREMAP_FIXED) {
+		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+		if (ret)
+			goto out;
+	}
 
 	if (old_len >= new_len) {
 		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
@@ -545,13 +568,22 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		old_len = new_len;
 	}
 
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
 	}
 
-	map_flags = MAP_FIXED;
+	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
+	if (flags & MREMAP_DONTUNMAP &&
+		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (flags & MREMAP_FIXED)
+		map_flags |= MAP_FIXED;
+
 	if (vma->vm_flags & VM_MAYSHARE)
 		map_flags |= MAP_SHARED;
 
@@ -561,10 +593,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (offset_in_page(ret))
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+	/* We got a new mapping */
+	if (!(flags & MREMAP_FIXED))
+		new_addr = ret;
+
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
 		       uf_unmap);
+
 	if (!(offset_in_page(ret)))
 		goto out;
+
 out1:
 	vm_unacct_memory(charged);
 
@@ -609,12 +647,21 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	addr = untagged_addr(addr);
 	new_addr = untagged_addr(new_addr);
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
 		return ret;
 
+	/*
+	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
+	 * in the process.
+	 */
+	if (flags & MREMAP_DONTUNMAP &&
+			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
+		return ret;
+
+
 	if (offset_in_page(addr))
 		return ret;
 
@@ -632,9 +679,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	if (down_write_killable(&current->mm->mmap_sem))
 		return -EINTR;
 
-	if (flags & MREMAP_FIXED) {
+	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked, &uf, &uf_unmap_early, &uf_unmap);
+				&locked, flags, &uf, &uf_unmap_early,
+				&uf_unmap);
 		goto out;
 	}
 
@@ -662,7 +710,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
+	vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
 		goto out;
@@ -712,7 +760,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 
 		ret = move_vma(vma, addr, old_len, new_len, new_addr,
-			       &locked, &uf, &uf_unmap);
+			       &locked, flags, &uf, &uf_unmap);
 	}
 out:
 	if (offset_in_page(ret)) {
-- 
2.25.0.265.gbab2e86ba0-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v7 2/2] selftest: Add MREMAP_DONTUNMAP selftest.
  2020-02-21 17:42 [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Brian Geffon
@ 2020-02-21 17:42 ` Brian Geffon
  2020-02-21 17:42 ` [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP Brian Geffon
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Brian Geffon @ 2020-02-21 17:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michael S . Tsirkin, Brian Geffon, Arnd Bergmann, linux-kernel,
	linux-mm, linux-api, Andy Lutomirski, Will Deacon,
	Andrea Arcangeli, Sonny Rao, Minchan Kim, Joel Fernandes,
	Yu Zhao, Jesse Barnes, Florian Weimer, Kirill A . Shutemov,
	mtk.manpages, linux-man

Add a few simple self tests for the new flag MREMAP_DONTUNMAP,
they are simple smoke tests which also demonstrate the behavior.

Signed-off-by: Brian Geffon <bgeffon@google.com>
---
 tools/testing/selftests/vm/Makefile           |   1 +
 tools/testing/selftests/vm/mremap_dontunmap.c | 313 ++++++++++++++++++
 tools/testing/selftests/vm/run_vmtests        |  15 +
 3 files changed, 329 insertions(+)
 create mode 100644 tools/testing/selftests/vm/mremap_dontunmap.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 9534dc2bc929..4b2b969fc3c7 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += map_fixed_noreplace
 TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mremap_dontunmap
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
new file mode 100644
index 000000000000..23240ee1e384
--- /dev/null
+++ b/tools/testing/selftests/vm/mremap_dontunmap.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@google.com>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+	char cmd[32];
+
+	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+	system(cmd);
+}
+
+#define BUG_ON(condition, description)					      \
+	do {								      \
+		if (condition) {					      \
+			fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+				__LINE__, (description), strerror(errno));    \
+			dump_maps();					  \
+			exit(1);					      \
+		} 							      \
+	} while (0)
+
+// Try a simple operation for to "test" for kernel support this prevents
+// reporting tests as failed when it's run on an older kernel.
+static int kernel_support_for_mremap_dontunmap()
+{
+	int ret = 0;
+	unsigned long num_pages = 1;
+	void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	// This simple remap should only fail if MREMAP_DONTUNMAP isn't
+	// supported.
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
+	if (dest_mapping == MAP_FAILED) {
+		ret = errno;
+	} else {
+		BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+		       "unable to unmap destination mapping");
+	}
+
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+	return ret;
+}
+
+// This helper will just validate that an entire mapping contains the expected
+// byte.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+	BUG_ON(size & (page_size - 1),
+	       "check_region_contains_byte expects page multiples");
+	BUG_ON((unsigned long)addr & (page_size - 1),
+	       "check_region_contains_byte expects page alignment");
+
+	memset(page_buffer, byte, page_size);
+
+	unsigned long num_pages = size / page_size;
+	unsigned long i;
+
+	// Compare each page checking that it contains our expected byte.
+	for (i = 0; i < num_pages; ++i) {
+		int ret =
+		    memcmp(addr + (i * page_size), page_buffer, page_size);
+		if (ret) {
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
+// the source mapping mapped.
+static void mremap_dontunmap_simple()
+{
+	unsigned long num_pages = 5;
+
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	// Try to just move the whole mapping anywhere (not fixed).
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+	// Validate that the pages have been moved, we know they were moved if
+	// the dest_mapping contains a's.
+	BUG_ON(check_region_contains_byte
+	       (dest_mapping, num_pages * page_size, 'a') != 0,
+	       "pages did not migrate");
+	BUG_ON(check_region_contains_byte
+	       (source_mapping, num_pages * page_size, 0) != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, also while validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+	unsigned long num_pages = 5;
+
+	// Since we want to guarantee that we can remap to a point, we will
+	// create a mapping up front.
+	void *dest_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+	memset(dest_mapping, 'X', num_pages * page_size);
+
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	void *remapped_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+		   dest_mapping);
+	BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+	BUG_ON(remapped_mapping != dest_mapping,
+	       "mremap should have placed the remapped mapping at dest_mapping");
+
+	// The dest mapping will have been unmap by mremap so we expect the Xs
+	// to be gone and replaced with a's.
+	BUG_ON(check_region_contains_byte
+	       (dest_mapping, num_pages * page_size, 'a') != 0,
+	       "pages did not migrate");
+
+	// And the source mapping will have had its ptes dropped.
+	BUG_ON(check_region_contains_byte
+	       (source_mapping, num_pages * page_size, 0) != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+	/*
+	 *  source mapping:
+	 *  --------------
+	 *  | aaaaaaaaaa |
+	 *  --------------
+	 *  to become:
+	 *  --------------
+	 *  | aaaaa00000 |
+	 *  --------------
+	 *  With the destination mapping containing 5 pages of As.
+	 *  ---------
+	 *  | aaaaa |
+	 *  ---------
+	 */
+	unsigned long num_pages = 10;
+	void *source_mapping =
+	    mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', num_pages * page_size);
+
+	// We will grab the last 5 pages of the source and move them.
+	void *dest_mapping =
+	    mremap(source_mapping + (5 * page_size), 5 * page_size,
+		   5 * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+	// We expect the first 5 pages of the source to contain a's and the
+	// final 5 pages to contain zeros.
+	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+	       0, "first 5 pages of source should have original pages");
+	BUG_ON(check_region_contains_byte
+	       (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+	       "final 5 pages of source should have no ptes");
+
+	// Finally we expect the destination to have 5 pages worth of a's.
+	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+	       0, "dest mapping should contain ptes from the source");
+
+	BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite()
+{
+	/*
+	 *  source mapping:
+	 *  ---------
+	 *  |aaaaa|
+	 *  ---------
+	 *  dest mapping initially:
+	 *  -----------
+	 *  |XXXXXXXXXX|
+	 *  ------------
+	 *  Source to become:
+	 *  ---------
+	 *  |00000|
+	 *  ---------
+	 *  With the destination mapping containing 5 pages of As.
+	 *  ------------
+	 *  |aaaaaXXXXX|
+	 *  ------------
+	 */
+	void *source_mapping =
+	    mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', 5 * page_size);
+
+	void *dest_mapping =
+	    mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+	memset(dest_mapping, 'X', 10 * page_size);
+
+	// We will grab the last 5 pages of the source and move them.
+	void *remapped_mapping =
+	    mremap(source_mapping, 5 * page_size,
+		   5 * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+	BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+	BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+	       0, "first 5 pages of source should have no ptes");
+
+	// Finally we expect the destination to have 5 pages worth of a's.
+	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+			"dest mapping should contain ptes from the source");
+
+	// Finally the last 5 pages shouldn't have been touched.
+	BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+				5 * page_size, 'X') != 0,
+			"dest mapping should have retained the last 5 pages");
+
+	BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+int main(void)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	// test for kernel support for MREMAP_DONTUNMAP skipping the test if
+	// not.
+	if (kernel_support_for_mremap_dontunmap() != 0) {
+		printf("No kernel support for MREMAP_DONTUNMAP\n");
+		return KSFT_SKIP;
+	}
+
+	// Keep a page sized buffer around for when we need it.
+	page_buffer =
+	    mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+	mremap_dontunmap_simple();
+	mremap_dontunmap_simple_fixed();
+	mremap_dontunmap_partial_mapping();
+	mremap_dontunmap_partial_mapping_overwrite();
+
+	BUG_ON(munmap(page_buffer, page_size) == -1,
+	       "unable to unmap page buffer");
+
+	printf("OK\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 951c507a27f7..d380b95c5de5 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -227,4 +227,19 @@ else
 	exitcode=1
 fi
 
+echo "------------------------------------"
+echo "running MREMAP_DONTUNMAP smoke test"
+echo "------------------------------------"
+./mremap_dontunmap
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+	echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+	 echo "[SKIP]"
+	 exitcode=$ksft_skip
+else
+	echo "[FAIL]"
+	exitcode=1
+fi
 exit $exitcode
-- 
2.25.0.265.gbab2e86ba0-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP.
  2020-02-21 17:42 [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Brian Geffon
  2020-02-21 17:42 ` [PATCH v7 2/2] selftest: Add MREMAP_DONTUNMAP selftest Brian Geffon
@ 2020-02-21 17:42 ` Brian Geffon
  2020-02-25 13:51   ` Vlastimil Babka
  2020-02-24  8:16 ` [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Kirill A. Shutemov
  2020-02-25 13:48 ` Vlastimil Babka
  3 siblings, 1 reply; 7+ messages in thread
From: Brian Geffon @ 2020-02-21 17:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michael S . Tsirkin, Brian Geffon, Arnd Bergmann, linux-kernel,
	linux-mm, linux-api, Andy Lutomirski, Will Deacon,
	Andrea Arcangeli, Sonny Rao, Minchan Kim, Joel Fernandes,
	Yu Zhao, Jesse Barnes, Florian Weimer, Kirill A . Shutemov,
	mtk.manpages, linux-man

Signed-off-by: Brian Geffon <bgeffon@google.com>
---
 man2/mremap.2 | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/man2/mremap.2 b/man2/mremap.2
index d73fb64fa..54ec67b20 100644
--- a/man2/mremap.2
+++ b/man2/mremap.2
@@ -26,7 +26,8 @@
 .\" 1996-04-12 Tom Bjorkholm <tomb@mydata.se>
 .\"            Update for Linux 1.3.87 and later
 .\" 2005-10-11 mtk: Added NOTES for MREMAP_FIXED; revised EINVAL text.
-.\"
+.\" 2020-02-05 Brian Geffon <bgeffon@google.com>
+.\"            Update for MREMAP_DONTUNMAP.
 .TH MREMAP 2 2019-03-06 "Linux" "Linux Programmer's Manual"
 .SH NAME
 mremap \- remap a virtual memory address
@@ -129,6 +130,13 @@ If
 is specified, then
 .B MREMAP_MAYMOVE
 must also be specified.
+.TP
+.BR MREMAP_DONTUNMAP " (since Linux 5.7)"
+This flag which must be used in conjuction with
+.B MREMAP_MAYMOVE
+remaps a mapping to a new address and it does not unmap the mapping at \fIold_address\fP. This flag can only be used with private anonymous mappings. Any access to the range specified at \fIold_address\fP after completion will result in a page fault. If a
+.BR userfaultfd (2)
+was registered on the mapping specified by \fIold_address\fP it will continue to watch that mapping for faults.
 .PP
 If the memory segment specified by
 .I old_address
@@ -176,6 +184,8 @@ a value other than
 .B MREMAP_MAYMOVE
 or
 .B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
 was specified in
 .IR flags ;
 .IP *
@@ -197,9 +207,17 @@ and
 .IR old_size ;
 .IP *
 .B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
 was specified without also specifying
 .BR MREMAP_MAYMOVE ;
 .IP *
+.B MREMAP_DONTUNMAP
+was specified with an \fIold_address\fP that was not private anonymous;
+.IP *
+.B MREMAP_DONTUNMAP
+was specified and \fIold_size\fP was not equal to \fInew_size\fP;
+.IP *
 \fIold_size\fP was zero and \fIold_address\fP does not refer to a
 shareable mapping (but see BUGS);
 .IP *
@@ -209,10 +227,20 @@ flag was not specified.
 .RE
 .TP
 .B ENOMEM
+Not enough memory was available to complete the operation.
+Possible causes are:
+.RS
+.IP * 3
 The memory area cannot be expanded at the current virtual address, and the
 .B MREMAP_MAYMOVE
 flag is not set in \fIflags\fP.
 Or, there is not enough (virtual) memory available.
+.IP *
+.B MREMAP_DONTUNMAP
+was used without
+.B MREMAP_FIXED
+causing a new mapping to be created that would exceed the virtual memory available or it would exceed the maximum number of allowed mappings.
+.RE
 .SH CONFORMING TO
 This call is Linux-specific, and should not be used in programs
 intended to be portable.
-- 
2.25.0.265.gbab2e86ba0-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap().
  2020-02-21 17:42 [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Brian Geffon
  2020-02-21 17:42 ` [PATCH v7 2/2] selftest: Add MREMAP_DONTUNMAP selftest Brian Geffon
  2020-02-21 17:42 ` [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP Brian Geffon
@ 2020-02-24  8:16 ` Kirill A. Shutemov
  2020-02-25 13:48 ` Vlastimil Babka
  3 siblings, 0 replies; 7+ messages in thread
From: Kirill A. Shutemov @ 2020-02-24  8:16 UTC (permalink / raw)
  To: Brian Geffon
  Cc: Andrew Morton, Michael S . Tsirkin, Arnd Bergmann, linux-kernel,
	linux-mm, linux-api, Andy Lutomirski, Will Deacon,
	Andrea Arcangeli, Sonny Rao, Minchan Kim, Joel Fernandes,
	Yu Zhao, Jesse Barnes, Florian Weimer, mtk.manpages, linux-man,
	Lokesh Gidra

On Fri, Feb 21, 2020 at 09:42:46AM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. The remap operation
> will be performed as it would have been normally by moving over the
> page tables to the new mapping. The old vma will have any locked
> flags cleared, have no pagetables, and any userfaultfds that were
> watching that range will continue watching it.
> 
> For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
> the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA you MUST use the MREMAP_MAYMOVE flag, it's not possible to resize
> a VMA while also moving with MREMAP_DONTUNMAP so old_len must always
> be equal to the new_len otherwise it will return -EINVAL.
> 
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> 
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
> 
>   v6 -> v7:
>     - Don't allow resizing VMA as part of MREMAP_DONTUNMAP.
>       There is no clear use case at the moment and it can be added
>       later as it simplifies the implementation for now.
> 
>   v5 -> v6:
>     - Code cleanup suggested by Kirill.
> 
>   v4 -> v5:
>     - Correct commit message to more accurately reflect the behavior.
>     - Clear VM_LOCKED and VM_LOCKEDONFAULT on the old vma.
>            
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> Reviewed-by: Minchan Kim <minchan@kernel.org>
> Tested-by: Lokesh Gidra <lokeshgidra@google.com>

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap().
  2020-02-21 17:42 [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Brian Geffon
                   ` (2 preceding siblings ...)
  2020-02-24  8:16 ` [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Kirill A. Shutemov
@ 2020-02-25 13:48 ` Vlastimil Babka
  3 siblings, 0 replies; 7+ messages in thread
From: Vlastimil Babka @ 2020-02-25 13:48 UTC (permalink / raw)
  To: Brian Geffon, Andrew Morton
  Cc: Michael S . Tsirkin, Arnd Bergmann, linux-kernel, linux-mm,
	linux-api, Andy Lutomirski, Will Deacon, Andrea Arcangeli,
	Sonny Rao, Minchan Kim, Joel Fernandes, Yu Zhao, Jesse Barnes,
	Florian Weimer, Kirill A . Shutemov, mtk.manpages, linux-man,
	Lokesh Gidra

On 2/21/20 6:42 PM, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. The remap operation
> will be performed as it would have been normally by moving over the
> page tables to the new mapping. The old vma will have any locked
> flags cleared, have no pagetables, and any userfaultfds that were
> watching that range will continue watching it.
> 
> For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
> the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA you MUST use the MREMAP_MAYMOVE flag, it's not possible to resize
> a VMA while also moving with MREMAP_DONTUNMAP so old_len must always
> be equal to the new_len otherwise it will return -EINVAL.
> 
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> 
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
> 
>   v6 -> v7:
>     - Don't allow resizing VMA as part of MREMAP_DONTUNMAP.
>       There is no clear use case at the moment and it can be added
>       later as it simplifies the implementation for now.
> 
>   v5 -> v6:
>     - Code cleanup suggested by Kirill.
> 
>   v4 -> v5:
>     - Correct commit message to more accurately reflect the behavior.
>     - Clear VM_LOCKED and VM_LOCKEDONFAULT on the old vma.
>            
> Signed-off-by: Brian Geffon <bgeffon@google.com>
> Reviewed-by: Minchan Kim <minchan@kernel.org>
> Tested-by: Lokesh Gidra <lokeshgidra@google.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

Thanks.

> diff --git a/mm/mremap.c b/mm/mremap.c
> index 1fc8a29fbe3f..8b7bf3845e50 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>  static unsigned long move_vma(struct vm_area_struct *vma,
>  		unsigned long old_addr, unsigned long old_len,
>  		unsigned long new_len, unsigned long new_addr,
> -		bool *locked, struct vm_userfaultfd_ctx *uf,
> -		struct list_head *uf_unmap)
> +		bool *locked, unsigned long flags,
> +		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)

The usage of MREMAP_DONTUNMAP directly in the "flags" parameter seems
weird for generically named vma manipulation functions, but as they are
all local to mremap.c then it's probably fine.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP.
  2020-02-21 17:42 ` [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP Brian Geffon
@ 2020-02-25 13:51   ` Vlastimil Babka
  2020-02-25 17:48     ` Brian Geffon
  0 siblings, 1 reply; 7+ messages in thread
From: Vlastimil Babka @ 2020-02-25 13:51 UTC (permalink / raw)
  To: Brian Geffon, Andrew Morton
  Cc: Michael S . Tsirkin, Arnd Bergmann, linux-kernel, linux-mm,
	linux-api, Andy Lutomirski, Will Deacon, Andrea Arcangeli,
	Sonny Rao, Minchan Kim, Joel Fernandes, Yu Zhao, Jesse Barnes,
	Florian Weimer, Kirill A . Shutemov, mtk.manpages, linux-man

On 2/21/20 6:42 PM, Brian Geffon wrote:
> @@ -209,10 +227,20 @@ flag was not specified.
>  .RE
>  .TP
>  .B ENOMEM
> +Not enough memory was available to complete the operation.
> +Possible causes are:
> +.RS
> +.IP * 3
>  The memory area cannot be expanded at the current virtual address, and the
>  .B MREMAP_MAYMOVE
>  flag is not set in \fIflags\fP.
>  Or, there is not enough (virtual) memory available.
> +.IP *
> +.B MREMAP_DONTUNMAP
> +was used without
> +.B MREMAP_FIXED
> +causing a new mapping to be created that would exceed the virtual memory available or it would exceed the maximum number of allowed mappings.

So this can also result with MREMAP_FIXED, no?

> +.RE
>  .SH CONFORMING TO
>  This call is Linux-specific, and should not be used in programs
>  intended to be portable.
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP.
  2020-02-25 13:51   ` Vlastimil Babka
@ 2020-02-25 17:48     ` Brian Geffon
  0 siblings, 0 replies; 7+ messages in thread
From: Brian Geffon @ 2020-02-25 17:48 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Andrew Morton, Michael S . Tsirkin, Arnd Bergmann, LKML,
	linux-mm, Linux API, Andy Lutomirski, Will Deacon,
	Andrea Arcangeli, Sonny Rao, Minchan Kim, Joel Fernandes,
	Yu Zhao, Jesse Barnes, Florian Weimer, Kirill A . Shutemov,
	mtk.manpages, linux-man

Yes, you're correct that is poorly worded. I'll send an updated
manpage closer to the release of this in case anything else changes.
Thanks for taking a look.

Brian

On Tue, Feb 25, 2020 at 5:51 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/21/20 6:42 PM, Brian Geffon wrote:
> > @@ -209,10 +227,20 @@ flag was not specified.
> >  .RE
> >  .TP
> >  .B ENOMEM
> > +Not enough memory was available to complete the operation.
> > +Possible causes are:
> > +.RS
> > +.IP * 3
> >  The memory area cannot be expanded at the current virtual address, and the
> >  .B MREMAP_MAYMOVE
> >  flag is not set in \fIflags\fP.
> >  Or, there is not enough (virtual) memory available.
> > +.IP *
> > +.B MREMAP_DONTUNMAP
> > +was used without
> > +.B MREMAP_FIXED
> > +causing a new mapping to be created that would exceed the virtual memory available or it would exceed the maximum number of allowed mappings.
>
> So this can also result with MREMAP_FIXED, no?
>
> > +.RE
> >  .SH CONFORMING TO
> >  This call is Linux-specific, and should not be used in programs
> >  intended to be portable.
> >
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, back to index

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-21 17:42 [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Brian Geffon
2020-02-21 17:42 ` [PATCH v7 2/2] selftest: Add MREMAP_DONTUNMAP selftest Brian Geffon
2020-02-21 17:42 ` [PATCH v7] mremap.2: Add information for MREMAP_DONTUNMAP Brian Geffon
2020-02-25 13:51   ` Vlastimil Babka
2020-02-25 17:48     ` Brian Geffon
2020-02-24  8:16 ` [PATCH v7 1/2] mm: Add MREMAP_DONTUNMAP to mremap() Kirill A. Shutemov
2020-02-25 13:48 ` Vlastimil Babka

Linux-man Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-man/0 linux-man/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-man linux-man/ https://lore.kernel.org/linux-man \
		linux-man@vger.kernel.org
	public-inbox-index linux-man

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-man


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git