linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse
@ 2013-10-15  1:31 Colin Cross
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  0 siblings, 1 reply; 12+ messages in thread
From: Colin Cross @ 2013-10-15  1:31 UTC (permalink / raw)
  To: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz
  Cc: Colin Cross, Andrew Morton, Sasha Levin, Rasmus Villemoes,
	Shaohua Li, open list:MEMORY MANAGEMENT

This patch refactors the madvise syscall to allow for parts of it
to be reused by a prctl syscall that affects vmas.

Move the code that walks vmas in a virtual address range into a
function that takes a function pointer as a parameter.  The only
caller for now is sys_madvise, which uses it to call
madvise_vma_behavior on each vma, but the next patch will add
an additional caller.

Move handling all vma behaviors inside madvise_behavior, and
rename it to madvise_vma_behavior.

Move the code that updates the flags on a vma, including splitting
or merging the vma as necessary, into a new function called
madvise_update_vma.  The next patch will add support for updating
a new anon_name field as well.

Signed-off-by: Colin Cross <ccross@android.com>
---
 mm/madvise.c | 272 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 151 insertions(+), 121 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..b8820fd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int behavior)
 }
 
 /*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on a region of a vma, splitting it or merging it as
+ * necessary.  Must be called with mmap_sem held for writing.
  */
-static long madvise_behavior(struct vm_area_struct * vma,
-		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev, unsigned long start,
+		     unsigned long end, unsigned long new_flags)
 {
 	struct mm_struct * mm = vma->vm_mm;
-	int error = 0;
 	pgoff_t pgoff;
-	unsigned long new_flags = vma->vm_flags;
-
-	switch (behavior) {
-	case MADV_NORMAL:
-		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
-		break;
-	case MADV_SEQUENTIAL:
-		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
-		break;
-	case MADV_RANDOM:
-		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
-		break;
-	case MADV_DONTFORK:
-		new_flags |= VM_DONTCOPY;
-		break;
-	case MADV_DOFORK:
-		if (vma->vm_flags & VM_IO) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTCOPY;
-		break;
-	case MADV_DONTDUMP:
-		new_flags |= VM_DONTDUMP;
-		break;
-	case MADV_DODUMP:
-		if (new_flags & VM_SPECIAL) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTDUMP;
-		break;
-	case MADV_MERGEABLE:
-	case MADV_UNMERGEABLE:
-		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error)
-			goto out;
-		break;
-	case MADV_HUGEPAGE:
-	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error)
-			goto out;
-		break;
-	}
+	int error;
 
 	if (new_flags == vma->vm_flags) {
 		*prev = vma;
-		goto out;
+		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	if (start != vma->vm_start) {
 		error = split_vma(mm, vma, start, 1);
 		if (error)
-			goto out;
+			return error;
 	}
 
 	if (end != vma->vm_end) {
 		error = split_vma(mm, vma, end, 0);
 		if (error)
-			goto out;
+			return error;
 	}
 
 success:
@@ -128,10 +83,7 @@ success:
 	 */
 	vma->vm_flags = new_flags;
 
-out:
-	if (error == -ENOMEM)
-		error = -EAGAIN;
-	return error;
+	return 0;
 }
 
 #ifdef CONFIG_SWAP
@@ -337,6 +289,77 @@ static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+/*
+ * Apply an madvise behavior to a region of a vma.  madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long behavior)
+{
+	int error = 0;
+	unsigned long new_flags = vma->vm_flags;
+
+	switch (behavior) {
+	case MADV_REMOVE:
+		return madvise_remove(vma, prev, start, end);
+	case MADV_WILLNEED:
+		return madvise_willneed(vma, prev, start, end);
+	case MADV_DONTNEED:
+		return madvise_dontneed(vma, prev, start, end);
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
+	case MADV_SEQUENTIAL:
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+		break;
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTCOPY;
+		break;
+	case MADV_DONTDUMP:
+		new_flags |= VM_DONTDUMP;
+		break;
+	case MADV_DODUMP:
+		if (new_flags & VM_SPECIAL) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTDUMP;
+		break;
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		error = ksm_madvise(vma, start, end, behavior, &new_flags);
+		if (error)
+			goto out;
+		break;
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+		error = hugepage_madvise(vma, &new_flags, behavior);
+		if (error)
+			goto out;
+		break;
+	}
+
+	error = madvise_update_vma(vma, prev, start, end, new_flags);
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Error injection support for memory error handling.
@@ -369,22 +392,6 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 }
 #endif
 
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		unsigned long start, unsigned long end, int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-		return madvise_remove(vma, prev, start, end);
-	case MADV_WILLNEED:
-		return madvise_willneed(vma, prev, start, end);
-	case MADV_DONTNEED:
-		return madvise_dontneed(vma, prev, start, end);
-	default:
-		return madvise_behavior(vma, prev, start, end, behavior);
-	}
-}
-
 static int
 madvise_behavior_valid(int behavior)
 {
@@ -415,6 +422,73 @@ madvise_behavior_valid(int behavior)
 }
 
 /*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range.  Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_sem held for reading or writing.
+ */
+static
+int madvise_walk_vmas(unsigned long start, unsigned long end,
+		unsigned long arg,
+		int (*visit)(struct vm_area_struct *vma,
+			struct vm_area_struct **prev, unsigned long start,
+			unsigned long end, unsigned long arg))
+{
+	struct vm_area_struct *vma;
+	struct vm_area_struct *prev;
+	unsigned long tmp;
+	int unmapped_error = 0;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 * - different from the way of handling in mlock etc.
+	 */
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	for (;;) {
+		int error;
+
+		/* Still start < end. */
+		if (!vma)
+			return -ENOMEM;
+
+		/* Here start < (end|vma->vm_end). */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+			if (start >= end)
+				break;
+		}
+
+		/* Here vma->vm_start <= start < (end|vma->vm_end) */
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+		error = visit(vma, &prev, start, tmp, arg);
+		if (error)
+			return error;
+		start = tmp;
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		if (start >= end)
+			break;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
+	}
+
+	return unmapped_error;
+}
+
+/*
  * The madvise(2) system call.
  *
  * Applications can use madvise() to advise the kernel how it should
@@ -458,9 +532,7 @@ madvise_behavior_valid(int behavior)
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	unsigned long end, tmp;
-	struct vm_area_struct * vma, *prev;
-	int unmapped_error = 0;
+	unsigned long end;
 	int error = -EINVAL;
 	int write;
 	size_t len;
@@ -495,52 +567,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	else
 		down_read(&current->mm->mmap_sem);
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 * - different from the way of handling in mlock etc.
-	 */
-	vma = find_vma_prev(current->mm, start, &prev);
-	if (vma && start > vma->vm_start)
-		prev = vma;
-
 	blk_start_plug(&plug);
-	for (;;) {
-		/* Still start < end. */
-		error = -ENOMEM;
-		if (!vma)
-			goto out;
-
-		/* Here start < (end|vma->vm_end). */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-			if (start >= end)
-				goto out;
-		}
-
-		/* Here vma->vm_start <= start < (end|vma->vm_end) */
-		tmp = vma->vm_end;
-		if (end < tmp)
-			tmp = end;
-
-		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(vma, &prev, start, tmp, behavior);
-		if (error)
-			goto out;
-		start = tmp;
-		if (prev && start < prev->vm_end)
-			start = prev->vm_end;
-		error = unmapped_error;
-		if (start >= end)
-			goto out;
-		if (prev)
-			vma = prev->vm_next;
-		else	/* madvise_remove dropped mmap_sem */
-			vma = find_vma(current->mm, start);
-	}
-out:
+	error = madvise_walk_vmas(start, end, behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
+
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">dont@kvack.org</a>

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
@ 2013-10-15  1:31 ` Colin Cross
  2013-10-15 21:21   ` Andrew Morton
                     ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Colin Cross @ 2013-10-15  1:31 UTC (permalink / raw)
  To: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz
  Cc: Colin Cross, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

In many userspace applications, and especially in VM-based
applications like those Android uses heavily, there are multiple
different allocators in use.  At a minimum there is libc malloc and the stack,
and in many cases there are libc malloc, the stack, direct syscalls to
mmap anonymous memory, and multiple VM heaps (one for small objects,
one for big objects, etc.).  Each of these layers usually has its own
tools to inspect its usage; malloc by compiling a debug version, the
VM through heap inspection tools, and for direct syscalls there is
usually no way to track them.

On Android we heavily use a set of tools that use an extended version
of the logic covered in Documentation/vm/pagemap.txt to walk all pages
mapped in userspace and slice their usage by process, shared (COW) vs.
unique mappings, backing, etc.  This can account for real physical
memory usage even in cases like fork without exec (which Android uses
heavily to share as many private COW pages as possible between
processes), Kernel SamePage Merging, and clean zero pages.  It
produces a measurement of the pages that only exist in that process
(USS, for unique), and a measurement of the physical memory usage of
that process with the cost of shared pages being evenly split between
processes that share them (PSS).

If all anonymous memory is indistinguishable then figuring out the
real physical memory usage (PSS) of each heap requires either a pagemap
walking tool that can understand the heap debugging of every layer, or
for every layer's heap debugging tools to implement the pagemap
walking logic, in which case it is hard to get a consistent view of
memory across the whole system.

This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
show a userspace-provided name for anonymous vmas.  The names of
named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
as [anon:<name>].

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The name is stored in a user pointer in the shared union in
vm_area_struct that points to a null terminated string inside
the user process.  vmas that point to the same address and are
otherwise mergeable will be merged, but vmas that point to
equivalent strings at different addresses will not be merged.

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fields that are only used on file-backed
mappings, so it does not increase memory usage.

Signed-off-by: Colin Cross <ccross@android.com>
---

v2: updates the commit message to explain in more detail why the
    patch is useful.
v3: renames vma_get_anon_name to vma_anon_name
    replaces logic in seq_print_vma_name with access_process_vm
    removes Name: entry from smaps, it's already on the header line
    changes the prctl option number to match what is currently in
       use on Android

 Documentation/filesystems/proc.txt |  2 ++
 fs/proc/task_mmu.c                 | 22 +++++++++++++++
 include/linux/mm.h                 |  5 +++-
 include/linux/mm_types.h           | 15 +++++++++++
 include/uapi/linux/prctl.h         |  3 +++
 kernel/sys.c                       | 24 +++++++++++++++++
 mm/madvise.c                       | 55 +++++++++++++++++++++++++++++++++++---
 mm/mempolicy.c                     |  2 +-
 mm/mlock.c                         |  3 ++-
 mm/mmap.c                          | 44 +++++++++++++++++-------------
 mm/mprotect.c                      |  3 ++-
 11 files changed, 152 insertions(+), 26 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d5..ec5b7d8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
  [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d8..681af03 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,6 +90,22 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	char anon_name[NAME_MAX + 1];
+	unsigned long addr;
+	int n;
+
+	n = access_remote_vm(mm, (unsigned long)vma_anon_name(vma),
+				anon_name, NAME_MAX, 0);
+	if (n > 0) {
+		seq_puts(m, "[anon:");
+		seq_write(m, anon_name, strnlen(anon_name, n));
+		seq_putc(m, ']');
+	}
+}
+
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -335,6 +351,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				pad_len_spaces(m, len);
 				seq_printf(m, "[stack:%d]", tid);
 			}
+			goto done;
+		}
+
+		if (vma_anon_name(vma)) {
+			pad_len_spaces(m, len);
+			seq_print_vma_name(m, vma);
 		}
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0c8528..36260c7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1485,7 +1485,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *);
+	struct mempolicy *, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1828,5 +1828,8 @@ void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+				unsigned long name_addr);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..6dc6667 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -255,6 +255,10 @@ struct vm_area_struct {
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree, or
 	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * in the user process containing the name given to the vma, or NULL
+	 * if unnamed.
 	 */
 	union {
 		struct {
@@ -262,6 +266,7 @@ struct vm_area_struct {
 			unsigned long rb_subtree_last;
 		} linear;
 		struct list_head nonlinear;
+		const char __user *anon_name;
 	} shared;
 
 	/*
@@ -456,4 +461,14 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_anon_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		return NULL;
+
+	return vma->shared.anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..253856a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
 
 #define PR_GET_TID_ADDRESS	40
 
+#define PR_SET_VMA		0x53564d41
+# define PR_SET_VMA_ANON_NAME		0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a7..401852f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2099,6 +2099,27 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+		unsigned long len, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	int error;
+
+	down_write(&mm->mmap_sem);
+
+	switch (opt) {
+	case PR_SET_VMA_ANON_NAME:
+		error = madvise_set_anon_name(addr, len, arg);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2262,6 +2283,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
 		return current->no_new_privs ? 1 : 0;
+	case PR_SET_VMA:
+		error = prctl_set_vma(arg2, arg3, arg4, arg5);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index b8820fd..30cb366 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -44,20 +44,22 @@ static int madvise_need_mmap_write(int behavior)
  */
 static int madvise_update_vma(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev, unsigned long start,
-		     unsigned long end, unsigned long new_flags)
+		     unsigned long end, unsigned long new_flags,
+		     const char __user *new_anon_name)
 {
 	struct mm_struct * mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int error;
 
-	if (new_flags == vma->vm_flags) {
+	if (new_flags == vma->vm_flags && new_anon_name == vma_anon_name(vma)) {
 		*prev = vma;
 		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-				vma->vm_file, pgoff, vma_policy(vma));
+				vma->vm_file, pgoff, vma_policy(vma),
+				new_anon_name);
 	if (*prev) {
 		vma = *prev;
 		goto success;
@@ -82,10 +84,30 @@ success:
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
+	if (!vma->vm_file)
+		vma->shared.anon_name = new_anon_name;
 
 	return 0;
 }
 
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long name_addr)
+{
+	int error;
+
+	/* Only anonymous mappings can be named */
+	if (vma->vm_file)
+		return -EINVAL;
+
+	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+			(const char __user *)name_addr);
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
@@ -352,7 +374,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		break;
 	}
 
-	error = madvise_update_vma(vma, prev, start, end, new_flags);
+	error = madvise_update_vma(vma, prev, start, end, new_flags,
+				vma_anon_name(vma));
 
 out:
 	if (error == -ENOMEM)
@@ -488,6 +511,30 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	return unmapped_error;
 }
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+		unsigned long name_addr)
+{
+	unsigned long end;
+	unsigned long len;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	return madvise_walk_vmas(start, end, name_addr, madvise_vma_anon_name);
+}
+
 /*
  * The madvise(2) system call.
  *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7431001..7cca5e6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
+				  new_pol, vma_anon_name(vma));
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..4692d9c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -287,7 +287,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..1f4a5b6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -893,7 +893,8 @@ again:			remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-			struct file *file, unsigned long vm_flags)
+			struct file *file, unsigned long vm_flags,
+			const char __user *anon_name)
 {
 	if (vma->vm_flags ^ vm_flags)
 		return 0;
@@ -901,6 +902,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (vma->vm_ops && vma->vm_ops->close)
 		return 0;
+	if (vma_anon_name(vma) != anon_name)
+		return 0;
 	return 1;
 }
 
@@ -931,9 +934,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -950,9 +954,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
@@ -963,9 +968,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -995,7 +1000,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 		     	struct anon_vma *anon_vma, struct file *file,
-			pgoff_t pgoff, struct mempolicy *policy)
+			pgoff_t pgoff, struct mempolicy *policy,
+			const char __user *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1021,15 +1027,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (prev && prev->vm_end == addr &&
   			mpol_equal(vma_policy(prev), policy) &&
-			can_vma_merge_after(prev, vm_flags,
-						anon_vma, file, pgoff)) {
+			can_vma_merge_after(prev, vm_flags, anon_vma,
+						file, pgoff, anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
 		if (next && end == next->vm_start &&
 				mpol_equal(policy, vma_policy(next)) &&
-				can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen) &&
+				can_vma_merge_before(next, vm_flags, anon_vma,
+						file, pgoff+pglen, anon_name) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1049,8 +1055,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (next && end == next->vm_start &&
  			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen)) {
+			can_vma_merge_before(next, vm_flags, anon_vma,
+					file, pgoff+pglen, anon_name)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = vma_adjust(prev, prev->vm_start,
 				addr, prev->vm_pgoff, NULL);
@@ -1519,7 +1525,8 @@ munmap_back:
 	/*
 	 * Can we just expand an old mapping?
 	 */
-	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+			NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2663,7 +2670,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL);
+					NULL, NULL, pgoff, NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2821,7 +2828,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_anon_name(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..09060cc 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -271,7 +271,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 */
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_anon_name(vma));
 	if (*pprev) {
 		vma = *pprev;
 		goto success;
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">dont@kvack.org</a>

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
@ 2013-10-15 21:21   ` Andrew Morton
  2013-10-15 21:32     ` Dave Hansen
  2013-10-15 21:47   ` Colin Cross
  2013-10-16  0:33   ` Minchan Kim
  2 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2013-10-15 21:21 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz, Rob Landley, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION, linux-mm

On Mon, 14 Oct 2013 18:31:17 -0700 Colin Cross <ccross@android.com> wrote:

> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
> 
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
> 
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
> 
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].

I'm pretty wobbly about this.

- Fishing around in another process's user memory for /proc strings
  is unusual and problems might crop up if we missed something.  

- Adding thing to the userspace interface is a big deal, because we
  should continue to support them evermore.  This becomes more of a
  concern when the implementation and interface is so unusual.

- I'm not aware of anyone else expressing interest in or a need for
  this extension, and Android are well able to carry their own kernel
  patches.

- otoh, it's undesirable that external groups carry their own
  patches, and we should try to get these things integrated to better
  serve our users.

So, wobble wobble.  Does anyone else have an opinion?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15 21:21   ` Andrew Morton
@ 2013-10-15 21:32     ` Dave Hansen
  0 siblings, 0 replies; 12+ messages in thread
From: Dave Hansen @ 2013-10-15 21:32 UTC (permalink / raw)
  To: Andrew Morton, Colin Cross
  Cc: linux-kernel, Pekka Enberg, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Cyrill Gorcunov, Kees Cook, Serge E. Hallyn,
	David Rientjes, Al Viro, Hugh Dickins, Rik van Riel, Mel Gorman,
	Michel Lespinasse, Tang Chen, Robin Holt, Shaohua Li,
	Sasha Levin, Johannes Weiner, Peter Zijlstra, open, list,
	DOCUMENTATION

On 10/15/2013 02:21 PM, Andrew Morton wrote:
> - Fishing around in another process's user memory for /proc strings
>   is unusual and problems might crop up if we missed something.  

FWIW, it might not be the _most_ common thing, but there is quite a bit
of precedent provided by /proc/$pid/cmdline.  We can be at least assured
that if we follow the same rules as that file we shouldn't be making the
situation any worse.  The cmdline mm->arg_start is just as
user-controlled as the pointers are in this new case.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-10-15 21:21   ` Andrew Morton
@ 2013-10-15 21:47   ` Colin Cross
  2013-10-16  0:33   ` Minchan Kim
  2 siblings, 0 replies; 12+ messages in thread
From: Colin Cross @ 2013-10-15 21:47 UTC (permalink / raw)
  To: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz
  Cc: Colin Cross, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Mon, Oct 14, 2013 at 6:31 PM, Colin Cross <ccross@android.com> wrote:
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
>
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
>
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
>
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The name is stored in a user pointer in the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process.  vmas that point to the same address and are
> otherwise mergeable will be merged, but vmas that point to
> equivalent strings at different addresses will not be merged.
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>
> ---
>
> v2: updates the commit message to explain in more detail why the
>     patch is useful.
> v3: renames vma_get_anon_name to vma_anon_name
>     replaces logic in seq_print_vma_name with access_process_vm
>     removes Name: entry from smaps, it's already on the header line
>     changes the prctl option number to match what is currently in
>        use on Android
>
>  Documentation/filesystems/proc.txt |  2 ++
>  fs/proc/task_mmu.c                 | 22 +++++++++++++++
>  include/linux/mm.h                 |  5 +++-
>  include/linux/mm_types.h           | 15 +++++++++++
>  include/uapi/linux/prctl.h         |  3 +++
>  kernel/sys.c                       | 24 +++++++++++++++++
>  mm/madvise.c                       | 55 +++++++++++++++++++++++++++++++++++---
>  mm/mempolicy.c                     |  2 +-
>  mm/mlock.c                         |  3 ++-
>  mm/mmap.c                          | 44 +++++++++++++++++-------------
>  mm/mprotect.c                      |  3 ++-
>  11 files changed, 152 insertions(+), 26 deletions(-)
>

<snip>

> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 7431001..7cca5e6 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
>                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
>                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
>                                   vma->anon_vma, vma->vm_file, pgoff,
> -                                 new_pol);
> +                                 new_pol, vma_anon_name(name));

Dumb typo here that snuck back in, this should be vma_anon_name(vma).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-10-15 21:21   ` Andrew Morton
  2013-10-15 21:47   ` Colin Cross
@ 2013-10-16  0:33   ` Minchan Kim
  2013-10-16 20:00     ` Colin Cross
  2 siblings, 1 reply; 12+ messages in thread
From: Minchan Kim @ 2013-10-16  0:33 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

Hello,

On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
> 
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
> 
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
> 
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].
> 
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
> 
> The name is stored in a user pointer in the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process.  vmas that point to the same address and are
> otherwise mergeable will be merged, but vmas that point to
> equivalent strings at different addresses will not be merged.
> 
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.

I'm not against this idea although I don't have review it in detail
but we need description to convince why it's hard to be done in
userspace.

I guess this feature would be used with allocators tightly
so my concern of kernel approach like this that it needs mmap_sem
write-side lock to split/merge vmas which is really thing
allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
that allocators have lots of complicated logic to avoid munmap which
needs mmap_sem write-side lock but this feature would make it invalid.


-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16  0:33   ` Minchan Kim
@ 2013-10-16 20:00     ` Colin Cross
  2013-10-16 20:34       ` Dave Hansen
  2013-10-17  2:47       ` Minchan Kim
  0 siblings, 2 replies; 12+ messages in thread
From: Colin Cross @ 2013-10-16 20:00 UTC (permalink / raw)
  To: Minchan Kim
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> Hello,
>
> On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
>> In many userspace applications, and especially in VM based
>> applications like Android uses heavily, there are multiple different
>> allocators in use.  At a minimum there is libc malloc and the stack,
>> and in many cases there are libc malloc, the stack, direct syscalls to
>> mmap anonymous memory, and multiple VM heaps (one for small objects,
>> one for big objects, etc.).  Each of these layers usually has its own
>> tools to inspect its usage; malloc by compiling a debug version, the
>> VM through heap inspection tools, and for direct syscalls there is
>> usually no way to track them.
>>
>> On Android we heavily use a set of tools that use an extended version
>> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
>> mapped in userspace and slice their usage by process, shared (COW) vs.
>> unique mappings, backing, etc.  This can account for real physical
>> memory usage even in cases like fork without exec (which Android uses
>> heavily to share as many private COW pages as possible between
>> processes), Kernel SamePage Merging, and clean zero pages.  It
>> produces a measurement of the pages that only exist in that process
>> (USS, for unique), and a measurement of the physical memory usage of
>> that process with the cost of shared pages being evenly split between
>> processes that share them (PSS).
>>
>> If all anonymous memory is indistinguishable then figuring out the
>> real physical memory usage (PSS) of each heap requires either a pagemap
>> walking tool that can understand the heap debugging of every layer, or
>> for every layer's heap debugging tools to implement the pagemap
>> walking logic, in which case it is hard to get a consistent view of
>> memory across the whole system.
>>
>> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
>> show a userspace-provided name for anonymous vmas.  The names of
>> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
>> as [anon:<name>].
>>
>> Userspace can set the name for a region of memory by calling
>> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> Setting the name to NULL clears it.
>>
>> The name is stored in a user pointer in the shared union in
>> vm_area_struct that points to a null terminated string inside
>> the user process.  vmas that point to the same address and are
>> otherwise mergeable will be merged, but vmas that point to
>> equivalent strings at different addresses will not be merged.
>>
>> The idea to store a userspace pointer to reduce the complexity
>> within mm (at the expense of the complexity of reading
>> /proc/pid/mem) came from Dave Hansen.  This results in no
>> runtime overhead in the mm subsystem other than comparing
>> the anon_name pointers when considering vma merging.  The pointer
>> is stored in a union with fields that are only used on file-backed
>> mappings, so it does not increase memory usage.
>
> I'm not against this idea although I don't have review it in detail
> but we need description to convince why it's hard to be done in
> userspace.

I covered the reasoning in more detail at
http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
version is that this is useful for a system-wide look at memory,
combining all processes with the kernel's knowledge of map counts and
page flags to produce a measurement of what a process' actual impact
on physical memory usage is.  Doing it in userspace would require
collating data from every allocator in every process on the system,
requiring every process to export it somehow, and then reading the
kernel information anyways to get the mapping info.

> I guess this feature would be used with allocators tightly
> so my concern of kernel approach like this that it needs mmap_sem
> write-side lock to split/merge vmas which is really thing
> allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> that allocators have lots of complicated logic to avoid munmap which
> needs mmap_sem write-side lock but this feature would make it invalid.

My expected use case is that the allocator will mmap a new large chunk
of anonymous memory, and then immediately name it, resulting in taking
the mmap_sem twice in a row.  This is the same pattern required for
example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
optimization is actually even more important if the allocator names
memory, creating a new mapping + name would require the mmap_sem
twice, although the total number of mmap_sem write locks is still
increased with naming.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:00     ` Colin Cross
@ 2013-10-16 20:34       ` Dave Hansen
  2013-10-16 20:41         ` Colin Cross
  2013-10-17  2:47       ` Minchan Kim
  1 sibling, 1 reply; 12+ messages in thread
From: Dave Hansen @ 2013-10-16 20:34 UTC (permalink / raw)
  To: Colin Cross, Minchan Kim
  Cc: lkml, Pekka Enberg, Peter Zijlstra, Ingo Molnar, Oleg Nesterov,
	Eric W. Biederman, Jan Glauber, John Stultz, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, Kees Cook, Serge E. Hallyn,
	David Rientjes, Al Viro, Hugh Dickins, Rik van Riel, Mel Gorman,
	Michel Lespinasse, Tang Chen, Robin Holt, Shaohua Li,
	Sasha Levin, Johannes Weiner, Peter Zijlstra, open, list,
	DOCUMENTATION

On 10/16/2013 01:00 PM, Colin Cross wrote:
>> > I guess this feature would be used with allocators tightly
>> > so my concern of kernel approach like this that it needs mmap_sem
>> > write-side lock to split/merge vmas which is really thing
>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>> > that allocators have lots of complicated logic to avoid munmap which
>> > needs mmap_sem write-side lock but this feature would make it invalid.
> My expected use case is that the allocator will mmap a new large chunk
> of anonymous memory, and then immediately name it, resulting in taking
> the mmap_sem twice in a row. 

I guess the prctl (or a new one) _could_ just set a kernel-internal
variable (per-thread?) that says "point any future anonymous areas at
this name".  That way, you at least have the _possibility_ of not having
to do it for _every_ mmap().

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:34       ` Dave Hansen
@ 2013-10-16 20:41         ` Colin Cross
  0 siblings, 0 replies; 12+ messages in thread
From: Colin Cross @ 2013-10-16 20:41 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Minchan Kim, lkml, Pekka Enberg, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra,
	open list:DOCUMENTATION <linux-doc@vger.kernel.org>,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 1:34 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> On 10/16/2013 01:00 PM, Colin Cross wrote:
>>> > I guess this feature would be used with allocators tightly
>>> > so my concern of kernel approach like this that it needs mmap_sem
>>> > write-side lock to split/merge vmas which is really thing
>>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>>> > that allocators have lots of complicated logic to avoid munmap which
>>> > needs mmap_sem write-side lock but this feature would make it invalid.
>> My expected use case is that the allocator will mmap a new large chunk
>> of anonymous memory, and then immediately name it, resulting in taking
>> the mmap_sem twice in a row.
>
> I guess the prctl (or a new one) _could_ just set a kernel-internal
> variable (per-thread?) that says "point any future anonymous areas at
> this name".  That way, you at least have the _possibility_ of not having
> to do it for _every_ mmap().

That won't work for multiple allocators.  A thread can easily allocate
through Java, then call into native code and allocate through malloc,
and those will need different names.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:00     ` Colin Cross
  2013-10-16 20:34       ` Dave Hansen
@ 2013-10-17  2:47       ` Minchan Kim
  2013-10-30 21:15         ` Colin Cross
  1 sibling, 1 reply; 12+ messages in thread
From: Minchan Kim @ 2013-10-17  2:47 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> > Hello,
> >
> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> >> In many userspace applications, and especially in VM based
> >> applications like Android uses heavily, there are multiple different
> >> allocators in use.  At a minimum there is libc malloc and the stack,
> >> and in many cases there are libc malloc, the stack, direct syscalls to
> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
> >> one for big objects, etc.).  Each of these layers usually has its own
> >> tools to inspect its usage; malloc by compiling a debug version, the
> >> VM through heap inspection tools, and for direct syscalls there is
> >> usually no way to track them.
> >>
> >> On Android we heavily use a set of tools that use an extended version
> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> >> mapped in userspace and slice their usage by process, shared (COW) vs.
> >> unique mappings, backing, etc.  This can account for real physical
> >> memory usage even in cases like fork without exec (which Android uses
> >> heavily to share as many private COW pages as possible between
> >> processes), Kernel SamePage Merging, and clean zero pages.  It
> >> produces a measurement of the pages that only exist in that process
> >> (USS, for unique), and a measurement of the physical memory usage of
> >> that process with the cost of shared pages being evenly split between
> >> processes that share them (PSS).
> >>
> >> If all anonymous memory is indistinguishable then figuring out the
> >> real physical memory usage (PSS) of each heap requires either a pagemap
> >> walking tool that can understand the heap debugging of every layer, or
> >> for every layer's heap debugging tools to implement the pagemap
> >> walking logic, in which case it is hard to get a consistent view of
> >> memory across the whole system.
> >>
> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> >> show a userspace-provided name for anonymous vmas.  The names of
> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> >> as [anon:<name>].
> >>
> >> Userspace can set the name for a region of memory by calling
> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >> Setting the name to NULL clears it.
> >>
> >> The name is stored in a user pointer in the shared union in
> >> vm_area_struct that points to a null terminated string inside
> >> the user process.  vmas that point to the same address and are
> >> otherwise mergeable will be merged, but vmas that point to
> >> equivalent strings at different addresses will not be merged.
> >>
> >> The idea to store a userspace pointer to reduce the complexity
> >> within mm (at the expense of the complexity of reading
> >> /proc/pid/mem) came from Dave Hansen.  This results in no
> >> runtime overhead in the mm subsystem other than comparing
> >> the anon_name pointers when considering vma merging.  The pointer
> >> is stored in a union with fields that are only used on file-backed
> >> mappings, so it does not increase memory usage.
> >
> > I'm not against this idea although I don't have review it in detail
> > but we need description to convince why it's hard to be done in
> > userspace.
> 
> I covered the reasoning in more detail at
> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
> version is that this is useful for a system-wide look at memory,
> combining all processes with the kernel's knowledge of map counts and
> page flags to produce a measurement of what a process' actual impact
> on physical memory usage is.  Doing it in userspace would require
> collating data from every allocator in every process on the system,
> requiring every process to export it somehow, and then reading the
> kernel information anyways to get the mapping info.

I agree that kernel approach would be performance win and make it easy
to collect system-wide information. That's why I am not against the idea
because I think it would be very useful on contemporary platforms.
But I doubt vma operation is proper.

BTW, as Peter and I already asked, maybe other developer in future
will have a question about that so let's remain it in git log.
"Tracking information in userspace leads to all sorts of problems.
...
...
"

> 
> > I guess this feature would be used with allocators tightly
> > so my concern of kernel approach like this that it needs mmap_sem
> > write-side lock to split/merge vmas which is really thing
> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> > that allocators have lots of complicated logic to avoid munmap which
> > needs mmap_sem write-side lock but this feature would make it invalid.
> 
> My expected use case is that the allocator will mmap a new large chunk
> of anonymous memory, and then immediately name it, resulting in taking

It makes new system call very limited.
You are assuming that this new system call should be used very carefully
inside new invented allocator which is aware of naming? So, it allocates
large chunk per name and user have to request memory with naming tag to
allocate object from chunk reserved for the name? Otherwise, large chunk
would be separated per every different name object and allocator performance
will drop.

Why couldn't we use it in application layer, not allocator itself?
I mean we can use this following as.

struct js_object *alloc_js_object(void) {
        if (pool_is_empty) {
                struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
                prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
        }

        return get_a_object_from_pool(obj_pool);
}

It could work with any allocators which are not aware of naming.
And if pool size is bigger than a chunk, the performance loss would be small.

Some insane user might want to call it per object frequently, even if it's
smaller than 4K in size. Why not? The result is that the vma scheme couldn't work.

> the mmap_sem twice in a row.  This is the same pattern required for
> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap

I guess KSM usecase would be very rare compared to naming API because
I dare to expect this feature will be very useful and be popular for lots of
platforms. Actually, our platform is considering such features and some of stack
in our platform already have owned such profiling although it's not system-wide.

Why should we bind the feature into vma? At a glance, vma binding looks good
but the result is 

1) We couldn't avoid write mmap_sem
2) We couldn't represent small size object under 4K.

Couldn't we use another data structure which represent range like
vrange interval tree I and John are implementing?

So the result would be /proc/<pid>/named_anon

It could solve above both problem all but it needs one more system call
to see /proc/<pid>/maps if you need maps information but I imagine that
gathering isn't frequent so it's not a big concern.

> optimization is actually even more important if the allocator names
> memory, creating a new mapping + name would require the mmap_sem
> twice, although the total number of mmap_sem write locks is still
> increased with naming.

> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-17  2:47       ` Minchan Kim
@ 2013-10-30 21:15         ` Colin Cross
  2013-11-01  1:30           ` Minchan Kim
  0 siblings, 1 reply; 12+ messages in thread
From: Colin Cross @ 2013-10-30 21:15 UTC (permalink / raw)
  To: Minchan Kim
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 7:47 PM, Minchan Kim <minchan@kernel.org> wrote:
> On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
>> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
>> > Hello,
>> >
>> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
>> >> In many userspace applications, and especially in VM based
>> >> applications like Android uses heavily, there are multiple different
>> >> allocators in use.  At a minimum there is libc malloc and the stack,
>> >> and in many cases there are libc malloc, the stack, direct syscalls to
>> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
>> >> one for big objects, etc.).  Each of these layers usually has its own
>> >> tools to inspect its usage; malloc by compiling a debug version, the
>> >> VM through heap inspection tools, and for direct syscalls there is
>> >> usually no way to track them.
>> >>
>> >> On Android we heavily use a set of tools that use an extended version
>> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
>> >> mapped in userspace and slice their usage by process, shared (COW) vs.
>> >> unique mappings, backing, etc.  This can account for real physical
>> >> memory usage even in cases like fork without exec (which Android uses
>> >> heavily to share as many private COW pages as possible between
>> >> processes), Kernel SamePage Merging, and clean zero pages.  It
>> >> produces a measurement of the pages that only exist in that process
>> >> (USS, for unique), and a measurement of the physical memory usage of
>> >> that process with the cost of shared pages being evenly split between
>> >> processes that share them (PSS).
>> >>
>> >> If all anonymous memory is indistinguishable then figuring out the
>> >> real physical memory usage (PSS) of each heap requires either a pagemap
>> >> walking tool that can understand the heap debugging of every layer, or
>> >> for every layer's heap debugging tools to implement the pagemap
>> >> walking logic, in which case it is hard to get a consistent view of
>> >> memory across the whole system.
>> >>
>> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
>> >> show a userspace-provided name for anonymous vmas.  The names of
>> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
>> >> as [anon:<name>].
>> >>
>> >> Userspace can set the name for a region of memory by calling
>> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> >> Setting the name to NULL clears it.
>> >>
>> >> The name is stored in a user pointer in the shared union in
>> >> vm_area_struct that points to a null terminated string inside
>> >> the user process.  vmas that point to the same address and are
>> >> otherwise mergeable will be merged, but vmas that point to
>> >> equivalent strings at different addresses will not be merged.
>> >>
>> >> The idea to store a userspace pointer to reduce the complexity
>> >> within mm (at the expense of the complexity of reading
>> >> /proc/pid/mem) came from Dave Hansen.  This results in no
>> >> runtime overhead in the mm subsystem other than comparing
>> >> the anon_name pointers when considering vma merging.  The pointer
>> >> is stored in a union with fields that are only used on file-backed
>> >> mappings, so it does not increase memory usage.
>> >
>> > I'm not against this idea although I don't have review it in detail
>> > but we need description to convince why it's hard to be done in
>> > userspace.
>>
>> I covered the reasoning in more detail at
>> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
>> version is that this is useful for a system-wide look at memory,
>> combining all processes with the kernel's knowledge of map counts and
>> page flags to produce a measurement of what a process' actual impact
>> on physical memory usage is.  Doing it in userspace would require
>> collating data from every allocator in every process on the system,
>> requiring every process to export it somehow, and then reading the
>> kernel information anyways to get the mapping info.
>
> I agree that kernel approach would be performance win and make it easy
> to collect system-wide information. That's why I am not against the idea
> because I think it would be very useful on contemporary platforms.
> But I doubt a vma operation is proper.
>
> BTW, as Peter and I already asked, maybe other developer in future
> will have a question about that so let's remain it in git log.
> "Tracking information in userspace leads to all sorts of problems.
> ...
> ...
> "
>
>>
>> > I guess this feature would be used with allocators tightly
>> > so my concern of kernel approach like this that it needs mmap_sem
>> > write-side lock to split/merge vmas which is really thing
>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>> > that allocators have lots of complicated logic to avoid munmap which
>> > needs mmap_sem write-side lock but this feature would make it invalid.
>>
>> My expected use case is that the allocator will mmap a new large chunk
>> of anonymous memory, and then immediately name it, resulting in taking
>
> It makes new system call very limited.
> You are assuming that this new system call should be used very carefully
> inside new invented allocator which is aware of naming? So, it allocates
> large chunk per name and user have to request memory with naming tag to
> allocate object from chunk reserved for the name? Otherwise, large chunk
> would be separated per every different name object and allocator performance
> will drop.

I'm not sure I understand your question.

It is normal for allocators to mmap a large chunk of anonymous memory
and then suballocate out of it to amortize the cost of the mmap across
multiple smaller allocations.  I'm proposing adding a second
syscall/grabbing the mmap_sem to this already slow path.  If a
particular allocator is limited by the mmap_sem, it can conditionally
skip the second syscall unless a "name memory" flag is set.  I expect
an allocator to have a single name that it always uses.  It would be
nice to avoid having to take the mmap_sem twice either by atomically
mmaping and naming a region of memory or by protecting the names with
something besides mmap_sem, but I can't think of a good way to
accomplish either.

> Why couldn't we use it in application layer, not allocator itself?
> I mean we can use this following as.
>
> struct js_object *alloc_js_object(void) {
>         if (pool_is_empty) {
>                 struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
>                 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
>         }
>
>         return get_a_object_from_pool(obj_pool);
> }
>
> It could work with any allocators which are not aware of naming.
> And if pool size is bigger than a chunk, the performance loss would be small.
>
> Other some insane user might want to call it per object frequently, even it's
> small size under 4K. Why not? The result is that vma scheme couldn't work.

I guess what I'm really trying to accomplish here is to name physical
pages, which is something only the kernel can track.  Naming every
page would be costly, and cause problems when different processes
wanted different names, so the closest I can get to that is to name a
process' view of physical pages, with the assumption that processes
that share a page will be using it for the same thing and so won't
name them differently.  Physical pages are a very kernel-y thing to
track, where as virtual address space, especially non-page-aligned
virtual address space, is a little more nebulous on the
kernel/userspace boundary.  Naming pages makes it clear who will name
them - whoever requested them from the kernel.  Naming address space
is less clear, what if the allocator names them and then the caller
also wants to name them?

>> the mmap_sem twice in a row.  This is the same pattern required for
>> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
>
> I guess KSM usecase would be very rare compared to naming API because
> I dare to expect this feature will be very useful and be popular for lots of
> platforms. Actually, our platform is considering such features and some of stack
> in our platform already have owned such profiling although it's not system-wide.
>
> Why should we bind the feature into vma? At a glance, vma binding looks good
> but the result is
>
> 1) We couldn't avoid write mmap_sem
> 2) We couldn't represent small size object under 4K.
>
> Couldn't we use another data structure which represent range like
> vrange interval tree I and John are implementing?
>
> So the result would be /proc/<pid>/named_anon
>
> It could solve above both problem all but it needs one more system call
> to see /proc/<pid>/maps if you need maps information but I imagine that
> gathering isn't frequent so it's not a big concern.

I chose to put it in the vma because the vmas cover exactly the right
area that I want to name for my use case, and because when determining
real system-wide memory usage only 4k aligned chunks matter.  An
anonymous memory mmap normally results in a new vma covering exactly
the allocation (ignoring merging with an adjacent anonymous mmap),
which means there is normally zero memory cost to my naming.  Your
proposal would require a vrange object for every named region.  I can
see how it would be useful, but it would increase the cost of naming
page-aligned regions significantly.  As an example, on one of my
devices I have over 11,000 named regions.  Using a range_tree_node +
userspace pointer for each one is already 500KB of memory.

>> optimization is actually even more important if the allocator names
>> memory, creating a new mapping + name would require the mmap_sem
>> twice, although the total number of mmap_sem write locks is still
>> increased with naming.
>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
> --
> Kind regards,
> Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-30 21:15         ` Colin Cross
@ 2013-11-01  1:30           ` Minchan Kim
  0 siblings, 0 replies; 12+ messages in thread
From: Minchan Kim @ 2013-11-01  1:30 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

Hello,

On Wed, Oct 30, 2013 at 02:15:37PM -0700, Colin Cross wrote:
> On Wed, Oct 16, 2013 at 7:47 PM, Minchan Kim <minchan@kernel.org> wrote:
> > On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
> >> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> >> > Hello,
> >> >
> >> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> >> >> In many userspace applications, and especially in VM based
> >> >> applications like Android uses heavily, there are multiple different
> >> >> allocators in use.  At a minimum there is libc malloc and the stack,
> >> >> and in many cases there are libc malloc, the stack, direct syscalls to
> >> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
> >> >> one for big objects, etc.).  Each of these layers usually has its own
> >> >> tools to inspect its usage; malloc by compiling a debug version, the
> >> >> VM through heap inspection tools, and for direct syscalls there is
> >> >> usually no way to track them.
> >> >>
> >> >> On Android we heavily use a set of tools that use an extended version
> >> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> >> >> mapped in userspace and slice their usage by process, shared (COW) vs.
> >> >> unique mappings, backing, etc.  This can account for real physical
> >> >> memory usage even in cases like fork without exec (which Android uses
> >> >> heavily to share as many private COW pages as possible between
> >> >> processes), Kernel SamePage Merging, and clean zero pages.  It
> >> >> produces a measurement of the pages that only exist in that process
> >> >> (USS, for unique), and a measurement of the physical memory usage of
> >> >> that process with the cost of shared pages being evenly split between
> >> >> processes that share them (PSS).
> >> >>
> >> >> If all anonymous memory is indistinguishable then figuring out the
> >> >> real physical memory usage (PSS) of each heap requires either a pagemap
> >> >> walking tool that can understand the heap debugging of every layer, or
> >> >> for every layer's heap debugging tools to implement the pagemap
> >> >> walking logic, in which case it is hard to get a consistent view of
> >> >> memory across the whole system.
> >> >>
> >> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> >> >> show a userspace-provided name for anonymous vmas.  The names of
> >> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> >> >> as [anon:<name>].
> >> >>
> >> >> Userspace can set the name for a region of memory by calling
> >> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >> >> Setting the name to NULL clears it.
> >> >>
> >> >> The name is stored in a user pointer in the shared union in
> >> >> vm_area_struct that points to a null terminated string inside
> >> >> the user process.  vmas that point to the same address and are
> >> >> otherwise mergeable will be merged, but vmas that point to
> >> >> equivalent strings at different addresses will not be merged.
> >> >>
> >> >> The idea to store a userspace pointer to reduce the complexity
> >> >> within mm (at the expense of the complexity of reading
> >> >> /proc/pid/mem) came from Dave Hansen.  This results in no
> >> >> runtime overhead in the mm subsystem other than comparing
> >> >> the anon_name pointers when considering vma merging.  The pointer
> >> >> is stored in a union with fields that are only used on file-backed
> >> >> mappings, so it does not increase memory usage.
> >> >
> >> > I'm not against this idea although I don't have review it in detail
> >> > but we need description to convince why it's hard to be done in
> >> > userspace.
> >>
> >> I covered the reasoning in more detail at
> >> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
> >> version is that this is useful for a system-wide look at memory,
> >> combining all processes with the kernel's knowledge of map counts and
> >> page flags to produce a measurement of what a process' actual impact
> >> on physical memory usage is.  Doing it in userspace would require
> >> collating data from every allocator in every process on the system,
> >> requiring every process to export it somehow, and then reading the
> >> kernel information anyways to get the mapping info.
> >
> > I agree that kernel approach would be performance win and make it easy
> > to collect system-wide information. That's why I am not against the idea
> > because I think it would be very useful on contemporary platforms.
> > But I doubt a vma operation is proper.
> >
> > BTW, as Peter and I already asked, maybe other developer in future
> > will have a question about that so let's remain it in git log.
> > "Tracking information in userspace leads to all sorts of problems.
> > ...
> > ...
> > "
> >
> >>
> >> > I guess this feature would be used with allocators tightly
> >> > so my concern of kernel approach like this that it needs mmap_sem
> >> > write-side lock to split/merge vmas which is really thing
> >> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> >> > that allocators have lots of complicated logic to avoid munmap which
> >> > needs mmap_sem write-side lock but this feature would make it invalid.
> >>
> >> My expected use case is that the allocator will mmap a new large chunk
> >> of anonymous memory, and then immediately name it, resulting in taking
> >
> > It makes new system call very limited.
> > You are assuming that this new system call should be used very carefully
> > inside new invented allocator which is aware of naming? So, it allocates
> > large chunk per name and user have to request memory with naming tag to
> > allocate object from chunk reserved for the name? Otherwise, large chunk
> > would be separated per every different name object and allocator performance
> > will drop.
> 
> I'm not sure I understand your question.
> 
> It is normal for allocators to mmap a large chunk of anonymous memory
> and then suballocate out of it to amortize the cost of the mmap across
> multiple smaller allocations.  I'm proposing adding a second
> syscall/grabbing the mmap_sem to this already slow path.  If a
> particular allocator is limited by the mmap_sem, it can conditionally
> skip the second syscall unless a "name memory" flag is set.  I expect
> an allocator to have a single name that it always uses.  It would be

I think it's very limited.
My requirement is that I'd like to name any anon object in process so that
a daemon in the platform could gather all important object statistics easily
from all of process which share some libraries.
For it, I don't want to replace my allocator(ex, jemalloc) with naming-aware
allocator like malloc(sizeofobject, "name") which could mmap a large of
anonymous memory per name.

> nice to avoid having to take the mmap_sem twice either by atomically
> mmaping and naming a region of memory or by protecting the names with
> something besides mmap_sem, but I can't think of a good way to
> accomplish either.

Yes, it's stuff related with allocator so it should be very sensitive with
alloc/fault performance. If we really care of it, we would need another data
structure to avoid the loss.

> 
> > Why couldn't we use it in application layer, not allocator itself?
> > I mean we can use this following as.
> >
> > struct js_object *alloc_js_object(void) {
> >         if (pool_is_empty) {
> >                 struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
> >                 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
> >         }
> >
> >         return get_a_object_from_pool(obj_pool);
> > }
> >
> > It could work with any allocators which are not aware of naming.
> > And if pool size is bigger than a chunk, the performance loss would be small.
> >
> > Other some insane user might want to call it per object frequently, even it's
> > small size under 4K. Why not? The result is that vma scheme couldn't work.
> 
> I guess what I'm really trying to accomplish here is to name physical
> pages, which is something only the kernel can track.  Naming every

It seems the difference between you and me. You want to tag page
but I want object. And object includes page.

> page would be costly, and cause problems when different processes
> wanted different names, so the closest I can get to that is to name a
> process' view of physical pages, with the assumption that processes
> that share a page will be using it for the same thing and so won't
> name them differently.  Physical pages are a very kernel-y thing to

If the page is shared, it does make sense, but it makes the new system call
too limited as well.

> track, where as virtual address space, especially non-page-aligned
> virtual address space, is a little more nebulous on the
> kernel/userspace boundary.  Naming pages makes it clear who will name
> them - whoever requested them from the kernel.  Naming address space
> is less clear, what if the allocator names them and then the caller
> also wants to name them?

In that case, caller first because upper layer has more clear view.

> 
> >> the mmap_sem twice in a row.  This is the same pattern required for
> >> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
> >
> > I guess KSM usecase would be very rare compared to naming API because
> > I dare to expect this feature will be very useful and be popular for lots of
> > platforms. Actually, our platform is considering such features and some of stack
> > in our platform already have owned such profiling although it's not system-wide.
> >
> > Why should we bind the feature into vma? At a glance, vma binding looks good
> > but the result is
> >
> > 1) We couldn't avoid write mmap_sem
> > 2) We couldn't represent small size object under 4K.
> >
> > Couldn't we use another data structure which represent range like
> > vrange interval tree I and John are implementing?
> >
> > So the result would be /proc/<pid>/named_anon
> >
> > It could solve above both problem all but it needs one more system call
> > to see /proc/<pid>/maps if you need maps information but I imagine that
> > gathering isn't frequent so it's not a big concern.
> 
> I chose to put it in the vma because the vmas cover exactly the right
> area that I want to name for my use case, and because when determining
> real system-wide memory usage only 4k aligned chunks matter.  An
> anonymous memory mmap normally results in a new vma covering exactly
> the allocation (ignoring merging with an adjacent anonymous mmap),
> which means there is normally zero memory cost to my naming.  Your
> proposal would require a vrange object for every named region.  I can
> see how it would be useful, but it would increase the cost of naming
> page-aligned regions significantly.  As an example, on one of my
> devices I have over 11,000 named regions.  Using a range_tree_node +
> userspace pointer for each one is already 500KB of memory.

In 32bit, 300K anyway, it could be huge for embedded device but with your
approach could need vm_area_struct if space is needed to split by new
system call so memory would be more significant.

> 
> >> optimization is actually even more important if the allocator names
> >> memory, creating a new mapping + name would require the mmap_sem
> >> twice, although the total number of mmap_sem write locks is still
> >> increased with naming.
> >
> >>
> >> --
> >> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> >> the body to majordomo@kvack.org.  For more info on Linux MM,
> >> see: http://www.linux-mm.org/ .
> >> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> >
> > --
> > Kind regards,
> > Minchan Kim
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2013-11-01  1:30 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-15  1:31 [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
2013-10-15 21:21   ` Andrew Morton
2013-10-15 21:32     ` Dave Hansen
2013-10-15 21:47   ` Colin Cross
2013-10-16  0:33   ` Minchan Kim
2013-10-16 20:00     ` Colin Cross
2013-10-16 20:34       ` Dave Hansen
2013-10-16 20:41         ` Colin Cross
2013-10-17  2:47       ` Minchan Kim
2013-10-30 21:15         ` Colin Cross
2013-11-01  1:30           ` Minchan Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).