linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse
@ 2013-10-15  1:31 Colin Cross
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  0 siblings, 1 reply; 44+ messages in thread
From: Colin Cross @ 2013-10-15  1:31 UTC (permalink / raw)
  To: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz
  Cc: Colin Cross, Andrew Morton, Sasha Levin, Rasmus Villemoes,
	Shaohua Li, open list:MEMORY MANAGEMENT

This patch refactors the madvise syscall to allow for parts of it
to be reused by a prctl syscall that affects vmas.

Move the code that walks vmas in a virtual address range into a
function that takes a function pointer as a parameter.  The only
caller for now is sys_madvise, which uses it to call
madvise_vma_behavior on each vma, but the next patch will add
an additional caller.

Move the handling of all vma behaviors into madvise_behavior, and
rename it to madvise_vma_behavior.

Move the code that updates the flags on a vma, including splitting
or merging the vma as necessary, into a new function called
madvise_update_vma.  The next patch will add support for updating
a new anon_name field as well.

Signed-off-by: Colin Cross <ccross@android.com>
---
 mm/madvise.c | 272 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 151 insertions(+), 121 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..b8820fd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -39,65 +39,20 @@ static int madvise_need_mmap_write(int behavior)
 }
 
 /*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on a region of a vma, splitting it or merging it as
+ * necessary.  Must be called with mmap_sem held for writing.
  */
-static long madvise_behavior(struct vm_area_struct * vma,
-		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev, unsigned long start,
+		     unsigned long end, unsigned long new_flags)
 {
 	struct mm_struct * mm = vma->vm_mm;
-	int error = 0;
 	pgoff_t pgoff;
-	unsigned long new_flags = vma->vm_flags;
-
-	switch (behavior) {
-	case MADV_NORMAL:
-		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
-		break;
-	case MADV_SEQUENTIAL:
-		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
-		break;
-	case MADV_RANDOM:
-		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
-		break;
-	case MADV_DONTFORK:
-		new_flags |= VM_DONTCOPY;
-		break;
-	case MADV_DOFORK:
-		if (vma->vm_flags & VM_IO) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTCOPY;
-		break;
-	case MADV_DONTDUMP:
-		new_flags |= VM_DONTDUMP;
-		break;
-	case MADV_DODUMP:
-		if (new_flags & VM_SPECIAL) {
-			error = -EINVAL;
-			goto out;
-		}
-		new_flags &= ~VM_DONTDUMP;
-		break;
-	case MADV_MERGEABLE:
-	case MADV_UNMERGEABLE:
-		error = ksm_madvise(vma, start, end, behavior, &new_flags);
-		if (error)
-			goto out;
-		break;
-	case MADV_HUGEPAGE:
-	case MADV_NOHUGEPAGE:
-		error = hugepage_madvise(vma, &new_flags, behavior);
-		if (error)
-			goto out;
-		break;
-	}
+	int error;
 
 	if (new_flags == vma->vm_flags) {
 		*prev = vma;
-		goto out;
+		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -113,13 +68,13 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	if (start != vma->vm_start) {
 		error = split_vma(mm, vma, start, 1);
 		if (error)
-			goto out;
+			return error;
 	}
 
 	if (end != vma->vm_end) {
 		error = split_vma(mm, vma, end, 0);
 		if (error)
-			goto out;
+			return error;
 	}
 
 success:
@@ -128,10 +83,7 @@ success:
 	 */
 	vma->vm_flags = new_flags;
 
-out:
-	if (error == -ENOMEM)
-		error = -EAGAIN;
-	return error;
+	return 0;
 }
 
 #ifdef CONFIG_SWAP
@@ -337,6 +289,77 @@ static long madvise_remove(struct vm_area_struct *vma,
 	return error;
 }
 
+/*
+ * Apply an madvise behavior to a region of a vma.  madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long behavior)
+{
+	int error = 0;
+	unsigned long new_flags = vma->vm_flags;
+
+	switch (behavior) {
+	case MADV_REMOVE:
+		return madvise_remove(vma, prev, start, end);
+	case MADV_WILLNEED:
+		return madvise_willneed(vma, prev, start, end);
+	case MADV_DONTNEED:
+		return madvise_dontneed(vma, prev, start, end);
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
+	case MADV_SEQUENTIAL:
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+		break;
+	case MADV_RANDOM:
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+		break;
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTCOPY;
+		break;
+	case MADV_DONTDUMP:
+		new_flags |= VM_DONTDUMP;
+		break;
+	case MADV_DODUMP:
+		if (new_flags & VM_SPECIAL) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_flags &= ~VM_DONTDUMP;
+		break;
+	case MADV_MERGEABLE:
+	case MADV_UNMERGEABLE:
+		error = ksm_madvise(vma, start, end, behavior, &new_flags);
+		if (error)
+			goto out;
+		break;
+	case MADV_HUGEPAGE:
+	case MADV_NOHUGEPAGE:
+		error = hugepage_madvise(vma, &new_flags, behavior);
+		if (error)
+			goto out;
+		break;
+	}
+
+	error = madvise_update_vma(vma, prev, start, end, new_flags);
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_MEMORY_FAILURE
 /*
  * Error injection support for memory error handling.
@@ -369,22 +392,6 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 }
 #endif
 
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		unsigned long start, unsigned long end, int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-		return madvise_remove(vma, prev, start, end);
-	case MADV_WILLNEED:
-		return madvise_willneed(vma, prev, start, end);
-	case MADV_DONTNEED:
-		return madvise_dontneed(vma, prev, start, end);
-	default:
-		return madvise_behavior(vma, prev, start, end, behavior);
-	}
-}
-
 static int
 madvise_behavior_valid(int behavior)
 {
@@ -415,6 +422,73 @@ madvise_behavior_valid(int behavior)
 }
 
 /*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range.  Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_sem held for reading or writing.
+ */
+static
+int madvise_walk_vmas(unsigned long start, unsigned long end,
+		unsigned long arg,
+		int (*visit)(struct vm_area_struct *vma,
+			struct vm_area_struct **prev, unsigned long start,
+			unsigned long end, unsigned long arg))
+{
+	struct vm_area_struct *vma;
+	struct vm_area_struct *prev;
+	unsigned long tmp;
+	int unmapped_error = 0;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 * - different from the way of handling in mlock etc.
+	 */
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	for (;;) {
+		int error;
+
+		/* Still start < end. */
+		if (!vma)
+			return -ENOMEM;
+
+		/* Here start < (end|vma->vm_end). */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+			if (start >= end)
+				break;
+		}
+
+		/* Here vma->vm_start <= start < (end|vma->vm_end) */
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+		error = visit(vma, &prev, start, tmp, arg);
+		if (error)
+			return error;
+		start = tmp;
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		if (start >= end)
+			break;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
+	}
+
+	return unmapped_error;
+}
+
+/*
  * The madvise(2) system call.
  *
  * Applications can use madvise() to advise the kernel how it should
@@ -458,9 +532,7 @@ madvise_behavior_valid(int behavior)
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	unsigned long end, tmp;
-	struct vm_area_struct * vma, *prev;
-	int unmapped_error = 0;
+	unsigned long end;
 	int error = -EINVAL;
 	int write;
 	size_t len;
@@ -495,52 +567,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	else
 		down_read(&current->mm->mmap_sem);
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 * - different from the way of handling in mlock etc.
-	 */
-	vma = find_vma_prev(current->mm, start, &prev);
-	if (vma && start > vma->vm_start)
-		prev = vma;
-
 	blk_start_plug(&plug);
-	for (;;) {
-		/* Still start < end. */
-		error = -ENOMEM;
-		if (!vma)
-			goto out;
-
-		/* Here start < (end|vma->vm_end). */
-		if (start < vma->vm_start) {
-			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-			if (start >= end)
-				goto out;
-		}
-
-		/* Here vma->vm_start <= start < (end|vma->vm_end) */
-		tmp = vma->vm_end;
-		if (end < tmp)
-			tmp = end;
-
-		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(vma, &prev, start, tmp, behavior);
-		if (error)
-			goto out;
-		start = tmp;
-		if (prev && start < prev->vm_end)
-			start = prev->vm_end;
-		error = unmapped_error;
-		if (start >= end)
-			goto out;
-		if (prev)
-			vma = prev->vm_next;
-		else	/* madvise_remove dropped mmap_sem */
-			vma = find_vma(current->mm, start);
-	}
-out:
+	error = madvise_walk_vmas(start, end, behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
+
 	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
@ 2013-10-15  1:31 ` Colin Cross
  2013-10-15 21:21   ` Andrew Morton
                     ` (2 more replies)
  0 siblings, 3 replies; 44+ messages in thread
From: Colin Cross @ 2013-10-15  1:31 UTC (permalink / raw)
  To: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz
  Cc: Colin Cross, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

In many userspace applications, and especially in the VM-based
applications that Android uses heavily, there are multiple different
allocators in use.  At a minimum there is libc malloc and the stack,
and in many cases there are libc malloc, the stack, direct syscalls to
mmap anonymous memory, and multiple VM heaps (one for small objects,
one for big objects, etc.).  Each of these layers usually has its own
tools to inspect its usage; malloc by compiling a debug version, the
VM through heap inspection tools, and for direct syscalls there is
usually no way to track them.

On Android we heavily use a set of tools that use an extended version
of the logic covered in Documentation/vm/pagemap.txt to walk all pages
mapped in userspace and slice their usage by process, shared (COW) vs.
unique mappings, backing, etc.  This can account for real physical
memory usage even in cases like fork without exec (which Android uses
heavily to share as many private COW pages as possible between
processes), Kernel SamePage Merging, and clean zero pages.  It
produces a measurement of the pages that only exist in that process
(USS, for unique), and a measurement of the physical memory usage of
that process with the cost of shared pages being evenly split between
processes that share them (PSS).

If all anonymous memory is indistinguishable then figuring out the
real physical memory usage (PSS) of each heap requires either a pagemap
walking tool that can understand the heap debugging of every layer, or
for every layer's heap debugging tools to implement the pagemap
walking logic, in which case it is hard to get a consistent view of
memory across the whole system.

This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
show a userspace-provided name for anonymous vmas.  The names of
named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
as [anon:<name>].

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The name is stored in a user pointer in the shared union in
vm_area_struct that points to a null terminated string inside
the user process.  vmas that point to the same address and are
otherwise mergeable will be merged, but vmas that point to
equivalent strings at different addresses will not be merged.

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fields that are only used on file-backed
mappings, so it does not increase memory usage.

Signed-off-by: Colin Cross <ccross@android.com>
---

v2: updates the commit message to explain in more detail why the
    patch is useful.
v3: renames vma_get_anon_name to vma_anon_name
    replaces logic in seq_print_vma_name with access_remote_vm
    removes Name: entry from smaps, it's already on the header line
    changes the prctl option number to match what is currently in
       use on Android

 Documentation/filesystems/proc.txt |  2 ++
 fs/proc/task_mmu.c                 | 22 +++++++++++++++
 include/linux/mm.h                 |  5 +++-
 include/linux/mm_types.h           | 15 +++++++++++
 include/uapi/linux/prctl.h         |  3 +++
 kernel/sys.c                       | 24 +++++++++++++++++
 mm/madvise.c                       | 55 +++++++++++++++++++++++++++++++++++---
 mm/mempolicy.c                     |  2 +-
 mm/mlock.c                         |  3 ++-
 mm/mmap.c                          | 44 +++++++++++++++++-------------
 mm/mprotect.c                      |  3 ++-
 11 files changed, 152 insertions(+), 26 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d5..ec5b7d8 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
  [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d8..681af03 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,6 +90,22 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	char anon_name[NAME_MAX + 1];
+	unsigned long addr;
+	int n;
+
+	n = access_remote_vm(mm, (unsigned long)vma_anon_name(vma),
+				anon_name, NAME_MAX, 0);
+	if (n > 0) {
+		seq_puts(m, "[anon:");
+		seq_write(m, anon_name, strnlen(anon_name, n));
+		seq_putc(m, ']');
+	}
+}
+
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -335,6 +351,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				pad_len_spaces(m, len);
 				seq_printf(m, "[stack:%d]", tid);
 			}
+			goto done;
+		}
+
+		if (vma_anon_name(vma)) {
+			pad_len_spaces(m, len);
+			seq_print_vma_name(m, vma);
 		}
 	}
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0c8528..36260c7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1485,7 +1485,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *);
+	struct mempolicy *, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1828,5 +1828,8 @@ void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+				unsigned long name_addr);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..6dc6667 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -255,6 +255,10 @@ struct vm_area_struct {
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree, or
 	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * in the user process containing the name given to the vma, or NULL
+	 * if unnamed.
 	 */
 	union {
 		struct {
@@ -262,6 +266,7 @@ struct vm_area_struct {
 			unsigned long rb_subtree_last;
 		} linear;
 		struct list_head nonlinear;
+		const char __user *anon_name;
 	} shared;
 
 	/*
@@ -456,4 +461,14 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_anon_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		return NULL;
+
+	return vma->shared.anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..253856a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
 
 #define PR_GET_TID_ADDRESS	40
 
+#define PR_SET_VMA		0x53564d41
+# define PR_SET_VMA_ANON_NAME		0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a7..401852f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2099,6 +2099,27 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+		unsigned long len, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	int error;
+
+	down_write(&mm->mmap_sem);
+
+	switch (opt) {
+	case PR_SET_VMA_ANON_NAME:
+		error = madvise_set_anon_name(addr, len, arg);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2262,6 +2283,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
 		return current->no_new_privs ? 1 : 0;
+	case PR_SET_VMA:
+		error = prctl_set_vma(arg2, arg3, arg4, arg5);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index b8820fd..30cb366 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -44,20 +44,22 @@ static int madvise_need_mmap_write(int behavior)
  */
 static int madvise_update_vma(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev, unsigned long start,
-		     unsigned long end, unsigned long new_flags)
+		     unsigned long end, unsigned long new_flags,
+		     const char __user *new_anon_name)
 {
 	struct mm_struct * mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int error;
 
-	if (new_flags == vma->vm_flags) {
+	if (new_flags == vma->vm_flags && new_anon_name == vma_anon_name(vma)) {
 		*prev = vma;
 		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-				vma->vm_file, pgoff, vma_policy(vma));
+				vma->vm_file, pgoff, vma_policy(vma),
+				new_anon_name);
 	if (*prev) {
 		vma = *prev;
 		goto success;
@@ -82,10 +84,30 @@ success:
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
+	if (!vma->vm_file)
+		vma->shared.anon_name = new_anon_name;
 
 	return 0;
 }
 
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long name_addr)
+{
+	int error;
+
+	/* Only anonymous mappings can be named */
+	if (vma->vm_file)
+		return -EINVAL;
+
+	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+			(const char __user *)name_addr);
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
@@ -352,7 +374,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		break;
 	}
 
-	error = madvise_update_vma(vma, prev, start, end, new_flags);
+	error = madvise_update_vma(vma, prev, start, end, new_flags,
+				vma_anon_name(vma));
 
 out:
 	if (error == -ENOMEM)
@@ -488,6 +511,30 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	return unmapped_error;
 }
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+		unsigned long name_addr)
+{
+	unsigned long end;
+	unsigned long len;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	return madvise_walk_vmas(start, end, name_addr, madvise_vma_anon_name);
+}
+
 /*
  * The madvise(2) system call.
  *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7431001..7cca5e6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
+				  new_pol, vma_anon_name(name));
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..4692d9c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -287,7 +287,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..1f4a5b6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -893,7 +893,8 @@ again:			remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-			struct file *file, unsigned long vm_flags)
+			struct file *file, unsigned long vm_flags,
+			const char __user *anon_name)
 {
 	if (vma->vm_flags ^ vm_flags)
 		return 0;
@@ -901,6 +902,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (vma->vm_ops && vma->vm_ops->close)
 		return 0;
+	if (vma_anon_name(vma) != anon_name)
+		return 0;
 	return 1;
 }
 
@@ -931,9 +934,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -950,9 +954,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
@@ -963,9 +968,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -995,7 +1000,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 		     	struct anon_vma *anon_vma, struct file *file,
-			pgoff_t pgoff, struct mempolicy *policy)
+			pgoff_t pgoff, struct mempolicy *policy,
+			const char __user *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1021,15 +1027,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (prev && prev->vm_end == addr &&
   			mpol_equal(vma_policy(prev), policy) &&
-			can_vma_merge_after(prev, vm_flags,
-						anon_vma, file, pgoff)) {
+			can_vma_merge_after(prev, vm_flags, anon_vma,
+						file, pgoff, anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
 		if (next && end == next->vm_start &&
 				mpol_equal(policy, vma_policy(next)) &&
-				can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen) &&
+				can_vma_merge_before(next, vm_flags, anon_vma,
+						file, pgoff+pglen, anon_name) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1049,8 +1055,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (next && end == next->vm_start &&
  			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen)) {
+			can_vma_merge_before(next, vm_flags, anon_vma,
+					file, pgoff+pglen, anon_name)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = vma_adjust(prev, prev->vm_start,
 				addr, prev->vm_pgoff, NULL);
@@ -1519,7 +1525,8 @@ munmap_back:
 	/*
 	 * Can we just expand an old mapping?
 	 */
-	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+			NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2663,7 +2670,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL);
+					NULL, NULL, pgoff, NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2821,7 +2828,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_anon_name(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..09060cc 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -271,7 +271,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 */
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_anon_name(vma));
 	if (*pprev) {
 		vma = *pprev;
 		goto success;
-- 
1.8.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
@ 2013-10-15 21:21   ` Andrew Morton
  2013-10-15 21:32     ` Dave Hansen
  2013-10-15 21:47   ` Colin Cross
  2013-10-16  0:33   ` Minchan Kim
  2 siblings, 1 reply; 44+ messages in thread
From: Andrew Morton @ 2013-10-15 21:21 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz, Rob Landley, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION, linux-mm

On Mon, 14 Oct 2013 18:31:17 -0700 Colin Cross <ccross@android.com> wrote:

> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
> 
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
> 
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
> 
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].

I'm pretty wobbly about this.

- Fishing around in another process's user memory for /proc strings
  is unusual and problems might crop up if we missed something.  

- Adding things to the userspace interface is a big deal, because we
  should continue to support them evermore.  This becomes more of a
  concern when the implementation and interface are so unusual.

- I'm not aware of anyone else expressing interest in or a need for
  this extension, and Android are well able to carry their own kernel
  patches.

- otoh, it's undesirable that external groups carry their own
  patches, and we should try to get these things integrated to better
  serve our users.

So, wobble wobble.  Does anyone else have an opinion?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15 21:21   ` Andrew Morton
@ 2013-10-15 21:32     ` Dave Hansen
  0 siblings, 0 replies; 44+ messages in thread
From: Dave Hansen @ 2013-10-15 21:32 UTC (permalink / raw)
  To: Andrew Morton, Colin Cross
  Cc: linux-kernel, Pekka Enberg, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Cyrill Gorcunov, Kees Cook, Serge E. Hallyn,
	David Rientjes, Al Viro, Hugh Dickins, Rik van Riel, Mel Gorman,
	Michel Lespinasse, Tang Chen, Robin Holt, Shaohua Li,
	Sasha Levin, Johannes Weiner, Peter Zijlstra, open, list,
	DOCUMENTATION

On 10/15/2013 02:21 PM, Andrew Morton wrote:
> - Fishing around in another process's user memory for /proc strings
>   is unusual and problems might crop up if we missed something.  

FWIW, it might not be the _most_ common thing, but there is quite a bit
of precedent provided by /proc/$pid/cmdline.  We can be at least assured
that if we follow the same rules as that file we shouldn't be making the
situation any worse.  The cmdline mm->arg_start is just as
user-controlled as the pointers are in this new case.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-10-15 21:21   ` Andrew Morton
@ 2013-10-15 21:47   ` Colin Cross
  2013-10-16  0:33   ` Minchan Kim
  2 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-10-15 21:47 UTC (permalink / raw)
  To: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz
  Cc: Colin Cross, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Mon, Oct 14, 2013 at 6:31 PM, Colin Cross <ccross@android.com> wrote:
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
>
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
>
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
>
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The name is stored in a user pointer in the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process.  vmas that point to the same address and are
> otherwise mergeable will be merged, but vmas that point to
> equivalent strings at different addresses will not be merged.
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>
> ---
>
> v2: updates the commit message to explain in more detail why the
>     patch is useful.
> v3: renames vma_get_anon_name to vma_anon_name
>     replaces logic in seq_print_vma_name with access_process_vm
>     removes Name: entry from smaps, it's already on the header line
>     changes the prctl option number to match what is currently in
>        use on Android
>
>  Documentation/filesystems/proc.txt |  2 ++
>  fs/proc/task_mmu.c                 | 22 +++++++++++++++
>  include/linux/mm.h                 |  5 +++-
>  include/linux/mm_types.h           | 15 +++++++++++
>  include/uapi/linux/prctl.h         |  3 +++
>  kernel/sys.c                       | 24 +++++++++++++++++
>  mm/madvise.c                       | 55 +++++++++++++++++++++++++++++++++++---
>  mm/mempolicy.c                     |  2 +-
>  mm/mlock.c                         |  3 ++-
>  mm/mmap.c                          | 44 +++++++++++++++++-------------
>  mm/mprotect.c                      |  3 ++-
>  11 files changed, 152 insertions(+), 26 deletions(-)
>

<snip>

> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 7431001..7cca5e6 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
>                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
>                 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
>                                   vma->anon_vma, vma->vm_file, pgoff,
> -                                 new_pol);
> +                                 new_pol, vma_anon_name(name));

Dumb typo here that snuck back in, this should be vma_anon_name(vma).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-10-15 21:21   ` Andrew Morton
  2013-10-15 21:47   ` Colin Cross
@ 2013-10-16  0:33   ` Minchan Kim
  2013-10-16 20:00     ` Colin Cross
  2 siblings, 1 reply; 44+ messages in thread
From: Minchan Kim @ 2013-10-16  0:33 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Pekka Enberg, Dave Hansen, Peter Zijlstra,
	Ingo Molnar, Oleg Nesterov, Eric W. Biederman, Jan Glauber,
	John Stultz, Rob Landley, Andrew Morton, Cyrill Gorcunov,
	Kees Cook, Serge E. Hallyn, David Rientjes, Al Viro,
	Hugh Dickins, Rik van Riel, Mel Gorman, Michel Lespinasse,
	Tang Chen, Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

Hello,

On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
> 
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).
> 
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage (PSS) of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
> 
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> show a userspace-provided name for anonymous vmas.  The names of
> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> as [anon:<name>].
> 
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
> 
> The name is stored in a user pointer in the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process.  vmas that point to the same address and are
> otherwise mergeable will be merged, but vmas that point to
> equivalent strings at different addresses will not be merged.
> 
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.

I'm not against this idea, although I haven't reviewed it in detail,
but we need a description that explains convincingly why this is hard
to do in userspace.

I guess this feature would be used in tight coupling with allocators,
so my concern with a kernel approach like this is that it needs the
mmap_sem write-side lock to split/merge vmas.  That is exactly what
allocators (e.g. tcmalloc, jemalloc) want to avoid for performance
reasons: they have lots of complicated logic to avoid munmap, which
needs the mmap_sem write-side lock, and this feature would defeat that
effort.


-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16  0:33   ` Minchan Kim
@ 2013-10-16 20:00     ` Colin Cross
  2013-10-16 20:34       ` Dave Hansen
  2013-10-17  2:47       ` Minchan Kim
  0 siblings, 2 replies; 44+ messages in thread
From: Colin Cross @ 2013-10-16 20:00 UTC (permalink / raw)
  To: Minchan Kim
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> Hello,
>
> On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
>> In many userspace applications, and especially in VM based
>> applications like Android uses heavily, there are multiple different
>> allocators in use.  At a minimum there is libc malloc and the stack,
>> and in many cases there are libc malloc, the stack, direct syscalls to
>> mmap anonymous memory, and multiple VM heaps (one for small objects,
>> one for big objects, etc.).  Each of these layers usually has its own
>> tools to inspect its usage; malloc by compiling a debug version, the
>> VM through heap inspection tools, and for direct syscalls there is
>> usually no way to track them.
>>
>> On Android we heavily use a set of tools that use an extended version
>> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
>> mapped in userspace and slice their usage by process, shared (COW) vs.
>> unique mappings, backing, etc.  This can account for real physical
>> memory usage even in cases like fork without exec (which Android uses
>> heavily to share as many private COW pages as possible between
>> processes), Kernel SamePage Merging, and clean zero pages.  It
>> produces a measurement of the pages that only exist in that process
>> (USS, for unique), and a measurement of the physical memory usage of
>> that process with the cost of shared pages being evenly split between
>> processes that share them (PSS).
>>
>> If all anonymous memory is indistinguishable then figuring out the
>> real physical memory usage (PSS) of each heap requires either a pagemap
>> walking tool that can understand the heap debugging of every layer, or
>> for every layer's heap debugging tools to implement the pagemap
>> walking logic, in which case it is hard to get a consistent view of
>> memory across the whole system.
>>
>> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
>> show a userspace-provided name for anonymous vmas.  The names of
>> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
>> as [anon:<name>].
>>
>> Userspace can set the name for a region of memory by calling
>> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> Setting the name to NULL clears it.
>>
>> The name is stored in a user pointer in the shared union in
>> vm_area_struct that points to a null terminated string inside
>> the user process.  vmas that point to the same address and are
>> otherwise mergeable will be merged, but vmas that point to
>> equivalent strings at different addresses will not be merged.
>>
>> The idea to store a userspace pointer to reduce the complexity
>> within mm (at the expense of the complexity of reading
>> /proc/pid/mem) came from Dave Hansen.  This results in no
>> runtime overhead in the mm subsystem other than comparing
>> the anon_name pointers when considering vma merging.  The pointer
>> is stored in a union with fields that are only used on file-backed
>> mappings, so it does not increase memory usage.
>
> I'm not against this idea although I don't have review it in detail
> but we need description to convince why it's hard to be done in
> userspace.

I covered the reasoning in more detail at
http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
version is that this is useful for a system-wide look at memory,
combining all processes with the kernel's knowledge of map counts and
page flags to produce a measurement of what a process' actual impact
on physical memory usage is.  Doing it in userspace would require
collating data from every allocator in every process on the system,
requiring every process to export it somehow, and then reading the
kernel information anyways to get the mapping info.

> I guess this feature would be used with allocators tightly
> so my concern of kernel approach like this that it needs mmap_sem
> write-side lock to split/merge vmas which is really thing
> allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> that allocators have lots of complicated logic to avoid munmap which
> needs mmap_sem write-side lock but this feature would make it invalid.

My expected use case is that the allocator will mmap a new large chunk
of anonymous memory, and then immediately name it, resulting in taking
the mmap_sem twice in a row.  This is the same pattern required for
example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
optimization is actually even more important if the allocator names
memory, since creating a new mapping + name would require the mmap_sem
twice, although the total number of mmap_sem write locks is still
increased with naming.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:00     ` Colin Cross
@ 2013-10-16 20:34       ` Dave Hansen
  2013-10-16 20:41         ` Colin Cross
  2013-10-17  2:47       ` Minchan Kim
  1 sibling, 1 reply; 44+ messages in thread
From: Dave Hansen @ 2013-10-16 20:34 UTC (permalink / raw)
  To: Colin Cross, Minchan Kim
  Cc: lkml, Pekka Enberg, Peter Zijlstra, Ingo Molnar, Oleg Nesterov,
	Eric W. Biederman, Jan Glauber, John Stultz, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, Kees Cook, Serge E. Hallyn,
	David Rientjes, Al Viro, Hugh Dickins, Rik van Riel, Mel Gorman,
	Michel Lespinasse, Tang Chen, Robin Holt, Shaohua Li,
	Sasha Levin, Johannes Weiner, Peter Zijlstra, open, list,
	DOCUMENTATION

On 10/16/2013 01:00 PM, Colin Cross wrote:
>> > I guess this feature would be used with allocators tightly
>> > so my concern of kernel approach like this that it needs mmap_sem
>> > write-side lock to split/merge vmas which is really thing
>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>> > that allocators have lots of complicated logic to avoid munmap which
>> > needs mmap_sem write-side lock but this feature would make it invalid.
> My expected use case is that the allocator will mmap a new large chunk
> of anonymous memory, and then immediately name it, resulting in taking
> the mmap_sem twice in a row. 

I guess the prctl (or a new one) _could_ just set a kernel-internal
variable (per-thread?) that says "point any future anonymous areas at
this name".  That way, you at least have the _possibility_ of not having
to do it for _every_ mmap().

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:34       ` Dave Hansen
@ 2013-10-16 20:41         ` Colin Cross
  0 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-10-16 20:41 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Minchan Kim, lkml, Pekka Enberg, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra,
	open list:DOCUMENTATION <linux-doc@vger.kernel.org>,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 1:34 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> On 10/16/2013 01:00 PM, Colin Cross wrote:
>>> > I guess this feature would be used with allocators tightly
>>> > so my concern of kernel approach like this that it needs mmap_sem
>>> > write-side lock to split/merge vmas which is really thing
>>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>>> > that allocators have lots of complicated logic to avoid munmap which
>>> > needs mmap_sem write-side lock but this feature would make it invalid.
>> My expected use case is that the allocator will mmap a new large chunk
>> of anonymous memory, and then immediately name it, resulting in taking
>> the mmap_sem twice in a row.
>
> I guess the prctl (or a new one) _could_ just set a kernel-internal
> variable (per-thread?) that says "point any future anonymous areas at
> this name".  That way, you at least have the _possibility_ of not having
> to do it for _every_ mmap().

That won't work for multiple allocators.  A thread can easily allocate
through Java, then call into native code and allocate through malloc,
and those will need different names.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-16 20:00     ` Colin Cross
  2013-10-16 20:34       ` Dave Hansen
@ 2013-10-17  2:47       ` Minchan Kim
  2013-10-30 21:15         ` Colin Cross
  1 sibling, 1 reply; 44+ messages in thread
From: Minchan Kim @ 2013-10-17  2:47 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> > Hello,
> >
> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> >> In many userspace applications, and especially in VM based
> >> applications like Android uses heavily, there are multiple different
> >> allocators in use.  At a minimum there is libc malloc and the stack,
> >> and in many cases there are libc malloc, the stack, direct syscalls to
> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
> >> one for big objects, etc.).  Each of these layers usually has its own
> >> tools to inspect its usage; malloc by compiling a debug version, the
> >> VM through heap inspection tools, and for direct syscalls there is
> >> usually no way to track them.
> >>
> >> On Android we heavily use a set of tools that use an extended version
> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> >> mapped in userspace and slice their usage by process, shared (COW) vs.
> >> unique mappings, backing, etc.  This can account for real physical
> >> memory usage even in cases like fork without exec (which Android uses
> >> heavily to share as many private COW pages as possible between
> >> processes), Kernel SamePage Merging, and clean zero pages.  It
> >> produces a measurement of the pages that only exist in that process
> >> (USS, for unique), and a measurement of the physical memory usage of
> >> that process with the cost of shared pages being evenly split between
> >> processes that share them (PSS).
> >>
> >> If all anonymous memory is indistinguishable then figuring out the
> >> real physical memory usage (PSS) of each heap requires either a pagemap
> >> walking tool that can understand the heap debugging of every layer, or
> >> for every layer's heap debugging tools to implement the pagemap
> >> walking logic, in which case it is hard to get a consistent view of
> >> memory across the whole system.
> >>
> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> >> show a userspace-provided name for anonymous vmas.  The names of
> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> >> as [anon:<name>].
> >>
> >> Userspace can set the name for a region of memory by calling
> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >> Setting the name to NULL clears it.
> >>
> >> The name is stored in a user pointer in the shared union in
> >> vm_area_struct that points to a null terminated string inside
> >> the user process.  vmas that point to the same address and are
> >> otherwise mergeable will be merged, but vmas that point to
> >> equivalent strings at different addresses will not be merged.
> >>
> >> The idea to store a userspace pointer to reduce the complexity
> >> within mm (at the expense of the complexity of reading
> >> /proc/pid/mem) came from Dave Hansen.  This results in no
> >> runtime overhead in the mm subsystem other than comparing
> >> the anon_name pointers when considering vma merging.  The pointer
> >> is stored in a union with fields that are only used on file-backed
> >> mappings, so it does not increase memory usage.
> >
> > I'm not against this idea although I don't have review it in detail
> > but we need description to convince why it's hard to be done in
> > userspace.
> 
> I covered the reasoning in more detail at
> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
> version is that this is useful for a system-wide look at memory,
> combining all processes with the kernel's knowledge of map counts and
> page flags to produce a measurement of what a process' actual impact
> on physical memory usage is.  Doing it in userspace would require
> collating data from every allocator in every process on the system,
> requiring every process to export it somehow, and then reading the
> kernel information anyways to get the mapping info.

I agree that the kernel approach would be a performance win and make it
easy to collect system-wide information. That's why I am not against the
idea; I think it would be very useful on contemporary platforms.
But I doubt that a vma operation is the proper way to do it.

BTW, as Peter and I already asked, other developers in the future may
have the same question, so let's keep the answer in the git log.
"Tracking information in userspace leads to all sorts of problems.
...
...
"

> 
> > I guess this feature would be used with allocators tightly
> > so my concern of kernel approach like this that it needs mmap_sem
> > write-side lock to split/merge vmas which is really thing
> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> > that allocators have lots of complicated logic to avoid munmap which
> > needs mmap_sem write-side lock but this feature would make it invalid.
> 
> My expected use case is that the allocator will mmap a new large chunk
> of anonymous memory, and then immediately name it, resulting in taking

It makes the new system call very limited.
Are you assuming that this new system call should be used very carefully
inside a newly invented allocator which is aware of naming? So, it allocates
a large chunk per name, and the user has to request memory with a naming tag
to allocate an object from the chunk reserved for that name? Otherwise, large
chunks would be split up for every differently named object and allocator
performance will drop.

Why couldn't we use it in the application layer, not in the allocator itself?
I mean we could use it as follows.

struct js_object *alloc_js_object(void) {
        if (pool_is_empty) {
                struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
                prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
        }

        return get_a_object_from_pool(obj_pool);
}

It could work with any allocators which are not aware of naming.
And if the pool size is bigger than a chunk, the performance loss would be small.

Some other insane user might want to call it per object frequently, even if it's
a small size under 4K. Why not? The result is that the vma scheme couldn't work.

> the mmap_sem twice in a row.  This is the same pattern required for
> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap

I guess KSM usecase would be very rare compared to naming API because
I dare to expect this feature will be very useful and be popular for lots of
platforms. Actually, our platform is considering such features and some of stack
in our platform already have owned such profiling although it's not system-wide.

Why should we bind the feature into vma? At a glance, vma binding looks good
but the result is 

1) We couldn't avoid write mmap_sem
2) We couldn't represent small size object under 4K.

Couldn't we use another data structure which represent range like
vrange interval tree I and John are implementing?

So the result would be /proc/<pid>/named_anon

It could solve above both problem all but it needs one more system call
to see /proc/<pid>/maps if you need maps information but I imagine that
gathering isn't frequent so it's not a big concern.

> optimization is actually even more important if the allocator names
> memory, creating a new mapping + name would require the mmap_sem
> twice, although the total number of mmap_sem write locks is still
> increased with naming.

> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-17  2:47       ` Minchan Kim
@ 2013-10-30 21:15         ` Colin Cross
  2013-11-01  1:30           ` Minchan Kim
  0 siblings, 1 reply; 44+ messages in thread
From: Colin Cross @ 2013-10-30 21:15 UTC (permalink / raw)
  To: Minchan Kim
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

On Wed, Oct 16, 2013 at 7:47 PM, Minchan Kim <minchan@kernel.org> wrote:
> On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
>> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
>> > Hello,
>> >
>> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
>> >> In many userspace applications, and especially in VM based
>> >> applications like Android uses heavily, there are multiple different
>> >> allocators in use.  At a minimum there is libc malloc and the stack,
>> >> and in many cases there are libc malloc, the stack, direct syscalls to
>> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
>> >> one for big objects, etc.).  Each of these layers usually has its own
>> >> tools to inspect its usage; malloc by compiling a debug version, the
>> >> VM through heap inspection tools, and for direct syscalls there is
>> >> usually no way to track them.
>> >>
>> >> On Android we heavily use a set of tools that use an extended version
>> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
>> >> mapped in userspace and slice their usage by process, shared (COW) vs.
>> >> unique mappings, backing, etc.  This can account for real physical
>> >> memory usage even in cases like fork without exec (which Android uses
>> >> heavily to share as many private COW pages as possible between
>> >> processes), Kernel SamePage Merging, and clean zero pages.  It
>> >> produces a measurement of the pages that only exist in that process
>> >> (USS, for unique), and a measurement of the physical memory usage of
>> >> that process with the cost of shared pages being evenly split between
>> >> processes that share them (PSS).
>> >>
>> >> If all anonymous memory is indistinguishable then figuring out the
>> >> real physical memory usage (PSS) of each heap requires either a pagemap
>> >> walking tool that can understand the heap debugging of every layer, or
>> >> for every layer's heap debugging tools to implement the pagemap
>> >> walking logic, in which case it is hard to get a consistent view of
>> >> memory across the whole system.
>> >>
>> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
>> >> show a userspace-provided name for anonymous vmas.  The names of
>> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
>> >> as [anon:<name>].
>> >>
>> >> Userspace can set the name for a region of memory by calling
>> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> >> Setting the name to NULL clears it.
>> >>
>> >> The name is stored in a user pointer in the shared union in
>> >> vm_area_struct that points to a null terminated string inside
>> >> the user process.  vmas that point to the same address and are
>> >> otherwise mergeable will be merged, but vmas that point to
>> >> equivalent strings at different addresses will not be merged.
>> >>
>> >> The idea to store a userspace pointer to reduce the complexity
>> >> within mm (at the expense of the complexity of reading
>> >> /proc/pid/mem) came from Dave Hansen.  This results in no
>> >> runtime overhead in the mm subsystem other than comparing
>> >> the anon_name pointers when considering vma merging.  The pointer
>> >> is stored in a union with fields that are only used on file-backed
>> >> mappings, so it does not increase memory usage.
>> >
>> > I'm not against this idea although I don't have review it in detail
>> > but we need description to convince why it's hard to be done in
>> > userspace.
>>
>> I covered the reasoning in more detail at
>> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
>> version is that this is useful for a system-wide look at memory,
>> combining all processes with the kernel's knowledge of map counts and
>> page flags to produce a measurement of what a process' actual impact
>> on physical memory usage is.  Doing it in userspace would require
>> collating data from every allocator in every process on the system,
>> requiring every process to export it somehow, and then reading the
>> kernel information anyways to get the mapping info.
>
> I agree that kernel approach would be performance win and make it easy
> to collect system-wide information. That's why I am not against the idea
> because I think it would be very useful on comtemporary platforms.
> But I doubt vma opeartion is proper.
>
> BTW, as Peter and I already asked, maybe other developer in future
> will have a question about that so let's remain it in git log.
> "Tacking infomrationin userspace leads to all sorts of problems.
> ...
> ...
> "
>
>>
>> > I guess this feature would be used with allocators tightly
>> > so my concern of kernel approach like this that it needs mmap_sem
>> > write-side lock to split/merge vmas which is really thing
>> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
>> > that allocators have lots of complicated logic to avoid munmap which
>> > needs mmap_sem write-side lock but this feature would make it invalid.
>>
>> My expected use case is that the allocator will mmap a new large chunk
>> of anonymous memory, and then immediately name it, resulting in taking
>
> It makes new system call very limited.
> You are assuming that this new system call should be used very carefully
> inside new invented allocator which is aware of naming? So, it allocates
> large chunk per name and user have to request memory with naming tag to
> allocate object from chunk reserved for the name? Otherwise, large chunk
> would be separated per every different name objct and allocator performance
> will be drop.

I'm not sure I understand your question.

It is normal for allocators to mmap a large chunk of anonymous memory
and then suballocate out of it to amortize the cost of the mmap across
multiple smaller allocations.  I'm proposing adding a second
syscall/grabbing the mmap_sem to this already slow path.  If a
particular allocator is limited by the mmap_sem, it can conditionally
skip the second syscall unless a "name memory" flag is set.  I expect
an allocator to have a single name that it always uses.  It would be
nice to avoid having to take the mmap_sem twice either by atomically
mmaping and naming a region of memory or by protecting the names with
something besides mmap_sem, but I can't think of a good way to
accomplish either.

> Why couldn't we use it in application layer, not allocator itself?
> I mean we can use this following as.
>
> struct js_object *alloc_js_object(void) {
>         if (pool_is_empty) {
>                 struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
>                 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
>         }
>
>         return get_a_object_from_pool(obj_pool);
> }
>
> It could work with any allocators which are not aware of naming.
> And If pool size is bigger than a chunk, performance lose would be small.
>
> Other some insane user might want to call it per object frequently, even it's
> small size under 4K. Why not? The result is that vma scheme couldn't work.

I guess what I'm really trying to accomplish here is to name physical
pages, which is something only the kernel can track.  Naming every
page would be costly, and cause problems when different processes
wanted different names, so the closest I can get to that is to name a
process' view of physical pages, with the assumption that processes
that share a page will be using it for the same thing and so won't
name them differently.  Physical pages are a very kernel-y thing to
track, whereas virtual address space, especially non-page-aligned
virtual address space, is a little more nebulous on the
kernel/userspace boundary.  Naming pages makes it clear who will name
them - whoever requested them from the kernel.  Naming address space
is less clear, what if the allocator names them and then the caller
also wants to name them?

>> the mmap_sem twice in a row.  This is the same pattern required for
>> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
>
> I guess KSM usecase would be very rare compared to naming API because
> I dare to expect this feature will be very useful and be popular for lots of
> platforms. Actually, our platform is considering such features and some of stack
> in our platform already have owned such profiling although it's not system-wide.
>
> Why should we bind the feature into vma? At a glance, vma binding looks good
> but the result is
>
> 1) We couldn't avoid write mmap_sem
> 2) We couldn't represent small size object under 4K.
>
> Couldn't we use another data structure which represent range like
> vrange interval tree I and John are implementing?
>
> So the result would be /proc/<pid>/named_anon
>
> It could solve above both problem all but it needs one more system call
> to see /proc/<pid>/maps if you need maps information but I imagine that
> gathering isn't frequent so it's not a big concern.

I chose to put it in the vma because the vmas cover exactly the right
area that I want to name for my use case, and because when determining
real system-wide memory usage only 4k aligned chunks matter.  An
anonymous memory mmap normally results in a new vma covering exactly
the allocation (ignoring merging with an adjacent anonymous mmap),
which means there is normally zero memory cost to my naming.  Your
proposal would require a vrange object for every named region.  I can
see how it would be useful, but it would increase the cost of naming
page-aligned regions significantly.  As an example, on one of my
devices I have over 11,000 named regions.  Using a range_tree_node +
userspace pointer for each one is already 500KB of memory.

>> optimization is actually even more important if the allocator names
>> memory, creating a new mapping + name would require the mmap_sem
>> twice, although the total number of mmap_sem write locks is still
>> increased with naming.
>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
> --
> Kind regards,
> Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-10-30 21:15         ` Colin Cross
@ 2013-11-01  1:30           ` Minchan Kim
  0 siblings, 0 replies; 44+ messages in thread
From: Minchan Kim @ 2013-11-01  1:30 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Pekka Enberg, Dave Hansen, Peter Zijlstra, Ingo Molnar,
	Oleg Nesterov, Eric W. Biederman, Jan Glauber, John Stultz,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, Kees Cook,
	Serge E. Hallyn, David Rientjes, Al Viro, Hugh Dickins,
	Rik van Riel, Mel Gorman, Michel Lespinasse, Tang Chen,
	Robin Holt, Shaohua Li, Sasha Levin, Johannes Weiner,
	Peter Zijlstra, open list:DOCUMENTATION,
	open list:MEMORY MANAGEMENT

Hello,

On Wed, Oct 30, 2013 at 02:15:37PM -0700, Colin Cross wrote:
> On Wed, Oct 16, 2013 at 7:47 PM, Minchan Kim <minchan@kernel.org> wrote:
> > On Wed, Oct 16, 2013 at 01:00:03PM -0700, Colin Cross wrote:
> >> On Tue, Oct 15, 2013 at 5:33 PM, Minchan Kim <minchan@kernel.org> wrote:
> >> > Hello,
> >> >
> >> > On Mon, Oct 14, 2013 at 06:31:17PM -0700, Colin Cross wrote:
> >> >> In many userspace applications, and especially in VM based
> >> >> applications like Android uses heavily, there are multiple different
> >> >> allocators in use.  At a minimum there is libc malloc and the stack,
> >> >> and in many cases there are libc malloc, the stack, direct syscalls to
> >> >> mmap anonymous memory, and multiple VM heaps (one for small objects,
> >> >> one for big objects, etc.).  Each of these layers usually has its own
> >> >> tools to inspect its usage; malloc by compiling a debug version, the
> >> >> VM through heap inspection tools, and for direct syscalls there is
> >> >> usually no way to track them.
> >> >>
> >> >> On Android we heavily use a set of tools that use an extended version
> >> >> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> >> >> mapped in userspace and slice their usage by process, shared (COW) vs.
> >> >> unique mappings, backing, etc.  This can account for real physical
> >> >> memory usage even in cases like fork without exec (which Android uses
> >> >> heavily to share as many private COW pages as possible between
> >> >> processes), Kernel SamePage Merging, and clean zero pages.  It
> >> >> produces a measurement of the pages that only exist in that process
> >> >> (USS, for unique), and a measurement of the physical memory usage of
> >> >> that process with the cost of shared pages being evenly split between
> >> >> processes that share them (PSS).
> >> >>
> >> >> If all anonymous memory is indistinguishable then figuring out the
> >> >> real physical memory usage (PSS) of each heap requires either a pagemap
> >> >> walking tool that can understand the heap debugging of every layer, or
> >> >> for every layer's heap debugging tools to implement the pagemap
> >> >> walking logic, in which case it is hard to get a consistent view of
> >> >> memory across the whole system.
> >> >>
> >> >> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to
> >> >> show a userspace-provided name for anonymous vmas.  The names of
> >> >> named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps
> >> >> as [anon:<name>].
> >> >>
> >> >> Userspace can set the name for a region of memory by calling
> >> >> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >> >> Setting the name to NULL clears it.
> >> >>
> >> >> The name is stored in a user pointer in the shared union in
> >> >> vm_area_struct that points to a null terminated string inside
> >> >> the user process.  vmas that point to the same address and are
> >> >> otherwise mergeable will be merged, but vmas that point to
> >> >> equivalent strings at different addresses will not be merged.
> >> >>
> >> >> The idea to store a userspace pointer to reduce the complexity
> >> >> within mm (at the expense of the complexity of reading
> >> >> /proc/pid/mem) came from Dave Hansen.  This results in no
> >> >> runtime overhead in the mm subsystem other than comparing
> >> >> the anon_name pointers when considering vma merging.  The pointer
> >> >> is stored in a union with fields that are only used on file-backed
> >> >> mappings, so it does not increase memory usage.
> >> >
> >> > I'm not against this idea although I don't have review it in detail
> >> > but we need description to convince why it's hard to be done in
> >> > userspace.
> >>
> >> I covered the reasoning in more detail at
> >> http://permalink.gmane.org/gmane.linux.kernel.mm/103228.  The short
> >> version is that this is useful for a system-wide look at memory,
> >> combining all processes with the kernel's knowledge of map counts and
> >> page flags to produce a measurement of what a process' actual impact
> >> on physical memory usage is.  Doing it in userspace would require
> >> collating data from every allocator in every process on the system,
> >> requiring every process to export it somehow, and then reading the
> >> kernel information anyways to get the mapping info.
> >
> > I agree that kernel approach would be performance win and make it easy
> > to collect system-wide information. That's why I am not against the idea
> > because I think it would be very useful on comtemporary platforms.
> > But I doubt vma opeartion is proper.
> >
> > BTW, as Peter and I already asked, maybe other developer in future
> > will have a question about that so let's remain it in git log.
> > "Tacking infomrationin userspace leads to all sorts of problems.
> > ...
> > ...
> > "
> >
> >>
> >> > I guess this feature would be used with allocators tightly
> >> > so my concern of kernel approach like this that it needs mmap_sem
> >> > write-side lock to split/merge vmas which is really thing
> >> > allocators(ex, tcmalloc, jemalloc) want to avoid for performance win
> >> > that allocators have lots of complicated logic to avoid munmap which
> >> > needs mmap_sem write-side lock but this feature would make it invalid.
> >>
> >> My expected use case is that the allocator will mmap a new large chunk
> >> of anonymous memory, and then immediately name it, resulting in taking
> >
> > It makes new system call very limited.
> > You are assuming that this new system call should be used very carefully
> > inside new invented allocator which is aware of naming? So, it allocates
> > large chunk per name and user have to request memory with naming tag to
> > allocate object from chunk reserved for the name? Otherwise, large chunk
> > would be separated per every different name objct and allocator performance
> > will be drop.
> 
> I'm not sure I understand your question.
> 
> It is normal for allocators to mmap a large chunk of anonymous memory
> and then suballocate out of it to amortize the cost of the mmap across
> multiple smaller allocations.  I'm proposing adding a second
> syscall/grabbing the mmap_sem to this already slow path.  If a
> particular allocator is limited by the mmap_sem, it can conditionally
> skip the second syscall unless a "name memory" flag is set.  I expect
> an allocator to have a single name that it always uses.  It would be

I think it's very limited.
My requirement is that I'd like to name any anon object in process so that
a daemon in the platform could gather all important object statistics easily
from all of process which share some libraries.
For it, I don't want to replace my allocator(ex, jemalloc) with naming-aware
allocator like malloc(sizeofobject, "name") which could mmap a large of
anonymous memory per name.

> nice to avoid having to take the mmap_sem twice either by atomically
> mmaping and naming a region of memory or by protecting the names with
> something besides mmap_sem, but I can't think of a good way to
> accomplish either.

Yes, it's stuff related to the allocator, so it is very sensitive to
alloc/fault performance. If we really care about it, we would need another data
structure to avoid the loss.

> 
> > Why couldn't we use it in application layer, not allocator itself?
> > I mean we can use this following as.
> >
> > struct js_object *alloc_js_object(void) {
> >         if (pool_is_empty) {
> >                 struct js_object *obj_pool = malloc(sizeof(obj) * POOL_SIZE);
> >                 prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, obj_pool, SIZE, js_name);
> >         }
> >
> >         return get_a_object_from_pool(obj_pool);
> > }
> >
> > It could work with any allocators which are not aware of naming.
> > And If pool size is bigger than a chunk, performance lose would be small.
> >
> > Other some insane user might want to call it per object frequently, even it's
> > small size under 4K. Why not? The result is that vma scheme couldn't work.
> 
> I guess what I'm really trying to accomplish here is to name physical
> pages, which is something only the kernel can track.  Naming every

It seems the difference between you and me. You want to tag page
but I want object. And object includes page.

> page would be costly, and cause problems when different processes
> wanted different names, so the closest I can get to that is to name a
> process' view of physical pages, with the assumption that processes
> that share a page will be using it for the same thing and so won't
> name them differently.  Physical pages are a very kernel-y thing to

If the page is shared, it does make sense but it makes new systemcall
too limited, too.

> track, where as virtual address space, especially non-page-aligned
> virtual address space, is a little more nebulous on the
> kernel/userspace boundary.  Naming pages makes it clear who will name
> them - whoever requested them from the kernel.  Naming address space
> is less clear, what if the allocator names them and then the caller
> also wants to name them?

In that case, caller first because upper layer has more clear view.

> 
> >> the mmap_sem twice in a row.  This is the same pattern required for
> >> example by KSM to mark malloc'd memory as mergeable.  The avoid-munmap
> >
> > I guess KSM usecase would be very rare compared to naming API because
> > I dare to expect this feature will be very useful and be popular for lots of
> > platforms. Actually, our platform is considering such features and some of stack
> > in our platform already have owned such profiling although it's not system-wide.
> >
> > Why should we bind the feature into vma? At a glance, vma binding looks good
> > but the result is
> >
> > 1) We couldn't avoid write mmap_sem
> > 2) We couldn't represent small size object under 4K.
> >
> > Couldn't we use another data structure which represent range like
> > vrange interval tree I and John are implementing?
> >
> > So the result would be /proc/<pid>/named_anon
> >
> > It could solve above both problem all but it needs one more system call
> > to see /proc/<pid>/maps if you need maps information but I imagine that
> > gathering isn't frequent so it's not a big concern.
> 
> I chose to put it in the vma because the vmas cover exactly the right
> area that I want to name for my use case, and because when determining
> real system-wide memory usage only 4k aligned chunks matter.  An
> anonymous memory mmap normally results in a new vma covering exactly
> the allocation (ignoring merging with an adjacent anonymous mmap),
> which means there is normally zero memory cost to my naming.  Your
> proposal would require a vrange object for every named region.  I can
> see how it would be useful, but it would increase the cost of naming
> page-aligned regions significantly.  As an example, on one of my
> devices I have over 11,000 named regions.  Using a range_tree_node +
> userspace pointer for each one is already 500KB of memory.

On 32-bit that is 300K; anyway, it could be huge for an embedded device, but
your approach could need an extra vm_area_struct whenever a vma has to be split
by the new system call, so the memory cost would be more significant.

> 
> >> optimization is actually even more important if the allocator names
> >> memory, creating a new mapping + name would require the mmap_sem
> >> twice, although the total number of mmap_sem write locks is still
> >> increased with naming.
> >
> >>
> >> --
> >> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> >> the body to majordomo@kvack.org.  For more info on Linux MM,
> >> see: http://www.linux-mm.org/ .
> >> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> >
> > --
> > Kind regards,
> > Minchan Kim
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12 20:51                     ` Colin Cross
@ 2013-09-26  1:24                       ` Colin Cross
  0 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-09-26  1:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Pekka Enberg, lkml, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, Linux-MM,
	Linus Torvalds

On Fri, Jul 12, 2013 at 1:51 PM, Colin Cross <ccross@android.com> wrote:
> On Fri, Jul 12, 2013 at 2:49 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
>>> * Peter Zijlstra <peterz@infradead.org> wrote:
>>>
>>> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
>>> > >
>>> > > * Peter Zijlstra <peterz@infradead.org> wrote:
>>> > >
>>> > > > We need those files anyway.. The current proposal is that the entire VMA
>>> > > > has a single userspace pointer in it. Or rather a 64bit value.
>>> > >
>>> > > Yes but accessible via /proc/<PID>/mem or so?
>>> >
>>> > *shudder*.. yes. But you're again opening two files. The only advantage
>>> > of this over userspace writing its own files is that the kernel cleans
>>> > things up for you.
>>>
>>> Opening of the files only occurs in the instrumentation case, which is
>>> rare. But temporary files would be forced upon the regular usecase when no
>>> instrumentation goes on.
>>
>> Well, Colin didn't describe the intended use, but I can imagine a case where
>> it's not all that rare. System health monitors might frequently want to update
>> this.
>>
>>> > However from what I understood android runs apps as individual users,
>>> > and I think we can do per user tmpfs mounts. So app dies, user exits,
>>> > mount goes *poof*.
>>>
>>> Yes, user-space could be smarter about temporary files.
>>>
>>> Just like big banks could be less risk happy.
>>>
>>> Yet the reality is that if left alone both apps and banks mess up, I don't
>>> think libertarianism works for policy: we are better off offering a
>>> framework that is simple, robust, self-contained, low risk and hard to
>>> mess up?
>>
>> Fair enough; but I still want Colin to tell me why he can't do this in
>> userspace. And what all he wants to go do with this information etc.
>>
>> He's basically not told us much at all.
>
> I covered it a little in the thread on the previous version of the
> patch, but I'll try to give more detail (and include it in a patch
> stack description if I post another version).
>
> In many userspace applications, and especially in VM based
> applications like Android uses heavily, there are multiple different
> allocators in use.  At a minimum there is libc malloc and the stack,
> and in many cases there are libc malloc, the stack, direct syscalls to
> mmap anonymous memory, and multiple VM heaps (one for small objects,
> one for big objects, etc.).  Each of these layers usually has its own
> tools to inspect its usage; malloc by compiling a debug version, the
> VM through heap inspection tools, and for direct syscalls there is
> usually no way to track them.
>
> On Android we heavily use a set of tools that use an extended version
> of the logic covered in Documentation/vm/pagemap.txt to walk all pages
> mapped in userspace and slice their usage by process, shared (COW) vs.
> unique mappings, backing, etc.  This can account for real physical
> memory usage even in cases like fork without exec (which Android uses
> heavily to share as many private COW pages as possible between
> processes), Kernel SamePage Merging, and clean zero pages.  It
> produces a measurement of the pages that only exist in that process
> (USS, for unique), and a measurement of the physical memory usage of
> that process with the cost of shared pages being evenly split between
> processes that share them (PSS).  We need the feature to be efficient
> enough to be left on at all times because app developers and end users
> can use similar tools exposed through system reports and bugreports to
> determine the memory usage of apps.
>
> If all anonymous memory is indistinguishable then figuring out the
> real physical memory usage of each heap requires either a pagemap
> walking tool that can understand the heap debugging of every layer, or
> for every layer's heap debugging tools to implement the pagemap
> walking logic, in which case it is hard to get a consistent view of
> memory across the whole system.
>
> Tracking the information in userspace leads to all sorts of problems.
> It either needs to be stored inside the process, which means every
> process has to have an API to export its current heap information upon
> request, or it has to be stored externally in a filesystem that
> somebody needs to clean up on crashes.  It needs to be readable while
> the process is still running, so it has to have some sort of
> synchronization with every layer of userspace.  Efficiently tracking
> the ranges requires reimplementing something like the kernel vma
> trees, and linking to it from every layer of userspace.  It requires
> more memory, more syscalls, more runtime cost, and more complexity to
> separately track regions that the kernel is already tracking.
>
> This feature is considered critical enough that Dalvik (Android's VM)
> uses ashmem, which is effectively deleted tmpfs files, solely to name
> their heaps.   I'd like to get rid of as much ashmem use within
> Android as possible, with an eye towards deprecating it.  ashmem heaps
> work reasonably well for a VM, which is likely to want a single
> contiguous region of address space that it will manage on its own, but
> falls apart for malloc, which often wants small kernel-allocated
> address space regions that may or may not merge with adjacent regions.
>  Blindly using ashmem/deleted tmpfs files instead of anonymous mmaps
> in malloc doubled the number of vmas in our main system process and
> was worse for the GLBenchmark process.
>
> As a concrete example of its usefulness (which should not be
> considered the extent of its usefulness, it's just what I happened to
> be looking at), I was recently tracking down why we were seeing many
> dirty private pages that were all zeroes being merged by KSM.  Using a
> mixture of ashmem naming and an early version of this patch, I could
> slice the number of KSM merged pages per process and per heap,
> which then told me which heap debugging tools I should use to find who
> was dirtying large regions of zeroes.

Peter, any thoughts on this?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-14 14:17   ` Oleg Nesterov
@ 2013-07-14 19:34     ` Colin Cross
  0 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-07-14 19:34 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, Linux-MM

On Sun, Jul 14, 2013 at 7:17 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> On 07/11, Colin Cross wrote:
>>
>> +static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
>> +{
>> +     const char __user *name = vma_get_anon_name(vma);
>> +     struct mm_struct *mm = vma->vm_mm;
>> +
>> +     unsigned long page_start_vaddr;
>> +     unsigned long page_offset;
>> +     unsigned long num_pages;
>> +     unsigned long max_len = NAME_MAX;
>> +     int i;
>> +
>> +     page_start_vaddr = (unsigned long)name & PAGE_MASK;
>> +     page_offset = (unsigned long)name - page_start_vaddr;
>> +     num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
>> +
>> +     seq_puts(m, "[anon:");
>> +
>> +     for (i = 0; i < num_pages; i++) {
>> +             int len;
>> +             int write_len;
>> +             const char *kaddr;
>> +             long pages_pinned;
>> +             struct page *page;
>> +
>> +             pages_pinned = get_user_pages(current, mm, page_start_vaddr,
>> +                             1, 0, 0, &page, NULL);
>> +             if (pages_pinned < 1) {
>> +                     seq_puts(m, "<fault>]");
>> +                     return;
>> +             }
>> +
>> +             kaddr = (const char *)kmap(page);
>> +             len = min(max_len, PAGE_SIZE - page_offset);
>> +             write_len = strnlen(kaddr + page_offset, len);
>> +             seq_write(m, kaddr + page_offset, write_len);
>> +             kunmap(page);
>> +             put_page(page);
>> +
>> +             /* if strnlen hit a null terminator then we're done */
>> +             if (write_len != len)
>> +                     break;
>> +
>> +             max_len -= len;
>> +             page_offset = 0;
>> +             page_start_vaddr += PAGE_SIZE;
>> +     }
>> +
>> +     seq_putc(m, ']');
>> +}
>
> Again, sorry if this was already discussed...
>
> But for what? This moves the policy into the kernel and afaics buys nothing.
> Can't it simply print the number?
>
> If an application reads its own /proc/pid/maps, surely it knows how it should
> interpret the numeric values.
>
> If another process reads this file, and if it assumes that this number is a
> pointer into that task's memory, it can do sys_process_vm_readv() ?

I think there is value in keeping /proc/pid/maps human readable.  A
userspace tool could certainly put together the same information, but
there would be no easy way to do it from the command line.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-14 14:11   ` Oleg Nesterov
@ 2013-07-14 19:27     ` Colin Cross
  0 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-07-14 19:27 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, Linux-MM

On Sun, Jul 14, 2013 at 7:11 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> Sorry if this was already discussed... I am still trying to think if
> we can make a simpler patch.
>
> So, iiuc, the main problem is that if you want to track a vma you need
> to prevent the merging with other vma's.
>
> Question: is it important that vma's with the same vma_name should be
> _merged_ automatically?
>
> If not, can't we make "do not merge" a separate feature and then add
> vma_name?
>
> IOW, please forget about vma_name for the moment. Can't we start with
> the trivial patch below? It simply adds the new vm flag which blocks
> the merging, and MADV_ to set/clear it.
>
> Yes, this is more limited. Once you set VM_TAINTED this vma is always
> isolated. If you unmap a page in this vma, you create 2 isolated vma's.
> If, for example, you do MADV_DONTFORK + MADV_DOFORK inside the tainted
> vma, you will have 2 adjacent VM_TAINTED vma's with the same flags after
> that. But you can do MADV_UNTAINT + MADV_TAINT again if you want to
> merge them back. And perhaps this feature is useful even without the
> naming. And perhaps we can also add MAP_TAINTED.
>
> Now about vma_name. In this case PR_SET_VMA or MADV_NAME should simply
> set/overwrite vma_name and nothing else, no need to do merge/split vma.
>
> And if we add MAP_TAINTED, MAP_ANONYMOUS can reuse pgoff as vma_name
> (we only need simple changes in do_mmap_pgoff and mmap_region). But
> this is minor.
>
> Or this is too simple/ugly? Probably yes, this means that an allocator
> which simply does a lot of MAP_ANONYMOUS + MADV_TAINT will create more
> vma's than it needs. So I won't insist but I'd like to ask anyway.

This is no different than using a new tmpfs file for every mmap
(although it saves the struct file and the inode), it results in a
huge increase in the number of vmas.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
                     ` (3 preceding siblings ...)
  2013-07-14 14:11   ` Oleg Nesterov
@ 2013-07-14 14:17   ` Oleg Nesterov
  2013-07-14 19:34     ` Colin Cross
  4 siblings, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2013-07-14 14:17 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On 07/11, Colin Cross wrote:
>
> +static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
> +{
> +	const char __user *name = vma_get_anon_name(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +
> +	unsigned long page_start_vaddr;
> +	unsigned long page_offset;
> +	unsigned long num_pages;
> +	unsigned long max_len = NAME_MAX;
> +	int i;
> +
> +	page_start_vaddr = (unsigned long)name & PAGE_MASK;
> +	page_offset = (unsigned long)name - page_start_vaddr;
> +	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
> +
> +	seq_puts(m, "[anon:");
> +
> +	for (i = 0; i < num_pages; i++) {
> +		int len;
> +		int write_len;
> +		const char *kaddr;
> +		long pages_pinned;
> +		struct page *page;
> +
> +		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
> +				1, 0, 0, &page, NULL);
> +		if (pages_pinned < 1) {
> +			seq_puts(m, "<fault>]");
> +			return;
> +		}
> +
> +		kaddr = (const char *)kmap(page);
> +		len = min(max_len, PAGE_SIZE - page_offset);
> +		write_len = strnlen(kaddr + page_offset, len);
> +		seq_write(m, kaddr + page_offset, write_len);
> +		kunmap(page);
> +		put_page(page);
> +
> +		/* if strnlen hit a null terminator then we're done */
> +		if (write_len != len)
> +			break;
> +
> +		max_len -= len;
> +		page_offset = 0;
> +		page_start_vaddr += PAGE_SIZE;
> +	}
> +
> +	seq_putc(m, ']');
> +}

Again, sorry if this was already discussed...

But for what? This moves the policy into the kernel and afaics buys nothing.
Can't it simply print the number?

If an application reads its own /proc/pid/maps, surely it knows how it should
interpret the numeric values.

If another process reads this file, and if it assumes that this number is a
pointer into that task's memory, it can do sys_process_vm_readv() ?

Oleg.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
                     ` (2 preceding siblings ...)
  2013-07-12  6:36   ` Dave Hansen
@ 2013-07-14 14:11   ` Oleg Nesterov
  2013-07-14 19:27     ` Colin Cross
  2013-07-14 14:17   ` Oleg Nesterov
  4 siblings, 1 reply; 44+ messages in thread
From: Oleg Nesterov @ 2013-07-14 14:11 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

Sorry if this was already discussed... I am still trying to think if
we can make a simpler patch.

So, iiuc, the main problem is that if you want to track a vma you need
to prevent the merging with other vma's.

Question: is it important that vma's with the same vma_name should be
_merged_ automatically?

If not, can't we make "do not merge" a separate feature and then add
vma_name?

IOW, please forget about vma_name for the moment. Can't we start with
the trivial patch below? It simply adds the new vm flag which blocks
the merging, and MADV_ to set/clear it.

Yes, this is more limited. Once you set VM_TAINTED this vma is always
isolated. If you unmap a page in this vma, you create 2 isolated vma's.
If, for example, you do MADV_DONTFORK + MADV_DOFORK inside the tainted
vma, you will have 2 adjacent VM_TAINTED vma's with the same flags after
that. But you can do MADV_UNTAINT + MADV_TAINT again if you want to
merge them back. And perhaps this feature is useful even without the
naming. And perhaps we can also add MAP_TAINTED.

Now about vma_name. In this case PR_SET_VMA or MADV_NAME should simply
set/overwrite vma_name and nothing else, no need to do merge/split vma.

And if we add MAP_TAINTED, MAP_ANONYMOUS can reuse pgoff as vma_name
(we only need simple changes in do_mmap_pgoff and mmap_region). But
this is minor.

Or this is too simple/ugly? Probably yes, this means that an allocator
which simply does a lot of MAP_ANONYMOUS + MADV_TAINT will create more
vma's than it needs. So I won't insist but I'd like to ask anyway.

Oleg.

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -90,6 +90,8 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
+#define VM_TAINTED	0x00001000
+
 #define VM_LOCKED	0x00002000
 #define VM_IO           0x00004000	/* Memory mapped I/O or similar */
 
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 4164529..888af10 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -52,6 +52,9 @@
 					   overrides the coredump filter bits */
 #define MADV_DODUMP	17		/* Clear the MADV_NODUMP flag */
 
+#define MADV_TAINT	18
+#define MADV_UNTAINT	19
+
 /* compatibility flags */
 #define MAP_FILE	0
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883..0ddc76f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -81,6 +81,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
 		}
 		new_flags &= ~VM_DONTDUMP;
 		break;
+	case MADV_TAINT:
+		new_flags |= VM_TAINTED;
+		break;
+	case MADV_UNTAINT:
+		new_flags &= ~VM_TAINTED;
+		break;
 	case MADV_MERGEABLE:
 	case MADV_UNMERGEABLE:
 		error = ksm_madvise(vma, start, end, behavior, &new_flags);
@@ -407,6 +413,8 @@ madvise_behavior_valid(int behavior)
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
+	case MADV_TAINT:
+	case MADV_UNTAINT:
 		return 1;
 
 	default:
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..00323b7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1003,9 +1003,9 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 
 	/*
 	 * We later require that vma->vm_flags == vm_flags,
-	 * so this tests vma->vm_flags & VM_SPECIAL, too.
+	 * so this tests vma->vm_flags & VM_XXX, too.
 	 */
-	if (vm_flags & VM_SPECIAL)
+	if (vm_flags & (VM_SPECIAL | VM_TAINTED))
 		return NULL;
 
 	if (prev)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:49                   ` Peter Zijlstra
  2013-07-12 10:01                     ` Ingo Molnar
@ 2013-07-12 20:51                     ` Colin Cross
  2013-09-26  1:24                       ` Colin Cross
  1 sibling, 1 reply; 44+ messages in thread
From: Colin Cross @ 2013-07-12 20:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Pekka Enberg, lkml, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, Linux-MM,
	Linus Torvalds

On Fri, Jul 12, 2013 at 2:49 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
>> * Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
>> > >
>> > > * Peter Zijlstra <peterz@infradead.org> wrote:
>> > >
>> > > > We need those files anyway.. The current proposal is that the entire VMA
>> > > > has a single userspace pointer in it. Or rather a 64bit value.
>> > >
>> > > Yes but accessible via /proc/<PID>/mem or so?
>> >
>> > *shudder*.. yes. But you're again opening two files. The only advantage
>> > of this over userspace writing its own files is that the kernel cleans
>> > things up for you.
>>
>> Opening of the files only occurs in the instrumentation case, which is
>> rare. But temporary files would be forced upon the regular usecase when no
>> instrumentation goes on.
>
> Well, Colin didn't describe the intended use, but I can imagine a case where
> it's not all that rare. System health monitors might frequently want to update
> this.
>
>> > However from what I understood android runs apps as individual users,
>> > and I think we can do per user tmpfs mounts. So app dies, user exits,
>> > mount goes *poof*.
>>
>> Yes, user-space could be smarter about temporary files.
>>
>> Just like big banks could be less risk happy.
>>
>> Yet the reality is that if left alone both apps and banks mess up, I don't
>> think libertarianism works for policy: we are better off offering a
>> framework that is simple, robust, self-contained, low risk and hard to
>> mess up?
>
> Fair enough; but I still want Colin to tell me why he can't do this in
> userspace. And what all he wants to go do with this information etc.
>
> He's basically not told us much at all.

I covered it a little in the thread on the previous version of the
patch, but I'll try to give more detail (and include it in a patch
stack description if I post another version).

In many userspace applications, and especially in VM based
applications like Android uses heavily, there are multiple different
allocators in use.  At a minimum there is libc malloc and the stack,
and in many cases there are libc malloc, the stack, direct syscalls to
mmap anonymous memory, and multiple VM heaps (one for small objects,
one for big objects, etc.).  Each of these layers usually has its own
tools to inspect its usage; malloc by compiling a debug version, the
VM through heap inspection tools, and for direct syscalls there is
usually no way to track them.

On Android we heavily use a set of tools that use an extended version
of the logic covered in Documentation/vm/pagemap.txt to walk all pages
mapped in userspace and slice their usage by process, shared (COW) vs.
unique mappings, backing, etc.  This can account for real physical
memory usage even in cases like fork without exec (which Android uses
heavily to share as many private COW pages as possible between
processes), Kernel SamePage Merging, and clean zero pages.  It
produces a measurement of the pages that only exist in that process
(USS, for unique), and a measurement of the physical memory usage of
that process with the cost of shared pages being evenly split between
processes that share them (PSS).  We need the feature to be efficient
enough to be left on at all times because app developers and end users
can use similar tools exposed through system reports and bugreports to
determine the memory usage of apps.

If all anonymous memory is indistinguishable then figuring out the
real physical memory usage of each heap requires either a pagemap
walking tool that can understand the heap debugging of every layer, or
for every layer's heap debugging tools to implement the pagemap
walking logic, in which case it is hard to get a consistent view of
memory across the whole system.

Tracking the information in userspace leads to all sorts of problems.
It either needs to be stored inside the process, which means every
process has to have an API to export its current heap information upon
request, or it has to be stored externally in a filesystem that
somebody needs to clean up on crashes.  It needs to be readable while
the process is still running, so it has to have some sort of
synchronization with every layer of userspace.  Efficiently tracking
the ranges requires reimplementing something like the kernel vma
trees, and linking to it from every layer of userspace.  It requires
more memory, more syscalls, more runtime cost, and more complexity to
separately track regions that the kernel is already tracking.

This feature is considered critical enough that Dalvik (Android's VM)
uses ashmem, which is effectively deleted tmpfs files, solely to name
their heaps.   I'd like to get rid of as much ashmem use within
Android as possible, with an eye towards deprecating it.  ashmem heaps
work reasonably well for a VM, which is likely to want a single
contiguous region of address space that it will manage on its own, but
falls apart for malloc, which often wants small kernel-allocated
address space regions that may or may not merge with adjacent regions.
 Blindly using ashmem/deleted tmpfs files instead of anonymous mmaps
in malloc doubled the number of vmas in our main system process and
was worse for the GLBenchmark process.

As a concrete example of its usefulness (which should not be
considered the extent of its usefulness, it's just what I happened to
be looking at), I was recently tracking down why we were seeing many
dirty private pages that were all zeroes being merged by KSM.  Using a
mixture of ashmem naming and an early version of this patch, I could
slice the number of KSM merged pages per process and per heap,
which then told me which heap debugging tools I should use to find who
was dirtying large regions of zeroes.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:45                 ` Ingo Molnar
@ 2013-07-12 10:09                   ` Peter Zijlstra
  0 siblings, 0 replies; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12 10:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 11:45:17AM +0200, Ingo Molnar wrote:
> One thing where we could help JITs is to offer a direct channel to any 
> perf profiling process: a prctl(SYS_TRACE) which would send a free-form 
> string to any profiling task interested in it.
> 
> This would be a glorified anonymous write() in essence, without using a 
> temporary file.
> 
> The advantage would be that the string could be captured as-is and copied 
> to the ring-buffer of the profiling task - instead of having to recover it 
> later on.
> 
> This is a model that I'd generally advocate: a single channel [per 
> CPU-ified] for instrumentation/tracing.

'free format text string' is long and cumbersome and requires parsing.

And size is the primary component in speed.

But yes, we could allow injection of something like 

struct PERF_RECORD_SYMBOL {
	struct perf_event_header	header;
	u32				pid, tid;
	u64				addr;
	u64				len;
	char				symbol[];
};

I still like the idea of actually writing valid ELF DSOs in that that would
also get us the TEXT and allow assembly inspection etc. It might also allow a
JIT to re-map those DSOs and decrease warm-up time -- provided the actual
program didn't change meanwhile.

How to do injection is another thing though; I don't much like prctl(). Then
again, offering a special file like /sys/bus/event_source/sink isn't
particularly pretty either.

Then there is the issue of attaching to an already running JIT; we'd need means
to 'catch' up. The DSOs trivially allow this; the injection not so much.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:49                   ` Peter Zijlstra
@ 2013-07-12 10:01                     ` Ingo Molnar
  2013-07-12 20:51                     ` Colin Cross
  1 sibling, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12 10:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > > > 
> > > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > > 
> > > > > We need those files anyway.. The current proposal is that the entire VMA 
> > > > > has a single userspace pointer in it. Or rather a 64bit value.
> > > > 
> > > > Yes but accessible via /proc/<PID>/mem or so?
> > > 
> > > *shudder*.. yes. But you're again opening two files. The only advantage 
> > > of this over userspace writing its own files is that the kernel cleans 
> > > things up for you.
> > 
> > Opening of the files only occurs in the instrumentation case, which is 
> > rare. But temporary files would be forced upon the regular usecase 
> > when no instrumentation goes on.
> 
> Well, Colin didn't describe the intended use, but I can imagine a case 
> where it's not all that rare. System health monitors might frequently 
> want to update this.

That's true.

So maybe it would be better to offer a tracepoint that allows apps to emit 
such information - to any system monitor around to listen.

Making it a vsyscall that does not enter the kernel when the process is 
not being monitored would make it very low overhead.

> > So, these 400+ memory ranges are from Firefox's /proc/*/maps file:
> > 
> <snip>
> > 
> > It's about 35% out of 1300+ mappings that Firefox uses.
> > 
> > It is likely that the ---p mappings (about 40 of them) are guard pages.
> > 
> > How do I tell what the remaining anonymous areas are about?
> 
> Well, if you'd run it within a memory allocator debug framework that 
> would have kept track of this. Typically memory debuggers can keep 
> allocation time stacks etc.
> 
> If I'm not actively debugging firefox I don't give a damn.

Yet people are nosy and find it rather useful to have such 
'heap/stack/vdso/vsyscall' annotations:

 0237c000-0239d000 rw-p 00000000 00:00 0                                  [heap]
 ...
 7fff622af000-7fff622d0000 rw-p 00000000 00:00 0                          [stack]
 7fff623fe000-7fff62400000 r-xp 00000000 00:00 0                          [vdso]
 ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]

and named vmas have names as well:

 7fa5b02eb000-7fa5b6718000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive

so why not allow some simple mechanism to descriptively name anonymous 
vmas as well?

Maybe the 8 bytes shouldn't be a pointer to user-space memory, but a short 
string, a bit like task_struct:comm[16]?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:40                 ` Ingo Molnar
@ 2013-07-12  9:49                   ` Peter Zijlstra
  2013-07-12 10:01                     ` Ingo Molnar
  2013-07-12 20:51                     ` Colin Cross
  0 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 11:40:44AM +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > > 
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > We need those files anyway.. The current proposal is that the entire VMA 
> > > > has a single userspace pointer in it. Or rather a 64bit value.
> > > 
> > > Yes but accessible via /proc/<PID>/mem or so?
> > 
> > *shudder*.. yes. But you're again opening two files. The only advantage 
> > of this over userspace writing its own files is that the kernel cleans 
> > things up for you.
> 
> Opening of the files only occurs in the instrumentation case, which is 
> rare. But temporary files would be forced upon the regular usecase when no 
> instrumentation goes on.

Well, Colin didn't describe the intended use, but I can imagine a case where
it's not all that rare. System health monitors might frequently want to update
this.

> > However from what I understood android runs apps as individual users, 
> > and I think we can do per user tmpfs mounts. So app dies, user exits, 
> > mount goes *poof*.
> 
> Yes, user-space could be smarter about temporary files.
> 
> Just like big banks could be less risk happy.
> 
> Yet the reality is that if left alone both apps and banks mess up, I don't 
> think libertarianism works for policy: we are better off offering a 
> framework that is simple, robust, self-contained, low risk and hard to 
> mess up?

Fair enough; but I still want Colin to tell me why he can't do this in
userspace. And what all he wants to go do with this information etc.

He's basically not told us much at all.

> So, these 400+ memory ranges are from Firefox's /proc/*/maps file:
> 
<snip>
> 
> It's about 35% out of 1300+ mappings that Firefox uses.
> 
> It is likely that the ---p mappings (about 40 of them) are guard pages.
> 
> How do I tell what the remaining anonymous areas are about?

Well, if you'd run it within a memory allocator debug framework that would have
kept track of this. Typically memory debuggers can keep allocation time stacks
etc.

If I'm not actively debugging firefox I don't give a damn.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:38               ` Pekka Enberg
@ 2013-07-12  9:45                 ` Ingo Molnar
  2013-07-12 10:09                   ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:45 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Pekka Enberg <penberg@kernel.org> wrote:

> On Fri, Jul 12, 2013 at 12:26 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> > Well, the JIT profiling case is really special - there we are 
> > constructing code and a symbol table on the fly. Talking to perf via a 
> > temporary file sounds unavoidable (and thus proper), because symbol 
> > information on that level is not something the kernel knows (or should 
> > know) about.
> >
> > I was arguing primarily in the context of the original patch: naming 
> > allocator heaps. Today the kernel makes a few educated guesses about 
> > what each memory area is about, in /proc/*/maps:
> >
> >  34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
> >  34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
> >  34511b2000-34511b7000 rw-p 00000000 00:00 0
> >  7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
> >  7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0
> >  7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0
> >  7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
> >  7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
> >  ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
> >
> > ... but on any larger app there's lots of anon mmap areas that are ... 
> > anonymous! ;-) User-space could help out a bit by naming them. It's 
> > not like there's many heaps, so the performance overhead aspect is 
> > minimal.
> >
> > In the JIT case we have something different, a 'file IO' abstraction 
> > really: the JIT is generating (writing) new code and associated symbol 
> > records. So using temporary files there is natural and proper and most 
> > of the disadvantages I list don't apply because the sheer volume of 
> > new code generated dilutes the overhead of open()/close(), plus we do 
> > need some space for those symbols so a JIT cannot really expect to be 
> > able to run in a pure readonly environment.
> >
> > In the allocator/heap case we have a _memory_ abstraction it's just 
> > that we also want to name the heap minimally.
> >
> > For any finer than vma granularity user-space attributes the kernel 
> > cannot help much, it does not know (and probably should not know) 
> > about all user-space data structures.
> >
> > Right now I don't see any good way to merge the two. (might be due to 
> > lack of imagination)
> 
> I have no trouble with the imagination part but you make a strong point 
> about the kernel not helping at finer granularity than vma anyway.
> 
> The current functionality is already quite helpful for VMs as well. We 
> could annotate the different GC and JIT regions and make perf more 
> human-friendly by default.

One thing where we could help JITs is to offer a direct channel to any 
perf profiling process: a prctl(SYS_TRACE) which would send a free-form 
string to any profiling task interested in it.

This would be a glorified anonymous write() in essence, without using a 
temporary file.

The advantage would be that the string could be captured as-is and copied 
to the ring-buffer of the profiling task - instead of having to recover it 
later on.

This is a model that I'd generally advocate: a single channel [per 
CPU-ified] for instrumentation/tracing.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:27               ` Peter Zijlstra
@ 2013-07-12  9:40                 ` Ingo Molnar
  2013-07-12  9:49                   ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > We need those files anyway.. The current proposal is that the entire VMA 
> > > has a single userspace pointer in it. Or rather a 64bit value.
> > 
> > Yes but accessible via /proc/<PID>/mem or so?
> 
> *shudder*.. yes. But you're again opening two files. The only advantage 
> of this over userspace writing its own files is that the kernel cleans 
> things up for you.

Opening of the files only occurs in the instrumentation case, which is 
rare. But temporary files would be forced upon the regular usecase when no 
instrumentation goes on.

> However from what I understood android runs apps as individual users, 
> and I think we can do per user tmpfs mounts. So app dies, user exits, 
> mount goes *poof*.

Yes, user-space could be smarter about temporary files.

Just like big banks could be less risk happy.

Yet the reality is that if left alone both apps and banks mess up, I don't 
think libertarianism works for policy: we are better off offering a 
framework that is simple, robust, self-contained, low risk and hard to 
mess up?

> > I was thinking about it in the context of its original purpose: naming 
> > heap areas, which are pretty anonymous right now - /proc/*/maps is 
> > full of mystery ranges today.
> 
> It is.. although I've myself never had trouble with that. Most every 
> memory debugging that I've used/written over the past two decades was 
> adequately able to identify memory regions.

So, these 400+ memory ranges are from Firefox's /proc/*/maps file:

7fbf59eff000-7fbf59f00000 ---p 00000000 00:00 0 
7fbf59f00000-7fbf5a800000 rw-p 00000000 00:00 0 
7fbf5a900000-7fbf5aa00000 rw-p 00000000 00:00 0 
7fbf5ad00000-7fbf5ae00000 rw-p 00000000 00:00 0 
7fbf5af00000-7fbf5b000000 rw-p 00000000 00:00 0 
7fbf5b100000-7fbf5b200000 rw-p 00000000 00:00 0 
7fbf5b500000-7fbf5b600000 rw-p 00000000 00:00 0 
7fbf5b800000-7fbf5b900000 rw-p 00000000 00:00 0 
7fbf5bb00000-7fbf5bc00000 rw-p 00000000 00:00 0 
7fbf5c000000-7fbf5c100000 rw-p 00000000 00:00 0 
7fbf5c200000-7fbf5c300000 rw-p 00000000 00:00 0 
7fbf5c800000-7fbf5c900000 rw-p 00000000 00:00 0 
7fbf5ca00000-7fbf5cb00000 rw-p 00000000 00:00 0 
7fbf5ce00000-7fbf5cf00000 rw-p 00000000 00:00 0 
7fbf5d200000-7fbf5d300000 rw-p 00000000 00:00 0 
7fbf5d400000-7fbf5d500000 rw-p 00000000 00:00 0 
7fbf5db00000-7fbf5dc00000 rw-p 00000000 00:00 0 
7fbf5dd00000-7fbf5de00000 rw-p 00000000 00:00 0 
7fbf5df00000-7fbf5e000000 rw-p 00000000 00:00 0 
7fbf5e300000-7fbf5e400000 rw-p 00000000 00:00 0 
7fbf5e500000-7fbf5e600000 rw-p 00000000 00:00 0 
7fbf5e900000-7fbf5ea00000 rw-p 00000000 00:00 0 
7fbf5eb00000-7fbf5ec00000 rw-p 00000000 00:00 0 
7fbf5ed00000-7fbf5ef00000 rw-p 00000000 00:00 0 
7fbf5f2ff000-7fbf5f300000 ---p 00000000 00:00 0 
7fbf5f300000-7fbf5fc00000 rw-p 00000000 00:00 0 
7fbf5fd00000-7fbf5fe00000 rw-p 00000000 00:00 0 
7fbf5ff00000-7fbf60000000 rw-p 00000000 00:00 0 
7fbf60200000-7fbf60300000 rw-p 00000000 00:00 0 
7fbf60600000-7fbf60700000 rw-p 00000000 00:00 0 
7fbf60800000-7fbf60900000 rw-p 00000000 00:00 0 
7fbf60a00000-7fbf60b00000 rw-p 00000000 00:00 0 
7fbf60e00000-7fbf60f00000 rw-p 00000000 00:00 0 
7fbf61000000-7fbf61100000 rw-p 00000000 00:00 0 
7fbf61a00000-7fbf61b00000 rw-p 00000000 00:00 0 
7fbf61c00000-7fbf61d00000 rw-p 00000000 00:00 0 
7fbf62000000-7fbf62100000 rw-p 00000000 00:00 0 
7fbf62200000-7fbf62300000 rw-p 00000000 00:00 0 
7fbf62400000-7fbf62500000 rw-p 00000000 00:00 0 
7fbf62600000-7fbf62700000 rw-p 00000000 00:00 0 
7fbf62800000-7fbf62a00000 rw-p 00000000 00:00 0 
7fbf62e00000-7fbf63000000 rw-p 00000000 00:00 0 
7fbf63100000-7fbf63200000 rw-p 00000000 00:00 0 
7fbf63300000-7fbf63400000 rw-p 00000000 00:00 0 
7fbf63600000-7fbf63700000 rw-p 00000000 00:00 0 
7fbf63900000-7fbf63a00000 rw-p 00000000 00:00 0 
7fbf63b00000-7fbf63c00000 rw-p 00000000 00:00 0 
7fbf63d00000-7fbf63e00000 rw-p 00000000 00:00 0 
7fbf63f00000-7fbf64000000 rw-p 00000000 00:00 0 
7fbf64100000-7fbf64200000 rw-p 00000000 00:00 0 
7fbf64300000-7fbf64400000 rw-p 00000000 00:00 0 
7fbf64500000-7fbf64600000 rw-p 00000000 00:00 0 
7fbf64700000-7fbf64800000 rw-p 00000000 00:00 0 
7fbf64a00000-7fbf64b00000 rw-p 00000000 00:00 0 
7fbf64c00000-7fbf64d00000 rw-p 00000000 00:00 0 
7fbf64e00000-7fbf64f00000 rw-p 00000000 00:00 0 
7fbf65400000-7fbf65500000 rw-p 00000000 00:00 0 
7fbf65600000-7fbf65700000 rw-p 00000000 00:00 0 
7fbf65800000-7fbf65900000 rw-p 00000000 00:00 0 
7fbf65a00000-7fbf65b00000 rw-p 00000000 00:00 0 
7fbf65c00000-7fbf65d00000 rw-p 00000000 00:00 0 
7fbf65e00000-7fbf65f00000 rw-p 00000000 00:00 0 
7fbf66000000-7fbf66100000 rw-p 00000000 00:00 0 
7fbf66200000-7fbf66300000 rw-p 00000000 00:00 0 
7fbf663ff000-7fbf66400000 ---p 00000000 00:00 0 
7fbf66400000-7fbf66d00000 rw-p 00000000 00:00 0 
7fbf66e00000-7fbf66f00000 rw-p 00000000 00:00 0 
7fbf67000000-7fbf67100000 rw-p 00000000 00:00 0 
7fbf67200000-7fbf67300000 rw-p 00000000 00:00 0 
7fbf67400000-7fbf67500000 rw-p 00000000 00:00 0 
7fbf67600000-7fbf67700000 rw-p 00000000 00:00 0 
7fbf67800000-7fbf67900000 rw-p 00000000 00:00 0 
7fbf67a00000-7fbf67b00000 rw-p 00000000 00:00 0 
7fbf67c00000-7fbf67d00000 rw-p 00000000 00:00 0 
7fbf67e00000-7fbf67f00000 rw-p 00000000 00:00 0 
7fbf68000000-7fbf68100000 rw-p 00000000 00:00 0 
7fbf68200000-7fbf68300000 rw-p 00000000 00:00 0 
7fbf68400000-7fbf68500000 rw-p 00000000 00:00 0 
7fbf68600000-7fbf68700000 rw-p 00000000 00:00 0 
7fbf68800000-7fbf68900000 rw-p 00000000 00:00 0 
7fbf68a00000-7fbf68b00000 rw-p 00000000 00:00 0 
7fbf68c00000-7fbf68d00000 rw-p 00000000 00:00 0 
7fbf68e00000-7fbf68f00000 rw-p 00000000 00:00 0 
7fbf69000000-7fbf69100000 rw-p 00000000 00:00 0 
7fbf692ff000-7fbf69300000 ---p 00000000 00:00 0 
7fbf69e00000-7fbf69f00000 rw-p 00000000 00:00 0 
7fbf6a000000-7fbf6a100000 rw-p 00000000 00:00 0 
7fbf6a200000-7fbf6a300000 rw-p 00000000 00:00 0 
7fbf6a400000-7fbf6a500000 rw-p 00000000 00:00 0 
7fbf6a600000-7fbf6a700000 rw-p 00000000 00:00 0 
7fbf6a800000-7fbf6a900000 rw-p 00000000 00:00 0 
7fbf6aa00000-7fbf6ad00000 rw-p 00000000 00:00 0 
7fbf6ae00000-7fbf6b000000 rw-p 00000000 00:00 0 
7fbf6b100000-7fbf6b200000 rw-p 00000000 00:00 0 
7fbf6b300000-7fbf6b400000 rw-p 00000000 00:00 0 
7fbf6b700000-7fbf6b800000 rw-p 00000000 00:00 0 
7fbf6b900000-7fbf6ba00000 rw-p 00000000 00:00 0 
7fbf6bb00000-7fbf6bd00000 rw-p 00000000 00:00 0 
7fbf6be00000-7fbf6bf00000 rw-p 00000000 00:00 0 
7fbf6c000000-7fbf6c100000 rw-p 00000000 00:00 0 
7fbf6c200000-7fbf6c300000 rw-p 00000000 00:00 0 
7fbf6c400000-7fbf6c500000 rw-p 00000000 00:00 0 
7fbf6c600000-7fbf6c700000 rw-p 00000000 00:00 0 
7fbf6c800000-7fbf6c900000 rw-p 00000000 00:00 0 
7fbf6ca00000-7fbf6cb00000 rw-p 00000000 00:00 0 
7fbf6cc00000-7fbf6cd00000 rw-p 00000000 00:00 0 
7fbf6ce00000-7fbf6d000000 rw-p 00000000 00:00 0 
7fbf6d100000-7fbf6d200000 rw-p 00000000 00:00 0 
7fbf6d300000-7fbf6d400000 rw-p 00000000 00:00 0 
7fbf6d500000-7fbf6da00000 rw-p 00000000 00:00 0 
7fbf6db00000-7fbf6dc00000 rw-p 00000000 00:00 0 
7fbf6e100000-7fbf6e200000 rw-p 00000000 00:00 0 
7fbf6e300000-7fbf6e400000 rw-p 00000000 00:00 0 
7fbf6e500000-7fbf6e600000 rw-p 00000000 00:00 0 
7fbf6e700000-7fbf6ed00000 rw-p 00000000 00:00 0 
7fbf6ee00000-7fbf6ef00000 rw-p 00000000 00:00 0 
7fbf6f000000-7fbf6f200000 rw-p 00000000 00:00 0 
7fbf6f300000-7fbf6f400000 rw-p 00000000 00:00 0 
7fbf6f500000-7fbf6f800000 rw-p 00000000 00:00 0 
7fbf6f900000-7fbf6fa00000 rw-p 00000000 00:00 0 
7fbf6fb00000-7fbf6fc00000 rw-p 00000000 00:00 0 
7fbf6fd00000-7fbf6fe00000 rw-p 00000000 00:00 0 
7fbf6ff00000-7fbf70000000 rw-p 00000000 00:00 0 
7fbf70100000-7fbf70200000 rw-p 00000000 00:00 0 
7fbf70300000-7fbf70400000 rw-p 00000000 00:00 0 
7fbf70500000-7fbf70600000 rw-p 00000000 00:00 0 
7fbf70700000-7fbf70800000 rw-p 00000000 00:00 0 
7fbf70900000-7fbf70b00000 rw-p 00000000 00:00 0 
7fbf70c00000-7fbf70d00000 rw-p 00000000 00:00 0 
7fbf70e00000-7fbf71300000 rw-p 00000000 00:00 0 
7fbf71400000-7fbf71500000 rw-p 00000000 00:00 0 
7fbf71600000-7fbf71700000 rw-p 00000000 00:00 0 
7fbf71800000-7fbf71900000 rw-p 00000000 00:00 0 
7fbf71a00000-7fbf71c00000 rw-p 00000000 00:00 0 
7fbf71d00000-7fbf71e00000 rw-p 00000000 00:00 0 
7fbf71f00000-7fbf72000000 rw-p 00000000 00:00 0 
7fbf72100000-7fbf72200000 rw-p 00000000 00:00 0 
7fbf72300000-7fbf72400000 rw-p 00000000 00:00 0 
7fbf72500000-7fbf72600000 rw-p 00000000 00:00 0 
7fbf72700000-7fbf72800000 rw-p 00000000 00:00 0 
7fbf72900000-7fbf72a00000 rw-p 00000000 00:00 0 
7fbf72b00000-7fbf72c00000 rw-p 00000000 00:00 0 
7fbf72d00000-7fbf72e00000 rw-p 00000000 00:00 0 
7fbf72f00000-7fbf73000000 rw-p 00000000 00:00 0 
7fbf73100000-7fbf73200000 rw-p 00000000 00:00 0 
7fbf73300000-7fbf73400000 rw-p 00000000 00:00 0 
7fbf734fb000-7fbf734fc000 ---p 00000000 00:00 0 
7fbf73d00000-7fbf73e00000 rw-p 00000000 00:00 0 
7fbf73f00000-7fbf74000000 rw-p 00000000 00:00 0 
7fbf741ff000-7fbf74200000 ---p 00000000 00:00 0 
7fbf74200000-7fbf74d00000 rw-p 00000000 00:00 0 
7fbf74e00000-7fbf75000000 rw-p 00000000 00:00 0 
7fbf75100000-7fbf75400000 rw-p 00000000 00:00 0 
7fbf754ff000-7fbf75500000 ---p 00000000 00:00 0 
7fbf75eff000-7fbf75f00000 ---p 00000000 00:00 0 
7fbf76900000-7fbf76b00000 rw-p 00000000 00:00 0 
7fbf76c00000-7fbf76d00000 rw-p 00000000 00:00 0 
7fbf76e00000-7fbf76f00000 rw-p 00000000 00:00 0 
7fbf77000000-7fbf77100000 rw-p 00000000 00:00 0 
7fbf77200000-7fbf77300000 rw-p 00000000 00:00 0 
7fbf77400000-7fbf77500000 rw-p 00000000 00:00 0 
7fbf77600000-7fbf77700000 rw-p 00000000 00:00 0 
7fbf77800000-7fbf77900000 rw-p 00000000 00:00 0 
7fbf77a00000-7fbf77b00000 rw-p 00000000 00:00 0 
7fbf77c00000-7fbf77e00000 rw-p 00000000 00:00 0 
7fbf77f00000-7fbf78000000 rw-p 00000000 00:00 0 
7fbf78100000-7fbf78200000 rw-p 00000000 00:00 0 
7fbf78300000-7fbf78400000 rw-p 00000000 00:00 0 
7fbf78500000-7fbf78700000 rw-p 00000000 00:00 0 
7fbf78800000-7fbf78900000 rw-p 00000000 00:00 0 
7fbf78a00000-7fbf78b00000 rw-p 00000000 00:00 0 
7fbf78c00000-7fbf78d00000 rw-p 00000000 00:00 0 
7fbf78e00000-7fbf78f00000 rw-p 00000000 00:00 0 
7fbf79000000-7fbf79100000 rw-p 00000000 00:00 0 
7fbf79200000-7fbf79300000 rw-p 00000000 00:00 0 
7fbf79400000-7fbf79600000 rw-p 00000000 00:00 0 
7fbf79700000-7fbf79900000 rw-p 00000000 00:00 0 
7fbf79a00000-7fbf79b00000 rw-p 00000000 00:00 0 
7fbf79c00000-7fbf79e00000 rw-p 00000000 00:00 0 
7fbf79f00000-7fbf7a000000 rw-p 00000000 00:00 0 
7fbf7a100000-7fbf7a200000 rw-p 00000000 00:00 0 
7fbf7a300000-7fbf7a600000 rw-p 00000000 00:00 0 
7fbf7a700000-7fbf7a800000 rw-p 00000000 00:00 0 
7fbf7ab00000-7fbf7ac00000 rw-p 00000000 00:00 0 
7fbf7ad00000-7fbf7ae00000 rw-p 00000000 00:00 0 
7fbf7af00000-7fbf7b000000 rw-p 00000000 00:00 0 
7fbf84100000-7fbf84200000 rw-p 00000000 00:00 0 
7fbf84600000-7fbf84f00000 rw-p 00000000 00:00 0 
7fbf85000000-7fbf85100000 rw-p 00000000 00:00 0 
7fbf85200000-7fbf85400000 rw-p 00000000 00:00 0 
7fbf85500000-7fbf85600000 rw-p 00000000 00:00 0 
7fbf85700000-7fbf85800000 rw-p 00000000 00:00 0 
7fbf85900000-7fbf85a00000 rw-p 00000000 00:00 0 
7fbf85b00000-7fbf85d00000 rw-p 00000000 00:00 0 
7fbf85e00000-7fbf86000000 rw-p 00000000 00:00 0 
7fbf86100000-7fbf86200000 rw-p 00000000 00:00 0 
7fbf86300000-7fbf86400000 rw-p 00000000 00:00 0 
7fbf86500000-7fbf86700000 rw-p 00000000 00:00 0 
7fbf86800000-7fbf86e00000 rw-p 00000000 00:00 0 
7fbf86f00000-7fbf87000000 rw-p 00000000 00:00 0 
7fbf87100000-7fbf87200000 rw-p 00000000 00:00 0 
7fbf87300000-7fbf87400000 rw-p 00000000 00:00 0 
7fbf87500000-7fbf87700000 rw-p 00000000 00:00 0 
7fbf87800000-7fbf87900000 rw-p 00000000 00:00 0 
7fbf87a00000-7fbf87b00000 rw-p 00000000 00:00 0 
7fbf87c00000-7fbf87d00000 rw-p 00000000 00:00 0 
7fbf87e00000-7fbf87f00000 rw-p 00000000 00:00 0 
7fbf88000000-7fbf88100000 rw-p 00000000 00:00 0 
7fbf88100000-7fbf88200000 rw-p 00000000 00:00 0 
7fbf88300000-7fbf88600000 rw-p 00000000 00:00 0 
7fbf887fe000-7fbf887ff000 ---p 00000000 00:00 0 
7fbf89100000-7fbf89200000 rw-p 00000000 00:00 0 
7fbf89300000-7fbf89400000 rw-p 00000000 00:00 0 
7fbf89500000-7fbf89600000 rw-p 00000000 00:00 0 
7fbf89700000-7fbf89900000 rw-p 00000000 00:00 0 
7fbf899f9000-7fbf899fa000 ---p 00000000 00:00 0 
7fbf8a200000-7fbf8a300000 rw-p 00000000 00:00 0 
7fbf8a400000-7fbf8a500000 rw-p 00000000 00:00 0 
7fbf8a600000-7fbf8a700000 rw-p 00000000 00:00 0 
7fbf8a800000-7fbf8a900000 rw-p 00000000 00:00 0 
7fbf8aa00000-7fbf8ab00000 rw-p 00000000 00:00 0 
7fbf8ab00000-7fbf8ad00000 rw-p 00000000 00:00 0 
7fbf8ae00000-7fbf8af00000 rw-p 00000000 00:00 0 
7fbf8b000000-7fbf8b100000 rw-p 00000000 00:00 0 
7fbf8b200000-7fbf8b300000 rw-p 00000000 00:00 0 
7fbf8b600000-7fbf8b700000 rw-p 00000000 00:00 0 
7fbf8b800000-7fbf8b900000 rw-p 00000000 00:00 0 
7fbf8ba00000-7fbf8bc00000 rw-p 00000000 00:00 0 
7fbf8bd00000-7fbf8be00000 rw-p 00000000 00:00 0 
7fbf8bf00000-7fbf8c000000 rw-p 00000000 00:00 0 
7fbf8c100000-7fbf8c200000 rw-p 00000000 00:00 0 
7fbf8c300000-7fbf8c400000 rw-p 00000000 00:00 0 
7fbf8c500000-7fbf8c600000 rw-p 00000000 00:00 0 
7fbf8c700000-7fbf8c800000 rw-p 00000000 00:00 0 
7fbf8c900000-7fbf8ca00000 rw-p 00000000 00:00 0 
7fbf8cb00000-7fbf8cc00000 rw-p 00000000 00:00 0 
7fbf8cd00000-7fbf8d300000 rw-p 00000000 00:00 0 
7fbf8d400000-7fbf8d600000 rw-p 00000000 00:00 0 
7fbf8d700000-7fbf8d800000 rw-p 00000000 00:00 0 
7fbf8d900000-7fbf8da00000 rw-p 00000000 00:00 0 
7fbf8dafc000-7fbf8dafd000 ---p 00000000 00:00 0 
7fbf8e2fd000-7fbf8e2fe000 ---p 00000000 00:00 0 
7fbf8eafe000-7fbf8eaff000 ---p 00000000 00:00 0 
7fbf8f2ff000-7fbf8f300000 ---p 00000000 00:00 0 
7fbf8fe00000-7fbf8ff00000 rw-p 00000000 00:00 0 
7fbf90000000-7fbf90100000 rw-p 00000000 00:00 0 
7fbf90200000-7fbf90300000 rw-p 00000000 00:00 0 
7fbf90400000-7fbf90500000 rw-p 00000000 00:00 0 
7fbf90600000-7fbf90700000 rw-p 00000000 00:00 0 
7fbf907ff000-7fbf90800000 ---p 00000000 00:00 0 
7fbf91200000-7fbf91300000 rw-p 00000000 00:00 0 
7fbf913ff000-7fbf91400000 ---p 00000000 00:00 0 
7fbf91800000-7fbf91900000 rw-p 00000000 00:00 0 
7fbf919fe000-7fbf919ff000 ---p 00000000 00:00 0 
7fbf921ff000-7fbf92200000 ---p 00000000 00:00 0 
7fbf92c00000-7fbf92d00000 rw-p 00000000 00:00 0 
7fbf92e00000-7fbf92f00000 rw-p 00000000 00:00 0 
7fbf93000000-7fbf93100000 rw-p 00000000 00:00 0 
7fbf93200000-7fbf93300000 rw-p 00000000 00:00 0 
7fbf93400000-7fbf93500000 rw-p 00000000 00:00 0 
7fbf93600000-7fbf93700000 rw-p 00000000 00:00 0 
7fbf937ff000-7fbf93800000 ---p 00000000 00:00 0 
7fbf94200000-7fbf94300000 rw-p 00000000 00:00 0 
7fbf94400000-7fbf94500000 rw-p 00000000 00:00 0 
7fbf94600000-7fbf94700000 rw-p 00000000 00:00 0 
7fbf94800000-7fbf94900000 rw-p 00000000 00:00 0 
7fbf94a00000-7fbf94b00000 rw-p 00000000 00:00 0 
7fbf94c00000-7fbf94d00000 rw-p 00000000 00:00 0 
7fbf94e00000-7fbf94f00000 rw-p 00000000 00:00 0 
7fbf95000000-7fbf95100000 rw-p 00000000 00:00 0 
7fbf95200000-7fbf95300000 rw-p 00000000 00:00 0 
7fbf95400000-7fbf95500000 rw-p 00000000 00:00 0 
7fbf95600000-7fbf95700000 rw-p 00000000 00:00 0 
7fbf95800000-7fbf95900000 rw-p 00000000 00:00 0 
7fbf95a00000-7fbf95b00000 rw-p 00000000 00:00 0 
7fbf95c00000-7fbf95d00000 rw-p 00000000 00:00 0 
7fbf95e00000-7fbf95f00000 rw-p 00000000 00:00 0 
7fbf96000000-7fbf96100000 rw-p 00000000 00:00 0 
7fbf96200000-7fbf96300000 rw-p 00000000 00:00 0 
7fbf96400000-7fbf96500000 rw-p 00000000 00:00 0 
7fbf96600000-7fbf96700000 rw-p 00000000 00:00 0 
7fbf96800000-7fbf96a00000 rw-p 00000000 00:00 0 
7fbf96b00000-7fbf96c00000 rw-p 00000000 00:00 0 
7fbf96d00000-7fbf96e00000 rw-p 00000000 00:00 0 
7fbf96f00000-7fbf97000000 rw-p 00000000 00:00 0 
7fbf97100000-7fbf97200000 rw-p 00000000 00:00 0 
7fbf97300000-7fbf97400000 rw-p 00000000 00:00 0 
7fbf97500000-7fbf97600000 rw-p 00000000 00:00 0 
7fbf97700000-7fbf97800000 rw-p 00000000 00:00 0 
7fbf97900000-7fbf97a00000 rw-p 00000000 00:00 0 
7fbf97b00000-7fbf97c00000 rw-p 00000000 00:00 0 
7fbf97d00000-7fbf97e00000 rw-p 00000000 00:00 0 
7fbf97f00000-7fbf98000000 rw-p 00000000 00:00 0 
7fbf98100000-7fbf98200000 rw-p 00000000 00:00 0 
7fbf98300000-7fbf98400000 rw-p 00000000 00:00 0 
7fbf98500000-7fbf98600000 rw-p 00000000 00:00 0 
7fbf98700000-7fbf98800000 rw-p 00000000 00:00 0 
7fbf98900000-7fbf98a00000 rw-p 00000000 00:00 0 
7fbf98b00000-7fbf98c00000 rw-p 00000000 00:00 0 
7fbf98d00000-7fbf98e00000 rw-p 00000000 00:00 0 
7fbf98f00000-7fbf99000000 rw-p 00000000 00:00 0 
7fbf99100000-7fbf99200000 rw-p 00000000 00:00 0 
7fbf99300000-7fbf99400000 rw-p 00000000 00:00 0 
7fbf99500000-7fbf99600000 rw-p 00000000 00:00 0 
7fbf99700000-7fbf99800000 rw-p 00000000 00:00 0 
7fbf99900000-7fbf99a00000 rw-p 00000000 00:00 0 
7fbf99b00000-7fbf99c00000 rw-p 00000000 00:00 0 
7fbf99d00000-7fbf99e00000 rw-p 00000000 00:00 0 
7fbf99f00000-7fbf9a000000 rw-p 00000000 00:00 0 
7fbf9a100000-7fbf9a200000 rw-p 00000000 00:00 0 
7fbf9a300000-7fbf9a400000 rw-p 00000000 00:00 0 
7fbf9a500000-7fbf9a600000 rw-p 00000000 00:00 0 
7fbf9a700000-7fbf9a800000 rw-p 00000000 00:00 0 
7fbf9a900000-7fbf9aa00000 rw-p 00000000 00:00 0 
7fbf9ab00000-7fbf9ac00000 rw-p 00000000 00:00 0 
7fbf9ad00000-7fbf9ae00000 rw-p 00000000 00:00 0 
7fbf9af00000-7fbf9b000000 rw-p 00000000 00:00 0 
7fbf9b100000-7fbf9b200000 rw-p 00000000 00:00 0 
7fbf9b300000-7fbf9b400000 rw-p 00000000 00:00 0 
7fbf9b500000-7fbf9b600000 rw-p 00000000 00:00 0 
7fbf9b700000-7fbf9b900000 rw-p 00000000 00:00 0 
7fbf9ba00000-7fbf9bb00000 rw-p 00000000 00:00 0 
7fbf9bc00000-7fbf9bd00000 rw-p 00000000 00:00 0 
7fbf9be00000-7fbf9bf00000 rw-p 00000000 00:00 0 
7fbf9c000000-7fbf9c100000 rw-p 00000000 00:00 0 
7fbf9c200000-7fbf9c700000 rw-p 00000000 00:00 0 
7fbf9c800000-7fbf9c900000 rw-p 00000000 00:00 0 
7fbf9ca00000-7fbf9cd00000 rw-p 00000000 00:00 0 
7fbf9ce00000-7fbf9cf00000 rw-p 00000000 00:00 0 
7fbf9d000000-7fbf9d100000 rw-p 00000000 00:00 0 
7fbf9d200000-7fbf9d300000 rw-p 00000000 00:00 0 
7fbf9d400000-7fbf9d500000 rw-p 00000000 00:00 0 
7fbf9d600000-7fbf9d700000 rw-p 00000000 00:00 0 
7fbf9d800000-7fbf9d900000 rw-p 00000000 00:00 0 
7fbf9da00000-7fbf9db00000 rw-p 00000000 00:00 0 
7fbf9dc00000-7fbf9dd00000 rw-p 00000000 00:00 0 
7fbf9de00000-7fbf9df00000 rw-p 00000000 00:00 0 
7fbf9e000000-7fbf9e100000 rw-p 00000000 00:00 0 
7fbf9e200000-7fbf9e300000 rw-p 00000000 00:00 0 
7fbf9e400000-7fbf9e500000 rw-p 00000000 00:00 0 
7fbf9e600000-7fbf9e700000 rw-p 00000000 00:00 0 
7fbf9e800000-7fbf9e900000 rw-p 00000000 00:00 0 
7fbf9ea00000-7fbf9eb00000 rw-p 00000000 00:00 0 
7fbf9ec00000-7fbf9ed00000 rw-p 00000000 00:00 0 
7fbf9f200000-7fbf9f300000 rw-p 00000000 00:00 0 
7fbf9f600000-7fbf9f700000 rw-p 00000000 00:00 0 
7fbf9fc00000-7fbf9fd00000 rw-p 00000000 00:00 0 
7fbf9fe00000-7fbf9ff00000 rw-p 00000000 00:00 0 
7fbfa0400000-7fbfa0c00000 rw-p 00000000 00:00 0 
7fbfa0d00000-7fbfa0e00000 rw-p 00000000 00:00 0 
7fbfa0f00000-7fbfa1000000 rw-p 00000000 00:00 0 
7fbfa1100000-7fbfa1300000 rw-p 00000000 00:00 0 
7fbfa1400000-7fbfa1700000 rw-p 00000000 00:00 0 
7fbfa1a00000-7fbfa4a00000 rw-p 00000000 00:00 0 
7fbfa4e00000-7fbfa7400000 rw-p 00000000 00:00 0 
7fbfa74fa000-7fbfa74fb000 ---p 00000000 00:00 0 
7fbfa7cfb000-7fbfa7cfc000 ---p 00000000 00:00 0 
7fbfa84fc000-7fbfa84fd000 ---p 00000000 00:00 0 
7fbfa908d000-7fbfa9091000 rw-p 00000000 00:00 0 
7fbfa94fe000-7fbfa94ff000 ---p 00000000 00:00 0 
7fbfa9cff000-7fbfa9d00000 ---p 00000000 00:00 0 
7fbfab2ff000-7fbfab300000 ---p 00000000 00:00 0 
7fbfac6ff000-7fbfac700000 ---p 00000000 00:00 0 
7fbfaec2c000-7fbfaec2d000 ---p 00000000 00:00 0 
7fbfaef00000-7fbfaf300000 rw-p 00000000 00:00 0 
7fbfafa00000-7fbfafb00000 rw-p 00000000 00:00 0 
7fbfafc00000-7fbfafd00000 rw-p 00000000 00:00 0 
7fbfb087c000-7fbfb087d000 rw-p 00000000 00:00 0 
7fbfb0f00000-7fbfb1000000 rw-p 00000000 00:00 0 
7fbfb1700000-7fbfb1d00000 rw-p 00000000 00:00 0 
7fbfb1dff000-7fbfb1e00000 ---p 00000000 00:00 0 
7fbfb283e000-7fbfb284e000 rwxp 00000000 00:00 0 
7fbfb2b00000-7fbfb3200000 rw-p 00000000 00:00 0 
7fbfb3200000-7fbfb3300000 rw-p 00000000 00:00 0 
7fbfb3500000-7fbfb3c00000 rw-p 00000000 00:00 0 
7fbfb3c1b000-7fbfb3c3b000 rwxp 00000000 00:00 0 
7fbfb3cfb000-7fbfb3cfc000 ---p 00000000 00:00 0 
7fbfb4700000-7fbfb4b00000 rw-p 00000000 00:00 0 
7fbfb4b00000-7fbfb5200000 rw-p 00000000 00:00 0 
7fbfb56fd000-7fbfb56fe000 ---p 00000000 00:00 0 
7fbfb5efe000-7fbfb5eff000 ---p 00000000 00:00 0 
7fbfb66ff000-7fbfb6700000 ---p 00000000 00:00 0 
7fbfb70be000-7fbfb70bf000 ---p 00000000 00:00 0 
7fbfb70ff000-7fbfb7100000 ---p 00000000 00:00 0 
7fbfb85c2000-7fbfb85f0000 rw-p 00000000 00:00 0 
7fbfb889b000-7fbfb889c000 rw-p 00000000 00:00 0 
7fbfb9362000-7fbfb9363000 rw-p 00000000 00:00 0 
7fbfb987a000-7fbfb987b000 rw-p 00000000 00:00 0 
7fbfb9ec2000-7fbfb9ec3000 rw-p 00000000 00:00 0 
7fbfba712000-7fbfba713000 rw-p 00000000 00:00 0 
7fbfbac2e000-7fbfbac30000 rw-p 00000000 00:00 0 
7fbfbd7f1000-7fbfbdb00000 rw-p 00000000 00:00 0 
7fbfbdbfd000-7fbfbdbfe000 ---p 00000000 00:00 0 
7fbfbe3fe000-7fbfbe3ff000 ---p 00000000 00:00 0 
7fbfbebff000-7fbfbec00000 ---p 00000000 00:00 0 
7fbfbf9ff000-7fbfbfa00000 ---p 00000000 00:00 0 
7fbfc7e00000-7fbfc8500000 rw-p 00000000 00:00 0 
7fbfc858a000-7fbfc859a000 rwxp 00000000 00:00 0 
7fbfc859a000-7fbfc859b000 ---p 00000000 00:00 0 
7fbfc929b000-7fbfc929f000 rw-p 00000000 00:00 0 
7fbfc9700000-7fbfc9900000 rw-p 00000000 00:00 0 
7fbfc990b000-7fbfc993b000 rwxp 00000000 00:00 0 
7fbfc9986000-7fbfc9996000 rwxp 00000000 00:00 0 
7fbfc99ef000-7fbfc99ff000 rwxp 00000000 00:00 0 
7fbfc99ff000-7fbfc9a00000 ---p 00000000 00:00 0 
7fbfcae02000-7fbfcae32000 rwxp 00000000 00:00 0 
7fbfcae32000-7fbfcae33000 ---p 00000000 00:00 0 
7fbfcb633000-7fbfcb634000 ---p 00000000 00:00 0 
7fbfcc234000-7fbfcc235000 ---p 00000000 00:00 0 
7fbfcca35000-7fbfcca36000 ---p 00000000 00:00 0 
7fbfcdaf8000-7fbfcdaf9000 rw-p 00000000 00:00 0 
7fbfcdd00000-7fbfcde00000 rw-p 00000000 00:00 0 
7fbfcde04000-7fbfcde24000 rwxp 00000000 00:00 0 
7fbfcde2c000-7fbfcde3c000 rwxp 00000000 00:00 0 
7fbfcde50000-7fbfcde70000 rwxp 00000000 00:00 0 
7fbfce0e7000-7fbfce0e8000 ---p 00000000 00:00 0 
7fbfcef00000-7fbfcf000000 rw-p 00000000 00:00 0 
7fbfcf005000-7fbfcf015000 rwxp 00000000 00:00 0 
7fbfcf0ff000-7fbfcf100000 ---p 00000000 00:00 0 
7fbfcfa0a000-7fbfcfa1a000 rwxp 00000000 00:00 0 
7fbfd030a000-7fbfd030b000 ---p 00000000 00:00 0 
7fbfd113c000-7fbfd113e000 rw-p 00000000 00:00 0 
7fbfd2af6000-7fbfd2af7000 rw-p 00000000 00:00 0 
7fbfd326d000-7fbfd326e000 rw-p 00000000 00:00 0 
7fbfd401f000-7fbfd4020000 rw-p 00000000 00:00 0 
7fbfd48f1000-7fbfd48f2000 rw-p 00000000 00:00 0 
7fbfd4b8c000-7fbfd4b8d000 rw-p 00000000 00:00 0 
7fbfd5c00000-7fbfd5d00000 rw-p 00000000 00:00 0 
7fbfd5d0e000-7fbfd5d3e000 rwxp 00000000 00:00 0 
7fbfd7f2a000-7fbfd7f2e000 rw-p 00000000 00:00 0 
7fbfd8b8e000-7fbfd8b92000 rw-p 00000000 00:00 0 
7fbfd9be0000-7fbfd9be2000 rw-p 00000000 00:00 0 
7fbfd9e00000-7fbfd9e01000 rw-p 00000000 00:00 0 
7fbfdbe81000-7fbfdbe82000 rw-p 00000000 00:00 0 
7fbfdc6ac000-7fbfdc6ae000 rw-p 00000000 00:00 0 
7fbfdcf07000-7fbfdcf09000 rw-p 00000000 00:00 0 
7fbfdd20b000-7fbfdd20d000 rw-p 00000000 00:00 0 
7fbfdd89e000-7fbfdd89f000 rw-p 00000000 00:00 0 
7fbfde597000-7fbfde598000 rw-p 00000000 00:00 0 
7fbfe1e0d000-7fbfe1f9c000 rw-p 00000000 00:00 0 
7fbfe2427000-7fbfe2428000 rw-p 00000000 00:00 0 
7fbfe2b87000-7fbfe2b89000 rw-p 00000000 00:00 0 
7fbfe35fd000-7fbfe3700000 rw-p 00000000 00:00 0 
7fbfe3701000-7fbfe3711000 rwxp 00000000 00:00 0 
7fbfe3d64000-7fbfe3d69000 rw-p 00000000 00:00 0 
7fbfe3f82000-7fbfe3f86000 rw-p 00000000 00:00 0 
7fbfe4579000-7fbfe458e000 rw-p 00000000 00:00 0 
7fbfe47b7000-7fbfe47b8000 rw-p 00000000 00:00 0 
7fbfe47b8000-7fbfe47b9000 rwxp 00000000 00:00 0 
7fbfe47d7000-7fbfe47e7000 rwxp 00000000 00:00 0 
7fbfe4883000-7fbfe4989000 rw-p 00000000 00:00 0 
7fbfe49b2000-7fbfe49b4000 rw-p 00000000 00:00 0 

It's about 35% out of 1300+ mappings that Firefox uses.

It is likely that the ---p mappings (about 40 of them) are guard pages.

How do I tell what the remaining anonymous areas are about?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:26             ` Ingo Molnar
@ 2013-07-12  9:38               ` Pekka Enberg
  2013-07-12  9:45                 ` Ingo Molnar
  0 siblings, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  9:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 12:26 PM, Ingo Molnar <mingo@kernel.org> wrote:
> Well, the JIT profiling case is really special - there we are constructing
> code and a symbol table on the fly. Talking to perf via a temporary file
> sounds unavoidable (and thus proper), because symbol information on that
> level is not something the kernel knows (or should know) about.
>
> I was arguing primarily in the context of the original patch: naming
> allocator heaps. Today the kernel makes a few educated guesses about what
> each memory area is about, in /proc/*/maps:
>
>  34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
>  34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
>  34511b2000-34511b7000 rw-p 00000000 00:00 0
>  7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
>  7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0
>  7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0
>  7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
>  7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
>  ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
>
> ... but on any larger app there's lots of anon mmap areas that are ...
> anonymous! ;-) User-space could help out a bit by naming them. It's not
> like there's many heaps, so the performance overhead aspect is minimal.
>
> In the JIT case we have something different, a 'file IO' abstraction
> really: the JIT is generating (writing) new code and associated symbol
> records. So using temporary files there is natural and proper and most of
> the disadvantages I list don't apply because the sheer volume of new code
> generated dilutes the overhead of open()/close(), plus we do need some
> space for those symbols so a JIT cannot really expect to be able to run in
> a pure readonly environment.
>
> In the allocator/heap case we have a _memory_ abstraction it's just that
> we also want to name the heap minimally.
>
> For any finer than vma granularity user-space attributes the kernel cannot
> help much, it does not know (and probably should not know) about all
> user-space data structures.
>
> Right now I don't see any good way to merge the two. (might be due to lack
> of imagination)

I have no trouble with the imagination part but you make a strong point about
the kernel not helping at finer granularity than vma anyway.

The current functionality is already quite helpful for VMs as well. We could
annotate the different GC and JIT regions and make perf more human-friendly
by default.

                                Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:14             ` Peter Zijlstra
@ 2013-07-12  9:28               ` Ingo Molnar
  0 siblings, 0 replies; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Peter Zijlstra <peterz@infradead.org> wrote:

> > Yeah, I could see that working. It doesn't solve the problems Ingo 
> > mentioned which are also important, though.
> 
> Nothing I've yet seen would do that. Its intrinsic to the fact that we 
> want 'anonymous' text tied to a process instance but require part of 
> that text (symbol information at the very least) to be available after 
> the process instance.
> 
> That are two contradictory requirements. You cannot preserve and not 
> preserve at the same time.
> 
> And pushing the symbol info into the kernel isn't going to fix that 
> either.

I fully agree with you in the JIT case.

I was arguing the utility of the original, somewhat limited usecase: 
minimally naming allocator areas/heaps, on a high level.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:15             ` Ingo Molnar
@ 2013-07-12  9:27               ` Peter Zijlstra
  2013-07-12  9:40                 ` Ingo Molnar
  0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 11:15:06AM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > We need those files anyway.. The current proposal is that the entire VMA 
> > has a single userspace pointer in it. Or rather a 64bit value.
> 
> Yes but accessible via /proc/<PID>/mem or so?

*shudder*.. yes. But you're again opening two files. The only advantage of this
over userspace writing its own files is that the kernel cleans things up for
you.

However from what I understood android runs apps as individual users, and I
think we can do per user tmpfs mounts. So app dies, user exits, mount goes
*poof*.

> I was thinking about it in the context of its original purpose: naming 
> heap areas, which are pretty anonymous right now - /proc/*/maps is full
> of mystery ranges today.

It is.. although I've myself never had trouble with that. Most every memory
debugging that I've used/written over the past two decades was adequately able
to identify memory regions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:04           ` Pekka Enberg
  2013-07-12  9:14             ` Peter Zijlstra
@ 2013-07-12  9:26             ` Ingo Molnar
  2013-07-12  9:38               ` Pekka Enberg
  1 sibling, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT


* Pekka Enberg <penberg@kernel.org> wrote:

> > Once the DSO is full -- equal to your previous anon-exec region being 
> > full, you simply mmap a new DSO.
> >
> > Wouldn't that work?
> 
> Okay and then whenever 'perf top' sees a non-mapped IP it reloads the 
> DSO (if it has changed)?
> 
> Yeah, I could see that working. It doesn't solve the problems Ingo 
> mentioned which are also important, though.

Well, the JIT profiling case is really special - there we are constructing 
code and a symbol table on the fly. Talking to perf via a temporary file 
sounds unavoidable (and thus proper), because symbol information on that 
level is not something the kernel knows (or should know) about.

I was arguing primarily in the context of the original patch: naming 
allocator heaps. Today the kernel makes a few educated guesses about what 
each memory area is about, in /proc/*/maps:

 34511ac000-34511b0000 r--p 001ac000 08:03 1706770                        /usr/lib64/libc-2.15.so
 34511b0000-34511b2000 rw-p 001b0000 08:03 1706770                        /usr/lib64/libc-2.15.so
 34511b2000-34511b7000 rw-p 00000000 00:00 0 
 7f5bdff94000-7f5be63c1000 r--p 00000000 08:03 1710237                    /usr/lib/locale/locale-archive
 7f5be63c1000-7f5be63c4000 rw-p 00000000 00:00 0 
 7f5be63d6000-7f5be63d7000 rw-p 00000000 00:00 0 
 7fff7677f000-7fff767a0000 rw-p 00000000 00:00 0                          [stack]
 7fff767dd000-7fff767df000 r-xp 00000000 00:00 0                          [vdso]
 ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]

... but on any larger app there's lots of anon mmap areas that are ... 
anonymous! ;-) User-space could help out a bit by naming them. It's not 
like there's many heaps, so the performance overhead aspect is minimal.

In the JIT case we have something different, a 'file IO' abstraction 
really: the JIT is generating (writing) new code and associated symbol 
records. So using temporary files there is natural and proper and most of 
the disadvantages I list don't apply because the sheer volume of new code 
generated dilutes the overhead of open()/close(), plus we do need some 
space for those symbols so a JIT cannot really expect to be able to run in 
a pure readonly environment.

In the allocator/heap case we have a _memory_ abstraction it's just that 
we also want to name the heap minimally.

For any finer than vma granularity user-space attributes the kernel cannot 
help much, it does not know (and probably should not know) about all 
user-space data structures.

Right now I don't see any good way to merge the two. (might be due to lack 
of imagination)

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:00           ` Peter Zijlstra
@ 2013-07-12  9:15             ` Ingo Molnar
  2013-07-12  9:27               ` Peter Zijlstra
  0 siblings, 1 reply; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  9:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 10:44:06AM +0200, Ingo Molnar wrote:
> > It makes tons of sense.
> > 
> > Just like we have a task's cmd-name it makes a lot of sense to name 
> > objects in a human readable fashion, to help debugging, instrumentation, 
> > performance analysis, etc.
> > 
> > Yes, in theory user-space could do all that. That's not the point: the 
> > point is to make it fast, easy enough and to have a central version (the 
> > kernel).
> > 
> > Doing it via temporary files has various disadvantages:
> 
> We need those files anyway.. The current proposal is that the entire VMA 
> has a single userspace pointer in it. Or rather a 64bit value.

Yes but accessible via /proc/<PID>/mem or so?

> > I guess the real question is not whether it's useful, I think it 
> > clearly is. The question should be: are there real downsides? Does the 
> > addition to the anon mmap field blow up the size of vma_struct by a 
> > pointer, or is there still space?
> 
> I don't see how the single u64 is useful at all for perf; you can have 
> at most one u64 per page; that's not nearly enough to put symbol 
> information in. Therefore we still require external files.

I was thinking about it in the context of its original purpose: naming 
heap areas, which are pretty anonymous right now - /proc/*/maps is full
of mystery ranges today.

It's indeed not good enough for finer grained structure.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  9:04           ` Pekka Enberg
@ 2013-07-12  9:14             ` Peter Zijlstra
  2013-07-12  9:28               ` Ingo Molnar
  2013-07-12  9:26             ` Ingo Molnar
  1 sibling, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:14 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On Fri, Jul 12, 2013 at 12:04:46PM +0300, Pekka Enberg wrote:
> On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> >Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
> >the text section; make the symbol table larger than you expect. Then write the
> >symbol name after you've jit'ed the text but before you use it.
> >
> >IIRC you once told me you never overwrite text but always append new symbols.
> >So you can basically fill the DSO with text/symbols use mmap memory writes.
> 
> I don't but I think Hotspot, for example, does recompile method. Dunno
> if it's a problem really, we could easily come up with a versioning
> scheme for the methods and teach perf to treat the different memory
> regions as the same method.

Anything that overwrites symbols is going to have issues with profiling;
there's really nothing we can do about that.

> On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> >Once the DSO is full -- equal to your previous anon-exec region being full,
> >you simply mmap a new DSO.
> >
> >Wouldn't that work?
> 
> Okay and then whenever 'perf top' sees a non-mapped IP it reloads the
> DSO (if it has changed)?

I suppose, yeah. There might be a few issues with determining if a mmap()
written file has changed though :/

> Yeah, I could see that working. It doesn't solve the problems Ingo mentioned
> which are also important, though.

Nothing I've yet seen would do that. It's intrinsic to the fact that we want
'anonymous' text tied to a process instance but require part of that text
(symbol information at the very least) to be available after the process
instance.

Those are two contradictory requirements. You cannot preserve and not preserve
at the same time.

And pushing the symbol info into the kernel isn't going to fix that either.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:55         ` Peter Zijlstra
@ 2013-07-12  9:04           ` Pekka Enberg
  2013-07-12  9:14             ` Peter Zijlstra
  2013-07-12  9:26             ` Ingo Molnar
  0 siblings, 2 replies; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  9:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, list@ebiederm.org:DOCUMENTATION,
	list@ebiederm.org:MEMORY MANAGEMENT

On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
> the text section; make the symbol table larger than you expect. Then write the
> symbol name after you've jit'ed the text but before you use it.
>
> IIRC you once told me you never overwrite text but always append new symbols.
> So you can basically fill the DSO with text/symbols use mmap memory writes.

I don't but I think Hotspot, for example, does recompile methods. Dunno
if it's a problem really, we could easily come up with a versioning
scheme for the methods and teach perf to treat the different memory
regions as the same method.

On 07/12/2013 11:55 AM, Peter Zijlstra wrote:
> Once the DSO is full -- equal to your previous anon-exec region being full,
> you simply mmap a new DSO.
>
> Wouldn't that work?

Okay and then whenever 'perf top' sees a non-mapped IP it reloads the
DSO (if it has changed)?

Yeah, I could see that working. It doesn't solve the problems Ingo 
mentioned which are also important, though.

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:55           ` Pekka Enberg
@ 2013-07-12  9:00           ` Peter Zijlstra
  2013-07-12  9:15             ` Ingo Molnar
  1 sibling, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  9:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds

On Fri, Jul 12, 2013 at 10:44:06AM +0200, Ingo Molnar wrote:
> It makes tons of sense.
> 
> Just like we have a task's cmd-name it makes a lot of sense to name 
> objects in a human readable fashion, to help debugging, instrumentation, 
> performance analysis, etc.
> 
> Yes, in theory user-space could do all that. That's not the point: the 
> point is to make it fast, easy enough and to have a central version (the 
> kernel).
> 
> Doing it via temporary files has various disadvantages:

We need those files anyway.. The current proposal is that the entire VMA has a
single userspace pointer in it. Or rather a 64bit value.

> I guess the real question is not whether it's useful, I think it clearly 
> is. The question should be: are there real downsides? Does the addition to 
> the anon mmap field blow up the size of vma_struct by a pointer, or is 
> there still space?

I don't see how the single u64 is useful at all for perf; you can have at most
one u64 per page; that's not nearly enough to put symbol information in.
Therefore we still require external files.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:44         ` Ingo Molnar
@ 2013-07-12  8:55           ` Pekka Enberg
  2013-07-12  9:00           ` Peter Zijlstra
  1 sibling, 0 replies; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  8:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Colin Cross, LKML, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:44 AM, Ingo Molnar <mingo@kernel.org> wrote:
> I guess the real question is not whether it's useful, I think it clearly
> is. The question should be: are there real downsides? Does the addition to
> the anon mmap field blow up the size of vma_struct by a pointer, or is
> there still space?

No, it's part of a union of 'struct vma_struct' in the current implementation
so the size doesn't change.

I'd still like to see something that's not restricted to page aligned memory
areas, though.

                                Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:21       ` Pekka Enberg
@ 2013-07-12  8:55         ` Peter Zijlstra
  2013-07-12  9:04           ` Pekka Enberg
  0 siblings, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:55 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:21:55AM +0300, Pekka Enberg wrote:
> On Fri, Jul 12, 2013 at 11:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > I also don't see it helping with the JIT stuff; you still need to write out a
> > file with symbol information, we still need to find the file. A less hacky
> > solution for the entire JIT thing is you writing a proper ELF-DSO and
> > mmap()'ing that :-)
> >
> > Storing a JIT specific userspace pointer in the VMA doesn't help with any of
> > that.
> 
> I'm thinking about corner cases like 'perf top' here. I don't see how we can
> write out a ELF-DSO because the JIT compiler can generate new symbols
> at any given time.

Mmap the file PROT_READ|PROT_WRITE|PROT_EXEC, map the _entire_ file, not just
the text section; make the symbol table larger than you expect. Then write the
symbol name after you've jit'ed the text but before you use it.

IIRC you once told me you never overwrite text but always append new symbols.
So you can basically fill the DSO with text/symbols using mmap memory writes.

Once the DSO is full -- equal to your previous anon-exec region being full,
you simply mmap a new DSO.

Wouldn't that work?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:17       ` Peter Zijlstra
@ 2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:55           ` Pekka Enberg
  2013-07-12  9:00           ` Peter Zijlstra
  0 siblings, 2 replies; 44+ messages in thread
From: Ingo Molnar @ 2013-07-12  8:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Pekka Enberg, Colin Cross, linux-kernel, Kyungmin Park,
	Christoph Hellwig, John Stultz, Eric W. Biederman, Dave Hansen,
	Rob Landley, Andrew Morton, Cyrill Gorcunov, David Rientjes,
	Davidlohr Bueso, Kees Cook, Al Viro, Hugh Dickins, Mel Gorman,
	Michel Lespinasse, Rik van Riel, Konstantin Khlebnikov,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, linux-doc, linux-mm,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 12, 2013 at 10:13:48AM +0200, Peter Zijlstra wrote:
> > On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> > > On 07/12/2013 05:34 AM, Colin Cross wrote:
> > > >Userspace processes often have multiple allocators that each do
> > > >anonymous mmaps to get memory.  When examining memory usage of
> > > >individual processes or systems as a whole, it is useful to be
> > > >able to break down the various heaps that were allocated by
> > > >each layer and examine their size, RSS, and physical memory
> > > >usage.
> > > >
> > > >This patch adds a user pointer to the shared union in
> > > >vm_area_struct that points to a null terminated string inside
> > > >the user process containing a name for the vma.  vmas that
> > > >point to the same address will be merged, but vmas that
> > > >point to equivalent strings at different addresses will
> > > >not be merged.
> > > >
> > > >Userspace can set the name for a region of memory by calling
> > > >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> > > >Setting the name to NULL clears it.
> > > >
> > > >The names of named anonymous vmas are shown in /proc/pid/maps
> > > >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> > > >that is only present for named vmas.  If the userspace pointer
> > > >is no longer valid all or part of the name will be replaced
> > > >with "<fault>".
> > > >
> > > >The idea to store a userspace pointer to reduce the complexity
> > > >within mm (at the expense of the complexity of reading
> > > >/proc/pid/mem) came from Dave Hansen.  This results in no
> > > >runtime overhead in the mm subsystem other than comparing
> > > >the anon_name pointers when considering vma merging.  The pointer
> > > >is stored in a union with fields that are only used on file-backed
> > > >mappings, so it does not increase memory usage.
> > > >
> > > >Signed-off-by: Colin Cross <ccross@android.com>
> > > 
> > > Ingo, PeterZ, is this something worthwhile for replacing our
> > > current JIT symbol hack with perf?
> > 
> > I really don't see the point of this stuff; in fact I intensely 
> > dislike it as I don't think this is something the kernel needs to do 
> > at all.
> > 
> > > Why can't these allocators Colin talks about use file maps and/or 
> > write their own meta-data to file? He is after all only interested in 
> > Android and they have complete control over the entire userspace 
> > stack.
> 
> In fact, nowhere in his entire Changelog does he explain why this needs 
> be in the kernel; _why_ can't userspace do this?
> 
> He needs to go change his allocators to use the new madv syscall anyway, 
> he might as well change them to write the stuff to a local file and be 
> done with it.
> 
> what gives?

It makes tons of sense.

Just like we have a task's cmd-name it makes a lot of sense to name 
objects in a human readable fashion, to help debugging, instrumentation, 
performance analysis, etc.

Yes, in theory user-space could do all that. That's not the point: the 
point is to make it fast, easy enough and to have a central version (the 
kernel).

Doing it via temporary files has various disadvantages:

 - many tools really like to be filesystem invariant (not touch any files 
   even in tmpfs, be able to run in a readonly environment, etc.)

 - the overhead of opening, writing to and closing a file is an order of
   magnitude larger than a single prctl() call. [I'd even argue for such
   user-space tags to be attached to do_mmap(), unfortunately the mmap
   system call argument space is already pretty full. ]

 - stray files hang around (even in tmpfs). Point of instrumentation is to 
   be non-intrusive and as fool-proof as possible. When we are
   debugging problems the last thing we want are extra problems
   and unreliable instrumentation introduced by a fragile temporary file
   solution...

 - user space also tends to get the security model of temporary files
   wrong. static linking makes the user-space version iteration of such
   facilities harder. etc. etc. - there's other disadvantages as well.

So using temporary files is an instrumentation and debugging nightmare 
really. A simple self-contained prctl() variant, with the info stored by 
the kernel is as convenient as it gets.

I guess the real question is not whether it's useful, I think it clearly 
is. The question should be: are there real downsides? Does the addition to 
the anon mmap field blow up the size of vma_struct by a pointer, or is 
there still space?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  8:17       ` Peter Zijlstra
@ 2013-07-12  8:21       ` Pekka Enberg
  2013-07-12  8:55         ` Peter Zijlstra
  1 sibling, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  8:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Colin Cross, LKML, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar,
	list@ebiederm.org:DOCUMENTATION <linux-doc@vger.kernel.org>,
	list@ebiederm.org:MEMORY MANAGEMENT <linux-mm@kvack.org>,

On Fri, Jul 12, 2013 at 11:13 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> I also don't see it helping with the JIT stuff; you still need to write out a
> file with symbol information, we still need to find the file. A less hacky
> solution for the entire JIT thing is you writing a proper ELF-DSO and
> mmap()'ing that :-)
>
> Storing a JIT specific userspace pointer in the VMA doesn't help with any of
> that.

I'm thinking about corner cases like 'perf top' here. I don't see how we can
write out a ELF-DSO because the JIT compiler can generate new symbols
at any given time.

That's what made me think it'd be best for the _kernel_ to know about the
symbols so that perf could take advantage of that as well.

                                    Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  8:13     ` Peter Zijlstra
@ 2013-07-12  8:17       ` Peter Zijlstra
  2013-07-12  8:44         ` Ingo Molnar
  2013-07-12  8:21       ` Pekka Enberg
  1 sibling, 1 reply; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:17 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, linux-kernel, Kyungmin Park, Christoph Hellwig,
	John Stultz, Eric W. Biederman, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On Fri, Jul 12, 2013 at 10:13:48AM +0200, Peter Zijlstra wrote:
> On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> > On 07/12/2013 05:34 AM, Colin Cross wrote:
> > >Userspace processes often have multiple allocators that each do
> > >anonymous mmaps to get memory.  When examining memory usage of
> > >individual processes or systems as a whole, it is useful to be
> > >able to break down the various heaps that were allocated by
> > >each layer and examine their size, RSS, and physical memory
> > >usage.
> > >
> > >This patch adds a user pointer to the shared union in
> > >vm_area_struct that points to a null terminated string inside
> > >the user process containing a name for the vma.  vmas that
> > >point to the same address will be merged, but vmas that
> > >point to equivalent strings at different addresses will
> > >not be merged.
> > >
> > >Userspace can set the name for a region of memory by calling
> > >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> > >Setting the name to NULL clears it.
> > >
> > >The names of named anonymous vmas are shown in /proc/pid/maps
> > >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> > >that is only present for named vmas.  If the userspace pointer
> > >is no longer valid all or part of the name will be replaced
> > >with "<fault>".
> > >
> > >The idea to store a userspace pointer to reduce the complexity
> > >within mm (at the expense of the complexity of reading
> > >/proc/pid/mem) came from Dave Hansen.  This results in no
> > >runtime overhead in the mm subsystem other than comparing
> > >the anon_name pointers when considering vma merging.  The pointer
> > > >is stored in a union with fields that are only used on file-backed
> > >mappings, so it does not increase memory usage.
> > >
> > >Signed-off-by: Colin Cross <ccross@android.com>
> > 
> > Ingo, PeterZ, is this something worthwhile for replacing our
> > current JIT symbol hack with perf?
> 
> I really don't see the point of this stuff; in fact I intensely dislike it as I
> don't think this is something the kernel needs to do at all.
> 
> > Why can't these allocators Colin talks about use file maps and/or write their
> own meta-data to file? He is after all only interested in Android and they have
> complete control over the entire userspace stack.

In fact, nowhere in his entire Changelog does he explain why this needs to be in
the kernel; _why_ can't userspace do this?

He needs to go change his allocators to use the new madv syscall anyway, he
might as well change them to write the stuff to a local file and be done with
it.

what gives?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  5:39   ` Pekka Enberg
@ 2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  8:17       ` Peter Zijlstra
  2013-07-12  8:21       ` Pekka Enberg
  0 siblings, 2 replies; 44+ messages in thread
From: Peter Zijlstra @ 2013-07-12  8:13 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Colin Cross, linux-kernel, Kyungmin Park, Christoph Hellwig,
	John Stultz, Eric W. Biederman, Dave Hansen, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Paul E. McKenney,
	David Howells, Arnd Bergmann, Dave Jones, Rafael J. Wysocki,
	Oleg Nesterov, Shaohua Li, Sasha Levin, KOSAKI Motohiro,
	Johannes Weiner, Ingo Molnar, linux-doc, linux-mm

On Fri, Jul 12, 2013 at 08:39:14AM +0300, Pekka Enberg wrote:
> On 07/12/2013 05:34 AM, Colin Cross wrote:
> >Userspace processes often have multiple allocators that each do
> >anonymous mmaps to get memory.  When examining memory usage of
> >individual processes or systems as a whole, it is useful to be
> >able to break down the various heaps that were allocated by
> >each layer and examine their size, RSS, and physical memory
> >usage.
> >
> >This patch adds a user pointer to the shared union in
> >vm_area_struct that points to a null terminated string inside
> >the user process containing a name for the vma.  vmas that
> >point to the same address will be merged, but vmas that
> >point to equivalent strings at different addresses will
> >not be merged.
> >
> >Userspace can set the name for a region of memory by calling
> >prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >Setting the name to NULL clears it.
> >
> >The names of named anonymous vmas are shown in /proc/pid/maps
> >as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> >that is only present for named vmas.  If the userspace pointer
> >is no longer valid all or part of the name will be replaced
> >with "<fault>".
> >
> >The idea to store a userspace pointer to reduce the complexity
> >within mm (at the expense of the complexity of reading
> >/proc/pid/mem) came from Dave Hansen.  This results in no
> >runtime overhead in the mm subsystem other than comparing
> >the anon_name pointers when considering vma merging.  The pointer
> >is stored in a union with fields that are only used on file-backed
> >mappings, so it does not increase memory usage.
> >
> >Signed-off-by: Colin Cross <ccross@android.com>
> 
> Ingo, PeterZ, is this something worthwhile for replacing our
> current JIT symbol hack with perf?

I really don't see the point of this stuff; in fact I intensely dislike it as I
don't think this is something the kernel needs to do at all.

Why can't these allocators Colin talks about use file maps and/or write their
own meta-data to file? He is after all only interested in Android and they have
complete control over the entire userspace stack.

I also don't see it helping with the JIT stuff; you still need to write out a
file with symbol information, we still need to find the file. A less hacky
solution for the entire JIT thing is you writing a proper ELF-DSO and
mmap()'ing that :-)

Storing a JIT specific userspace pointer in the VMA doesn't help with any of
that.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  6:18     ` Colin Cross
@ 2013-07-12  7:03       ` Pekka Enberg
  0 siblings, 0 replies; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  7:03 UTC (permalink / raw)
  To: Colin Cross
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On 07/12/2013 09:18 AM, Colin Cross wrote:
> This operates on vmas, so it can only handle naming page aligned
> regions.  It would work fine to identify the regions that contain JIT
> code, but not to identify individual functions.

Right. The obvious question is: does this need to be attached to
VMAs or could it be a separate data structure that can be used for
both?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  6:36   ` Dave Hansen
@ 2013-07-12  6:42     ` Colin Cross
  0 siblings, 0 replies; 44+ messages in thread
From: Colin Cross @ 2013-07-12  6:42 UTC (permalink / raw)
  To: Dave Hansen
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On Thu, Jul 11, 2013 at 11:36 PM, Dave Hansen <dave.hansen@intel.com> wrote:
> On 07/11/2013 07:34 PM, Colin Cross wrote:
>> +             pages_pinned = get_user_pages(current, mm, page_start_vaddr,
>> +                             1, 0, 0, &page, NULL);
>> +             if (pages_pinned < 1) {
>> +                     seq_puts(m, "<fault>]");
>> +                     return;
>> +             }
>> +
>> +             kaddr = (const char *)kmap(page);
>> +             len = min(max_len, PAGE_SIZE - page_offset);
>> +             write_len = strnlen(kaddr + page_offset, len);
>> +             seq_write(m, kaddr + page_offset, write_len);
>> +             kunmap(page);
>> +             put_page(page);
>
> This looks a bit like access_process_vm()?  Can you perhaps use it here?

It's a lot like __access_remote_vm, and this pattern is repeated in
many other places in the kernel.  I didn't try to reuse any of them
because I wanted to stop reading at a null byte and __access_remote_vm
would read the full NAME_MAX every time.  I was also avoiding having
to allocate a NAME_MAX sized buffer to copy into, instead passing the
mapped user page directly to seq_write.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
  2013-07-12  5:43   ` Pekka Enberg
@ 2013-07-12  6:36   ` Dave Hansen
  2013-07-12  6:42     ` Colin Cross
  2013-07-14 14:11   ` Oleg Nesterov
  2013-07-14 14:17   ` Oleg Nesterov
  4 siblings, 1 reply; 44+ messages in thread
From: Dave Hansen @ 2013-07-12  6:36 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Pekka Enberg, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/11/2013 07:34 PM, Colin Cross wrote:
> +		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
> +				1, 0, 0, &page, NULL);
> +		if (pages_pinned < 1) {
> +			seq_puts(m, "<fault>]");
> +			return;
> +		}
> +
> +		kaddr = (const char *)kmap(page);
> +		len = min(max_len, PAGE_SIZE - page_offset);
> +		write_len = strnlen(kaddr + page_offset, len);
> +		seq_write(m, kaddr + page_offset, write_len);
> +		kunmap(page);
> +		put_page(page);

This looks a bit like access_process_vm()?  Can you perhaps use it here?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  5:43   ` Pekka Enberg
@ 2013-07-12  6:18     ` Colin Cross
  2013-07-12  7:03       ` Pekka Enberg
  0 siblings, 1 reply; 44+ messages in thread
From: Colin Cross @ 2013-07-12  6:18 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: lkml, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	Linux-MM

On Thu, Jul 11, 2013 at 10:43 PM, Pekka Enberg <penberg@kernel.org> wrote:
> On 07/12/2013 05:34 AM, Colin Cross wrote:
>>
>> Userspace processes often have multiple allocators that each do
>> anonymous mmaps to get memory.  When examining memory usage of
>> individual processes or systems as a whole, it is useful to be
>> able to break down the various heaps that were allocated by
>> each layer and examine their size, RSS, and physical memory
>> usage.
>>
>> This patch adds a user pointer to the shared union in
>> vm_area_struct that points to a null terminated string inside
>> the user process containing a name for the vma.  vmas that
>> point to the same address will be merged, but vmas that
>> point to equivalent strings at different addresses will
>> not be merged.
>>
>> Userspace can set the name for a region of memory by calling
>> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>> Setting the name to NULL clears it.
>>
>> The names of named anonymous vmas are shown in /proc/pid/maps
>> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
>> that is only present for named vmas.  If the userspace pointer
>> is no longer valid all or part of the name will be replaced
>> with "<fault>".
>>
>> The idea to store a userspace pointer to reduce the complexity
>> within mm (at the expense of the complexity of reading
>> /proc/pid/mem) came from Dave Hansen.  This results in no
>> runtime overhead in the mm subsystem other than comparing
>> the anon_name pointers when considering vma merging.  The pointer
>> is stored in a union with fields that are only used on file-backed
>> mappings, so it does not increase memory usage.
>>
>> Signed-off-by: Colin Cross <ccross@android.com>
>
>
> So how does this perform if I do prctl(PR_SET_VMA_ANON_NAME)
> for thousands of relatively small (max 1 KB) JIT generated
> functions? Will we run into MM problems because the VMAs are
> not mergeable?

This operates on vmas, so it can only handle naming page aligned
regions.  It would work fine to identify the regions that contain JIT
code, but not to identify individual functions.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
@ 2013-07-12  5:43   ` Pekka Enberg
  2013-07-12  6:18     ` Colin Cross
  2013-07-12  6:36   ` Dave Hansen
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  5:43 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/12/2013 05:34 AM, Colin Cross wrote:
> Userspace processes often have multiple allocators that each do
> anonymous mmaps to get memory.  When examining memory usage of
> individual processes or systems as a whole, it is useful to be
> able to break down the various heaps that were allocated by
> each layer and examine their size, RSS, and physical memory
> usage.
>
> This patch adds a user pointer to the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process containing a name for the vma.  vmas that
> point to the same address will be merged, but vmas that
> point to equivalent strings at different addresses will
> not be merged.
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The names of named anonymous vmas are shown in /proc/pid/maps
> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> that is only present for named vmas.  If the userspace pointer
> is no longer valid all or part of the name will be replaced
> with "<fault>".
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>

So how does this perform if I do prctl(PR_SET_VMA_ANON_NAME)
for thousands of relatively small (max 1 KB) JIT generated
functions? Will we run into MM problems because the VMAs are
not mergeable?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
@ 2013-07-12  5:39   ` Pekka Enberg
  2013-07-12  8:13     ` Peter Zijlstra
  2013-07-12  5:43   ` Pekka Enberg
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 44+ messages in thread
From: Pekka Enberg @ 2013-07-12  5:39 UTC (permalink / raw)
  To: Colin Cross
  Cc: linux-kernel, Kyungmin Park, Christoph Hellwig, John Stultz,
	Eric W. Biederman, Dave Hansen, Rob Landley, Andrew Morton,
	Cyrill Gorcunov, David Rientjes, Davidlohr Bueso, Kees Cook,
	Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

On 07/12/2013 05:34 AM, Colin Cross wrote:
> Userspace processes often have multiple allocators that each do
> anonymous mmaps to get memory.  When examining memory usage of
> individual processes or systems as a whole, it is useful to be
> able to break down the various heaps that were allocated by
> each layer and examine their size, RSS, and physical memory
> usage.
>
> This patch adds a user pointer to the shared union in
> vm_area_struct that points to a null terminated string inside
> the user process containing a name for the vma.  vmas that
> point to the same address will be merged, but vmas that
> point to equivalent strings at different addresses will
> not be merged.
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> Setting the name to NULL clears it.
>
> The names of named anonymous vmas are shown in /proc/pid/maps
> as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
> that is only present for named vmas.  If the userspace pointer
> is no longer valid all or part of the name will be replaced
> with "<fault>".
>
> The idea to store a userspace pointer to reduce the complexity
> within mm (at the expense of the complexity of reading
> /proc/pid/mem) came from Dave Hansen.  This results in no
> runtime overhead in the mm subsystem other than comparing
> the anon_name pointers when considering vma merging.  The pointer
> is stored in a union with fields that are only used on file-backed
> mappings, so it does not increase memory usage.
>
> Signed-off-by: Colin Cross <ccross@android.com>

Ingo, PeterZ, is this something worthwhile for replacing our
current JIT symbol hack with perf?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH 2/2] mm: add a field to store names for private anonymous memory
  2013-07-12  2:34 [PATCH 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
@ 2013-07-12  2:34 ` Colin Cross
  2013-07-12  5:39   ` Pekka Enberg
                     ` (4 more replies)
  0 siblings, 5 replies; 44+ messages in thread
From: Colin Cross @ 2013-07-12  2:34 UTC (permalink / raw)
  To: linux-kernel
  Cc: Kyungmin Park, Christoph Hellwig, John Stultz, Eric W. Biederman,
	Pekka Enberg, Dave Hansen, Colin Cross, Rob Landley,
	Andrew Morton, Cyrill Gorcunov, David Rientjes, Davidlohr Bueso,
	Kees Cook, Al Viro, Hugh Dickins, Mel Gorman, Michel Lespinasse,
	Rik van Riel, Konstantin Khlebnikov, Peter Zijlstra,
	Paul E. McKenney, David Howells, Arnd Bergmann, Dave Jones,
	Rafael J. Wysocki, Oleg Nesterov, Shaohua Li, Sasha Levin,
	KOSAKI Motohiro, Johannes Weiner, Ingo Molnar, linux-doc,
	linux-mm

Userspace processes often have multiple allocators that each do
anonymous mmaps to get memory.  When examining memory usage of
individual processes or systems as a whole, it is useful to be
able to break down the various heaps that were allocated by
each layer and examine their size, RSS, and physical memory
usage.

This patch adds a user pointer to the shared union in
vm_area_struct that points to a null terminated string inside
the user process containing a name for the vma.  vmas that
point to the same address will be merged, but vmas that
point to equivalent strings at different addresses will
not be merged.

Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it.

The names of named anonymous vmas are shown in /proc/pid/maps
as [anon:<name>] and in /proc/pid/smaps in a new "Name" field
that is only present for named vmas.  If the userspace pointer
is no longer valid all or part of the name will be replaced
with "<fault>".

The idea to store a userspace pointer to reduce the complexity
within mm (at the expense of the complexity of reading
/proc/pid/mem) came from Dave Hansen.  This results in no
runtime overhead in the mm subsystem other than comparing
the anon_name pointers when considering vma merging.  The pointer
is stored in a union with fields that are only used on file-backed
mappings, so it does not increase memory usage.

Signed-off-by: Colin Cross <ccross@android.com>
---
 Documentation/filesystems/proc.txt |  6 ++++
 fs/proc/task_mmu.c                 | 62 ++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h                 |  5 ++-
 include/linux/mm_types.h           | 15 +++++++++
 include/uapi/linux/prctl.h         |  3 ++
 kernel/sys.c                       | 24 +++++++++++++++
 mm/madvise.c                       | 56 +++++++++++++++++++++++++++++++---
 mm/mempolicy.c                     |  2 +-
 mm/mlock.c                         |  3 +-
 mm/mmap.c                          | 44 ++++++++++++++++-----------
 mm/mprotect.c                      |  3 +-
 11 files changed, 197 insertions(+), 26 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fd8d0d5..e0eb9d2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -369,6 +369,8 @@ is not associated with a file:
  [stack:1001]             = the stack of the thread with tid 1001
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
@@ -419,6 +421,7 @@ KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:              374 kB
 VmFlags: rd ex mr mw me de
+Name:           name from userspace
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -469,6 +472,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
 be vanished or the reverse -- new added.
 
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d8..de76be4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -90,6 +90,56 @@ static void pad_len_spaces(struct seq_file *m, int len)
 	seq_printf(m, "%*c", len, ' ');
 }
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+	const char __user *name = vma_get_anon_name(vma);
+	struct mm_struct *mm = vma->vm_mm;
+
+	unsigned long page_start_vaddr;
+	unsigned long page_offset;
+	unsigned long num_pages;
+	unsigned long max_len = NAME_MAX;
+	int i;
+
+	page_start_vaddr = (unsigned long)name & PAGE_MASK;
+	page_offset = (unsigned long)name - page_start_vaddr;
+	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+	seq_puts(m, "[anon:");
+
+	for (i = 0; i < num_pages; i++) {
+		int len;
+		int write_len;
+		const char *kaddr;
+		long pages_pinned;
+		struct page *page;
+
+		pages_pinned = get_user_pages(current, mm, page_start_vaddr,
+				1, 0, 0, &page, NULL);
+		if (pages_pinned < 1) {
+			seq_puts(m, "<fault>]");
+			return;
+		}
+
+		kaddr = (const char *)kmap(page);
+		len = min(max_len, PAGE_SIZE - page_offset);
+		write_len = strnlen(kaddr + page_offset, len);
+		seq_write(m, kaddr + page_offset, write_len);
+		kunmap(page);
+		put_page(page);
+
+		/* if strnlen hit a null terminator then we're done */
+		if (write_len != len)
+			break;
+
+		max_len -= len;
+		page_offset = 0;
+		page_start_vaddr += PAGE_SIZE;
+	}
+
+	seq_putc(m, ']');
+}
+
 #ifdef CONFIG_NUMA
 /*
  * These functions are for numa_maps but called in generic **maps seq_file
@@ -335,6 +385,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 				pad_len_spaces(m, len);
 				seq_printf(m, "[stack:%d]", tid);
 			}
+			goto done;
+		}
+
+		if (vma_get_anon_name(vma)) {
+			pad_len_spaces(m, len);
+			seq_print_vma_name(m, vma);
 		}
 	}
 
@@ -634,6 +690,12 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
 	show_smap_vma_flags(m, vma);
 
+	if (vma_get_anon_name(vma)) {
+		seq_puts(m, "Name:           ");
+		seq_print_vma_name(m, vma);
+		seq_putc(m, '\n');
+	}
+
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task->mm))
 			? vma->vm_start : 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bd5679d..60038ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1486,7 +1486,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *);
+	struct mempolicy *, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
 	struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1829,5 +1829,8 @@ void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+				unsigned long name_addr);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ace9a5f..875ba48 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -255,6 +255,10 @@ struct vm_area_struct {
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree, or
 	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * in the user process containing the name given to the vma, or NULL
+	 * if unnamed.
 	 */
 	union {
 		struct {
@@ -262,6 +266,7 @@ struct vm_area_struct {
 			unsigned long rb_subtree_last;
 		} linear;
 		struct list_head nonlinear;
+		const char __user *anon_name;
 	} shared;
 
 	/*
@@ -456,4 +461,14 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		return NULL;
+
+	return vma->shared.anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..063bf75 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
 
 #define PR_GET_TID_ADDRESS	40
 
+#define PR_SET_VMA		41
+# define PR_SET_VMA_ANON_NAME		0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 2bbd9a7..401852f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2099,6 +2099,27 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+		unsigned long len, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	int error;
+
+	down_write(&mm->mmap_sem);
+
+	switch (opt) {
+	case PR_SET_VMA_ANON_NAME:
+		error = madvise_set_anon_name(addr, len, arg);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2262,6 +2283,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
 		return current->no_new_privs ? 1 : 0;
+	case PR_SET_VMA:
+		error = prctl_set_vma(arg2, arg3, arg4, arg5);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/madvise.c b/mm/madvise.c
index b8820fd..b2f8738 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -44,20 +44,23 @@ static int madvise_need_mmap_write(int behavior)
  */
 static int madvise_update_vma(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev, unsigned long start,
-		     unsigned long end, unsigned long new_flags)
+		     unsigned long end, unsigned long new_flags,
+		     const char __user *new_anon_name)
 {
 	struct mm_struct * mm = vma->vm_mm;
 	pgoff_t pgoff;
 	int error;
 
-	if (new_flags == vma->vm_flags) {
+	if (new_flags == vma->vm_flags &&
+			new_anon_name == vma_get_anon_name(vma)) {
 		*prev = vma;
 		return 0;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-				vma->vm_file, pgoff, vma_policy(vma));
+				vma->vm_file, pgoff, vma_policy(vma),
+				new_anon_name);
 	if (*prev) {
 		vma = *prev;
 		goto success;
@@ -82,10 +85,30 @@ success:
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
 	vma->vm_flags = new_flags;
+	if (!vma->vm_file)
+		vma->shared.anon_name = new_anon_name;
 
 	return 0;
 }
 
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+		     struct vm_area_struct **prev,
+		     unsigned long start, unsigned long end,
+		     unsigned long name_addr)
+{
+	int error;
+
+	/* Only anonymous mappings can be named */
+	if (vma->vm_file)
+		return -EINVAL;
+
+	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+			(const char __user *)name_addr);
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	unsigned long end, struct mm_walk *walk)
@@ -352,7 +375,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		break;
 	}
 
-	error = madvise_update_vma(vma, prev, start, end, new_flags);
+	error = madvise_update_vma(vma, prev, start, end, new_flags,
+				vma_get_anon_name(vma));
 
 out:
 	if (error == -ENOMEM)
@@ -488,6 +512,30 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	return unmapped_error;
 }
 
+int madvise_set_anon_name(unsigned long start, unsigned long len_in,
+		unsigned long name_addr)
+{
+	unsigned long end;
+	unsigned long len;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	return madvise_walk_vmas(start, end, name_addr, madvise_vma_anon_name);
+}
+
 /*
  * The madvise(2) system call.
  *
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7431001..11db490 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -728,7 +728,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 				  vma->anon_vma, vma->vm_file, pgoff,
-				  new_pol);
+				  new_pol, vma->vm_name);
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7..33861c7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -287,7 +287,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
-			  vma->vm_file, pgoff, vma_policy(vma));
+			  vma->vm_file, pgoff, vma_policy(vma),
+			  vma_get_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..25abb88 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -893,7 +893,8 @@ again:			remove_next = 1 + (end > next->vm_end);
  * per-vma resources, so we don't attempt to merge those.
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
-			struct file *file, unsigned long vm_flags)
+			struct file *file, unsigned long vm_flags,
+			const char __user *anon_name)
 {
 	if (vma->vm_flags ^ vm_flags)
 		return 0;
@@ -901,6 +902,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
 		return 0;
 	if (vma->vm_ops && vma->vm_ops->close)
 		return 0;
+	if (vma_get_anon_name(vma) != anon_name)
+		return 0;
 	return 1;
 }
 
@@ -931,9 +934,10 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
  */
 static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -950,9 +954,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
  */
 static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
-	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
+	struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff,
+	const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags) &&
+	if (is_mergeable_vma(vma, file, vm_flags, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
@@ -963,9 +968,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -995,7 +1000,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 			struct vm_area_struct *prev, unsigned long addr,
 			unsigned long end, unsigned long vm_flags,
 		     	struct anon_vma *anon_vma, struct file *file,
-			pgoff_t pgoff, struct mempolicy *policy)
+			pgoff_t pgoff, struct mempolicy *policy,
+			const char __user *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1021,15 +1027,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (prev && prev->vm_end == addr &&
   			mpol_equal(vma_policy(prev), policy) &&
-			can_vma_merge_after(prev, vm_flags,
-						anon_vma, file, pgoff)) {
+			can_vma_merge_after(prev, vm_flags, anon_vma,
+						file, pgoff, anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
 		if (next && end == next->vm_start &&
 				mpol_equal(policy, vma_policy(next)) &&
-				can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen) &&
+				can_vma_merge_before(next, vm_flags, anon_vma,
+						file, pgoff+pglen, anon_name) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1049,8 +1055,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 	 */
 	if (next && end == next->vm_start &&
  			mpol_equal(policy, vma_policy(next)) &&
-			can_vma_merge_before(next, vm_flags,
-					anon_vma, file, pgoff+pglen)) {
+			can_vma_merge_before(next, vm_flags, anon_vma,
+					file, pgoff+pglen, anon_name)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = vma_adjust(prev, prev->vm_start,
 				addr, prev->vm_pgoff, NULL);
@@ -1519,7 +1525,8 @@ munmap_back:
 	/*
 	 * Can we just expand an old mapping?
 	 */
-	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+			NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2663,7 +2670,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL);
+					NULL, NULL, pgoff, NULL, NULL);
 	if (vma)
 		goto out;
 
@@ -2821,7 +2828,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_get_anon_name(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..94d50b7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -271,7 +271,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 */
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
-			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
+			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
+			vma_get_anon_name(vma));
 	if (*pprev) {
 		vma = *pprev;
 		goto success;
-- 
1.8.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2013-11-01  1:30 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-10-15  1:31 [PATCHv3 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
2013-10-15  1:31 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
2013-10-15 21:21   ` Andrew Morton
2013-10-15 21:32     ` Dave Hansen
2013-10-15 21:47   ` Colin Cross
2013-10-16  0:33   ` Minchan Kim
2013-10-16 20:00     ` Colin Cross
2013-10-16 20:34       ` Dave Hansen
2013-10-16 20:41         ` Colin Cross
2013-10-17  2:47       ` Minchan Kim
2013-10-30 21:15         ` Colin Cross
2013-11-01  1:30           ` Minchan Kim
  -- strict thread matches above, loose matches on Subject: below --
2013-07-12  2:34 [PATCH 1/2] mm: rearrange madvise code to allow for reuse Colin Cross
2013-07-12  2:34 ` [PATCH 2/2] mm: add a field to store names for private anonymous memory Colin Cross
2013-07-12  5:39   ` Pekka Enberg
2013-07-12  8:13     ` Peter Zijlstra
2013-07-12  8:17       ` Peter Zijlstra
2013-07-12  8:44         ` Ingo Molnar
2013-07-12  8:55           ` Pekka Enberg
2013-07-12  9:00           ` Peter Zijlstra
2013-07-12  9:15             ` Ingo Molnar
2013-07-12  9:27               ` Peter Zijlstra
2013-07-12  9:40                 ` Ingo Molnar
2013-07-12  9:49                   ` Peter Zijlstra
2013-07-12 10:01                     ` Ingo Molnar
2013-07-12 20:51                     ` Colin Cross
2013-09-26  1:24                       ` Colin Cross
2013-07-12  8:21       ` Pekka Enberg
2013-07-12  8:55         ` Peter Zijlstra
2013-07-12  9:04           ` Pekka Enberg
2013-07-12  9:14             ` Peter Zijlstra
2013-07-12  9:28               ` Ingo Molnar
2013-07-12  9:26             ` Ingo Molnar
2013-07-12  9:38               ` Pekka Enberg
2013-07-12  9:45                 ` Ingo Molnar
2013-07-12 10:09                   ` Peter Zijlstra
2013-07-12  5:43   ` Pekka Enberg
2013-07-12  6:18     ` Colin Cross
2013-07-12  7:03       ` Pekka Enberg
2013-07-12  6:36   ` Dave Hansen
2013-07-12  6:42     ` Colin Cross
2013-07-14 14:11   ` Oleg Nesterov
2013-07-14 19:27     ` Colin Cross
2013-07-14 14:17   ` Oleg Nesterov
2013-07-14 19:34     ` Colin Cross

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).