* [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-17 21:09 ` Shaohua Li
  0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-17 21:09 UTC (permalink / raw)
  To: linux-mm
  Cc: danielmicay, linux-api, Rik van Riel, Andrew Morton,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski

A similar patch was posted before but didn't get merged. I'd like to try
again in case there is more discussion.
http://marc.info/?l=linux-mm&m=141230769431688&w=2

mremap can be used to accelerate realloc. The problem is that mremap
punches a hole in the original VMA, which some memory allocators cannot
make use of. Jemalloc is an example: it manages memory in 4M chunks.
mremap'ing a range of a chunk punches a hole, which other mmap() syscalls
can then fill. The 4M chunk becomes fragmented and jemalloc can't handle
it.

This patch adds a new flag for mremap. With it, mremap will not punch a
hole: the page tables of the original vma are zapped in the same way, but
the vma itself stays in place. The original vma ends up looking like a
vma that has never been faulted in. The behavior of the new vma isn't
changed.

For a private vma, accessing the original vma causes a page fault, just
as if the addresses of the vma had never been accessed. So for anonymous
memory a new page or the zero page is faulted in. For a file mapping, a
new page is allocated and filled from the file for cow, or the page fault
uses the existing page cache.

For a shared vma, the original and new vmas map to the same file. We
could optimize this case by not zapping the original vma's page tables,
but this patch doesn't do that yet.

Since the original vma still exists with MREMAP_NOHOLE, the page fault
handler of a special vma might not be able to handle faults in the
mremap'd area. The patch therefore doesn't allow vmas with the
VM_PFNMAP|VM_MIXEDMAP flags to do a NOHOLE mremap.
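
A minimal userspace sketch (not part of the patch) of how an allocator's
realloc path might use the new flag. The MREMAP_NOHOLE value is taken from
the uapi change below; move_into_chunk() and the surrounding allocator logic
are hypothetical:

#define _GNU_SOURCE		/* for mremap() */
#include <stddef.h>
#include <sys/mman.h>

#ifndef MREMAP_NOHOLE
#define MREMAP_NOHOLE	4	/* value proposed by this patch */
#endif

/*
 * Move old_len bytes from old_ptr to a destination the allocator already
 * owns, without unmapping old_ptr's range from its chunk.
 */
static void *move_into_chunk(void *old_ptr, size_t old_len, void *new_ptr)
{
	void *ret = mremap(old_ptr, old_len, old_len,
			   MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
			   new_ptr);
	if (ret == MAP_FAILED)
		return NULL;	/* e.g. old kernel: fall back to memcpy */
	/*
	 * old_ptr's vma still exists with its page tables zapped, so the
	 * allocator keeps seeing one contiguous chunk instead of a hole.
	 */
	return ret;
}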

Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 include/uapi/linux/mman.h |  1 +
 mm/mremap.c               | 97 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index ade4acd..9ee9a15 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,6 +5,7 @@
 
 #define MREMAP_MAYMOVE	1
 #define MREMAP_FIXED	2
+#define MREMAP_NOHOLE	4
 
 #define OVERCOMMIT_GUESS		0
 #define OVERCOMMIT_ALWAYS		1
diff --git a/mm/mremap.c b/mm/mremap.c
index 38df67b..4771fd1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -234,7 +234,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 
 static unsigned long move_vma(struct vm_area_struct *vma,
 		unsigned long old_addr, unsigned long old_len,
-		unsigned long new_len, unsigned long new_addr, bool *locked)
+		unsigned long new_len, unsigned long new_addr, bool *locked,
+		bool nohole)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *new_vma;
@@ -290,7 +291,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT) {
+	if ((vm_flags & VM_ACCOUNT) && !nohole) {
 		vma->vm_flags &= ~VM_ACCOUNT;
 		excess = vma->vm_end - vma->vm_start - old_len;
 		if (old_addr > vma->vm_start &&
@@ -310,11 +311,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	hiwater_vm = mm->hiwater_vm;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
-	if (do_munmap(mm, old_addr, old_len) < 0) {
+	if (!nohole && do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+
+	if (nohole && (new_addr & ~PAGE_MASK)) {
+		/* caller will unaccount */
+		vma->vm_flags &= ~VM_ACCOUNT;
+		do_munmap(mm, old_addr, old_len);
+	}
+
 	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -332,14 +340,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	return new_addr;
 }
 
-static struct vm_area_struct *vma_to_resize(unsigned long addr,
-	unsigned long old_len, unsigned long new_len, unsigned long *p)
+static unsigned long validate_vma_and_charge(struct vm_area_struct *vma,
+	unsigned long addr,
+	unsigned long old_len, unsigned long new_len, unsigned long *p,
+	bool nohole)
 {
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma = find_vma(mm, addr);
-
-	if (!vma || vma->vm_start > addr)
-		goto Efault;
+	unsigned long diff;
 
 	if (is_vm_hugetlb_page(vma))
 		goto Einval;
@@ -348,6 +355,9 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	if (old_len > vma->vm_end - addr)
 		goto Efault;
 
+	if (nohole && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+		goto Einval;
+
 	/* Need to be careful about a growing mapping */
 	if (new_len > old_len) {
 		unsigned long pgoff;
@@ -360,39 +370,45 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 			goto Einval;
 	}
 
+	if (nohole)
+		diff = new_len;
+	else
+		diff = new_len - old_len;
+
 	if (vma->vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
 		locked = mm->locked_vm << PAGE_SHIFT;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
-		locked += new_len - old_len;
+		locked += diff;
 		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
 			goto Eagain;
 	}
 
-	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+	if (!may_expand_vm(mm, diff >> PAGE_SHIFT))
 		goto Enomem;
 
 	if (vma->vm_flags & VM_ACCOUNT) {
-		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+		unsigned long charged = diff >> PAGE_SHIFT;
 		if (security_vm_enough_memory_mm(mm, charged))
 			goto Efault;
 		*p = charged;
 	}
 
-	return vma;
+	return 0;
 
 Efault:	/* very odd choice for most of the cases, but... */
-	return ERR_PTR(-EFAULT);
+	return -EFAULT;
 Einval:
-	return ERR_PTR(-EINVAL);
+	return -EINVAL;
 Enomem:
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 Eagain:
-	return ERR_PTR(-EAGAIN);
+	return -EAGAIN;
 }
 
 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
-		unsigned long new_addr, unsigned long new_len, bool *locked)
+		unsigned long new_addr, unsigned long new_len, bool *locked,
+		bool nohole)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -420,17 +436,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 		goto out;
 
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len);
-		if (ret && old_len != new_len)
-			goto out;
+		if (!nohole) {
+			ret = do_munmap(mm, addr+new_len, old_len - new_len);
+			if (ret && old_len != new_len)
+				goto out;
+		}
 		old_len = new_len;
 	}
 
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
+	vma = find_vma(mm, addr);
+	if (!vma || vma->vm_start > addr) {
+		ret = -EFAULT;
 		goto out;
 	}
+	ret = validate_vma_and_charge(vma, addr, old_len, new_len, &charged,
+		nohole);
+	if (ret)
+		goto out;
 
 	map_flags = MAP_FIXED;
 	if (vma->vm_flags & VM_MAYSHARE)
@@ -442,7 +464,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if (ret & ~PAGE_MASK)
 		goto out1;
 
-	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
+	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, nohole);
 	if (!(ret & ~PAGE_MASK))
 		goto out;
 out1:
@@ -481,8 +503,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	bool nohole = flags & MREMAP_NOHOLE;
 
-	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_NOHOLE))
 		return ret;
 
 	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
@@ -506,7 +529,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
-				&locked);
+				&locked, nohole);
 		goto out;
 	}
 
@@ -526,9 +549,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	/*
 	 * Ok, we need to grow..
 	 */
-	vma = vma_to_resize(addr, old_len, new_len, &charged);
-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
+	vma = find_vma(mm, addr);
+	if (!vma || vma->vm_start > addr) {
+		ret = -EFAULT;
 		goto out;
 	}
 
@@ -539,6 +562,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		if (vma_expandable(vma, new_len - old_len)) {
 			int pages = (new_len - old_len) >> PAGE_SHIFT;
 
+			ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+				&charged, false);
+			if (ret) {
+				BUG_ON(charged != 0);
+				goto out;
+			}
 			if (vma_adjust(vma, vma->vm_start, addr + new_len,
 				       vma->vm_pgoff, NULL)) {
 				ret = -ENOMEM;
@@ -556,6 +585,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		}
 	}
 
+	ret = validate_vma_and_charge(vma, addr, old_len, new_len,
+		&charged, nohole);
+	if (ret)
+		goto out;
+
 	/*
 	 * We weren't able to just expand or shrink the area,
 	 * we need to create a new one and move it..
@@ -575,7 +609,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			goto out;
 		}
 
-		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
+		ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked,
+			nohole);
 	}
 out:
 	if (ret & ~PAGE_MASK)
-- 
1.8.1

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-18 22:31     ` Andrew Morton
  0 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-18 22:31 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski

On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:

> There was a similar patch posted before, but it doesn't get merged. I'd like
> to try again if there are more discussions.
> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> 
> mremap can be used to accelerate realloc. The problem is mremap will
> punch a hole in original VMA, which makes specific memory allocator
> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> chunks. mremap a range of the chunk will punch a hole, which other
> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> can't handle it.

Daniel's changelog had additional details regarding the userspace
allocators' behaviour.  It would be best to incorporate that into your
changelog.

Daniel also had microbenchmark testing results for glibc and jemalloc. 
Can you please do this?

I'm not seeing any testing results for tcmalloc and I'm not seeing
confirmation that this patch will be useful for tcmalloc.  Has anyone
tried it, or sought input from tcmalloc developers?

> This patch adds a new flag for mremap. With it, mremap will not punch the
> hole. page tables of original vma will be zapped in the same way, but
> vma is still there. That is original vma will look like a vma without
> pagefault. Behavior of new vma isn't changed.
> 
> For private vma, accessing original vma will cause
> page fault and just like the address of the vma has never been accessed.
> So for anonymous, new page/zero page will be fault in. For file mapping,
> new page will be allocated with file reading for cow, or pagefault will
> use existing page cache.
> 
> For shared vma, original and new vma will map to the same file. We can
> optimize this without zaping original vma's page table in this case, but
> this patch doesn't do it yet.
> 
> Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> for special vma might not able to handle pagefault for mremap'd area.
> The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> mremap.

At some point (preferably an early point) we'd like a manpage update
and a cc: to linux-man@vger.kernel.org please.

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19  5:08         ` Shaohua Li
  0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19  5:08 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski

On Wed, Mar 18, 2015 at 03:31:00PM -0700, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > There was a similar patch posted before, but it doesn't get merged. I'd like
> > to try again if there are more discussions.
> > http://marc.info/?l=linux-mm&m=141230769431688&w=2
> > 
> > mremap can be used to accelerate realloc. The problem is mremap will
> > punch a hole in original VMA, which makes specific memory allocator
> > unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > chunks. mremap a range of the chunk will punch a hole, which other
> > mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > can't handle it.
> 
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour.  It would be best to incorporate that into your
> changelog.

I'll extract some from his changelog in next post
 
> Daniel also had microbenchmark testing results for glibc and jemalloc. 
> Can you please do this?

I ran Daniel's microbenchmark too, and unsurprisingly the result is
similar:
glibc: 32.82
jemalloc: 70.35
jemalloc+mremap: 33.01
tcmalloc: 68.81

but tcmalloc doesn't support mremap currently, so I can't test it.
 
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc.  Has anyone
> tried it, or sought input from tcmalloc developers?
> 
> > This patch adds a new flag for mremap. With it, mremap will not punch the
> > hole. page tables of original vma will be zapped in the same way, but
> > vma is still there. That is original vma will look like a vma without
> > pagefault. Behavior of new vma isn't changed.
> > 
> > For private vma, accessing original vma will cause
> > page fault and just like the address of the vma has never been accessed.
> > So for anonymous, new page/zero page will be fault in. For file mapping,
> > new page will be allocated with file reading for cow, or pagefault will
> > use existing page cache.
> > 
> > For shared vma, original and new vma will map to the same file. We can
> > optimize this without zaping original vma's page table in this case, but
> > this patch doesn't do it yet.
> > 
> > Since with MREMAP_NOHOLE, original vma still exists. pagefault handler
> > for special vma might not able to handle pagefault for mremap'd area.
> > The patch doesn't allow vmas with VM_PFNMAP|VM_MIXEDMAP flags do NOHOLE
> > mremap.
> 
> At some point (preferably an early point) we'd like a manpage update
> and a cc: to linux-man@vger.kernel.org please.

ok, will add in next post.

Thanks,
Shaohua

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19  5:22             ` Andrew Morton
  0 siblings, 0 replies; 44+ messages in thread
From: Andrew Morton @ 2015-03-19  5:22 UTC (permalink / raw)
  To: Shaohua Li
  Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski

On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli@fb.com> wrote:

> > Daniel also had microbenchmark testing results for glibc and jemalloc. 
> > Can you please do this?
> 
> I run Daniel's microbenchmark too, and not surprise the result is
> similar:
> glibc: 32.82
> jemalloc: 70.35
> jemalloc+mremap: 33.01
> tcmalloc: 68.81
> 
> but tcmalloc doesn't support mremap currently, so I cant test it.

But Daniel's changelog implies strongly that tcmalloc would benefit
from his patch.  Was that inaccurate or is this a difference between
his patch and yours?

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19  5:34         ` Daniel Micay
  0 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-19  5:34 UTC (permalink / raw)
  To: Andrew Morton, Shaohua Li
  Cc: linux-mm, linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski,
	Aliaksey Kandratsenka

On 18/03/15 06:31 PM, Andrew Morton wrote:
> On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> 
>> There was a similar patch posted before, but it doesn't get merged. I'd like
>> to try again if there are more discussions.
>> http://marc.info/?l=linux-mm&m=141230769431688&w=2
>>
>> mremap can be used to accelerate realloc. The problem is mremap will
>> punch a hole in original VMA, which makes specific memory allocator
>> unable to utilize it. Jemalloc is an example. It manages memory in 4M
>> chunks. mremap a range of the chunk will punch a hole, which other
>> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
>> can't handle it.
> 
> Daniel's changelog had additional details regarding the userspace
> allocators' behaviour.  It would be best to incorporate that into your
> changelog.
>
> Daniel also had microbenchmark testing results for glibc and jemalloc. 
> Can you please do this?
> 
> I'm not seeing any testing results for tcmalloc and I'm not seeing
> confirmation that this patch will be useful for tcmalloc.  Has anyone
> tried it, or sought input from tcmalloc developers?

TCMalloc and jemalloc are currently equally slow in this benchmark, as
neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
the currently most active TCMalloc developer so they can give input
into whether this patch would let them use it.

#include <string.h>
#include <stdlib.h>

int main(void) {
  void *ptr = NULL;
  size_t old_size = 0;
  for (size_t size = 4 * 1024 * 1024; size < 1024 * 1024 * 1024; size *= 2) {
    ptr = realloc(ptr, size);
    if (!ptr) return 1;
    memset(ptr, 0xff, size - old_size);
    old_size = size;
  }
  free(ptr);
}

If an outer loop is wrapped around this, jemalloc's master branch will
at least be able to do in-place resizing for everything after the 1st
run, but that's much rarer in the real world where there are many users
of the allocator. The lack of mremap still ends up hurting a lot.

FWIW, jemalloc is now the default allocator on Android so there are an
increasing number of Linux machines unable to leverage mremap. It could
be worked around by attempting to use an mmap hint to get the memory
back, but that can fail as it's a race with the other threads, and that
leads to increased fragmentation over the long term.
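
A rough sketch of that hint-based workaround (reclaim_hole() and the
hole_addr/hole_len bookkeeping are hypothetical), which also shows where the
race bites:

#include <stddef.h>
#include <sys/mman.h>

/* Try to re-acquire the hole a plain mremap() left behind in a chunk. */
static int reclaim_hole(void *hole_addr, size_t hole_len)
{
	/* Hint only, no MAP_FIXED: we must not clobber whatever another
	 * thread may have mapped into the hole in the meantime. */
	void *p = mmap(hole_addr, hole_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return -1;
	if (p != hole_addr) {
		/* Lost the race: another mapping took the hole and the
		 * chunk stays fragmented. Give the stray mapping back. */
		munmap(p, hole_len);
		return -1;
	}
	return 0;	/* hole filled, the chunk is contiguous again */
}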

It's especially problematic if a large range of virtual memory is
reserved and divided up between per-CPU arenas for concurrency, but
only garbage collectors tend to do stuff like this at the moment. This
can still be dealt with by checking internal uses of mmap and returning
any memory from the reserved range to the right place, but it shouldn't
have to be that ugly.


* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-19 16:38                 ` Shaohua Li
  0 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-19 16:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-mm, danielmicay, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski

On Wed, Mar 18, 2015 at 10:22:46PM -0700, Andrew Morton wrote:
> On Wed, 18 Mar 2015 22:08:26 -0700 Shaohua Li <shli@fb.com> wrote:
> 
> > > Daniel also had microbenchmark testing results for glibc and jemalloc. 
> > > Can you please do this?
> > 
> > I run Daniel's microbenchmark too, and not surprise the result is
> > similar:
> > glibc: 32.82
> > jemalloc: 70.35
> > jemalloc+mremap: 33.01
> > tcmalloc: 68.81
> > 
> > but tcmalloc doesn't support mremap currently, so I cant test it.
> 
> But Daniel's changelog implies strongly that tcmalloc would benefit
> from his patch.  Was that inaccurate or is this a difference between
> his patch and yours?

There is no big difference, except that I fixed some issues. Daniel didn't
post data for tcmalloc; I suppose mremap could potentially make tcmalloc
faster too, but Daniel can clarify.

Thanks,
Shaohua

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-19  5:34         ` Daniel Micay
  (?)
@ 2015-03-22  6:06         ` Aliaksey Kandratsenka
       [not found]           ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2015-03-23  5:17           ` Shaohua Li
  -1 siblings, 2 replies; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-22  6:06 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com> wrote:
>
> On 18/03/15 06:31 PM, Andrew Morton wrote:
> > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> >
> >> There was a similar patch posted before, but it doesn't get merged. I'd like
> >> to try again if there are more discussions.
> >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> >>
> >> mremap can be used to accelerate realloc. The problem is mremap will
> >> punch a hole in original VMA, which makes specific memory allocator
> >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> >> chunks. mremap a range of the chunk will punch a hole, which other
> >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> >> can't handle it.
> >
> > Daniel's changelog had additional details regarding the userspace
> > allocators' behaviour.  It would be best to incorporate that into your
> > changelog.
> >
> > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > Can you please do this?
> >
> > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > confirmation that this patch will be useful for tcmalloc.  Has anyone
> > tried it, or sought input from tcmalloc developers?
>
> TCMalloc and jemalloc are currently equally slow in this benchmark, as
> neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> the currently most active TCMalloc developer so they can give input
> into whether this patch would let them use it.


Hi.

Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).

Yes, that might be a useful feature. (Assuming I understood it correctly) I
believe tcmalloc would likely use:

mremap(old_ptr, move_size, move_size,
       MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
       new_ptr);

as an optimized equivalent of:

memcpy(new_ptr, old_ptr, move_size);
madvise(old_ptr, move_size, MADV_DONTNEED);

And btw I find the MREMAP_RETAIN name from the original patch to be slightly
more intuitive than MREMAP_NOHOLE. In my humble opinion the latter name does
not reflect the semantics of this feature at all (assuming of course I
correctly understood what the patch does).

I do have a couple of questions about this approach, however. Please feel
free to educate me on them.

a) what is the smallest size where mremap is going to be faster?

My initial thinking was that we'd likely use mremap in all cases where we
know that touching the destination would cause minor page faults (i.e. when
the destination chunk was MADV_DONTNEED-ed or is a brand new mapping). And
then also always when the size is large enough, i.e. because "teleporting" a
large count of pages is likely to be faster than copying them.

But now I realize that it is more interesting than that, i.e. because, as
Daniel pointed out, mremap holds mmap_sem exclusively, while page faults hold
it for read. That could be optimized of course, either by a separate
"teleport ptes" syscall (again, as noted by Daniel), or by having mremap drop
mmap_sem for write and retake it for read for the "moving pages" part of the
work. Being not really familiar with kernel code I have no idea whether
that's doable or not, but it looks like it might be quite important.

Another aspect where I am similarly illiterate is the performance effect of
TLB flushes needed for such an operation.

We can certainly experiment and find that limit. But if the mremap threshold
is going to be large, then perhaps this kernel feature is not as useful as we
may hope.
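
Purely as an illustration of the policy question in (a), here is a
hypothetical sketch of how an allocator might pick between the two paths;
REMAP_THRESHOLD and dst_needs_faulting() are made up, and MREMAP_NOHOLE's
value comes from the patch in this thread:

#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>

#ifndef MREMAP_NOHOLE
#define MREMAP_NOHOLE	4	/* value proposed in this thread */
#endif

#define REMAP_THRESHOLD	(256UL * 1024)	/* made-up cutoff, needs measuring */

/* Made up: true if dst was MADV_DONTNEED-ed or never touched, so copying
 * into it would take minor page faults anyway. */
static int dst_needs_faulting(void *dst) { (void)dst; return 1; }

static void move_allocation(void *dst, void *src, size_t len)
{
	if (len >= REMAP_THRESHOLD || dst_needs_faulting(dst)) {
		/* Teleport the pages instead of copying them. */
		if (mremap(src, len, len,
			   MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
			   dst) != MAP_FAILED)
			return;
		/* Fall back to the copy on failure (e.g. old kernel). */
	}
	memcpy(dst, src, len);
	madvise(src, len, MADV_DONTNEED);
}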

b) is that optimization worth having at all?

After all, memcpy is actually known to be fast. I understand that copying
memory in user space can be slowed down by minor page faults (results below
seem to confirm that). But this is something where either the allocator may
retain populated pages a bit longer or the kernel could help, e.g. maybe by
exposing something similar to MAP_POPULATE in madvise, or even doing some
safe combination of madvise and MAP_UNINITIALIZED.

I've played with Daniel's original benchmark (copied from
http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
modifications:

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/mman.h>

int main(int argc, char **argv)
{
        if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
                int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
                if (rv) {
                        perror("mlockall");
                        abort();
                }
                puts("mlocked!");
        }

        for (size_t i = 0; i < 64; i++) {
                void *ptr = NULL;
                size_t old_size = 0;
                for (size_t size = 4; size < (1 << 30); size *= 2) {
                        /*
                         * void *hole = malloc(1 << 20);
                         * if (!hole) {
                         *      perror("malloc");
                         *      abort();
                         * }
                         */
                        ptr = realloc(ptr, size);
                        if (!ptr) {
                                perror("realloc");
                                abort();
                        }
                        /* free(hole); */
                        memset((char *)ptr + old_size, 0xff, size - old_size);
                        old_size = size;
                }
                free(ptr);
        }
}

I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
important applications or not. It can be argued that apps that care about
such large vectors can do mremap themselves.

On the other hand, I believe that this micro benchmark could be plausibly
changed to grow the vector by a smaller factor (i.e. see
https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
And with a smaller growth factor, it seems reasonable to expect larger
overhead from memcpy and smaller overhead from mremap, and thus to favor
mremap more.

And I confirm that with all default settings tcmalloc and jemalloc lose to
glibc. Also, notably, a recent dev build of jemalloc (what is going to be 4.0
AFAIK) actually matches or exceeds glibc's speed, despite still not doing
mremap. Apparently it is smarter about avoiding moving the allocation for
those realloc-s, and it was even able to resist my attempt to force it to
move the allocation. I haven't investigated why. Note that I built it a
couple of weeks or so ago from the dev branch, so it might simply have bugs.

Results also vary greatly depending on the transparent huge pages setting.
Here's what I've got:

allocator |   mode    | time  | sys time | pgfaults | extra
----------+-----------+-------+----------+----------+-------------------------------
glibc     |           | 10.75 |     8.44 |  8388770 |
glibc     |    thp    |  5.67 |     3.44 |   310882 |
glibc     |   mlock   | 13.22 |     9.41 |  8388821 |
glibc     | thp+mlock |  8.43 |     4.63 |   310933 |
tcmalloc  |           | 11.46 |     2.00 |  2104826 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |    thp    | 10.61 |     0.89 |   386206 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |   mlock   | 10.11 |     0.27 |   264721 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  | thp+mlock | 10.28 |     0.17 |    46011 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
tcmalloc  |           | 23.63 |    17.16 | 16770107 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  |    thp    | 11.82 |     5.14 |   352477 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  |   mlock   | 10.10 |     0.28 |   264724 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
tcmalloc  | thp+mlock | 10.30 |     0.17 |    49168 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
jemalloc1 |           | 23.71 |    17.33 | 16744572 |
jemalloc1 |    thp    | 11.65 |     4.68 |    64988 |
jemalloc1 |   mlock   | 10.13 |     0.29 |   263305 |
jemalloc1 | thp+mlock | 10.05 |     0.17 |    50217 |
jemalloc2 |           | 10.87 |     8.64 |  8521796 |
jemalloc2 |    thp    |  4.64 |     2.32 |    56060 |
jemalloc2 |   mlock   |  4.22 |     0.28 |   263181 |
jemalloc2 | thp+mlock |  4.12 |     0.19 |    50411 |
----------+-----------+-------+----------+----------+-------------------------------

NOTE: the usual disclaimer applies about the possibility of screwing
something up and getting invalid benchmark results without being able to see
it. I apologize in advance.

NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is a
home-built snapshot of the upcoming jemalloc 4.0.

NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (the default since 2.4) makes tcmalloc
MADV_DONTNEED large free blocks immediately, as opposed to doing so less
often with a setting of "false". And it makes a big difference in page fault
counts and thus in runtime.

Another notable thing is how mlock effectively disables MADV_DONTNEED for
jemalloc{1,2} and tcmalloc, lowers the page fault count and thus improves
runtime. It can be seen that tcmalloc+mlock on the thp-less configuration is
slightly better in runtime than glibc. The latter spends a ton of time in the
kernel, probably handling minor page faults, and the former burns cpu in user
space doing memcpy-s. So "tons of memcpys" seems to be competitive with what
glibc is doing in this benchmark.

THP changes things, however: minor page faults apparently become a lot
cheaper, which makes the glibc case a lot faster than even the tcmalloc+mlock
case. So in the THP case, the cost of page faults is smaller than the cost of
a large memcpy.

So results are somewhat mixed, but overall I'm not sure that I'm able to see
a very convincing story for MREMAP_NOHOLE yet. However:

1) it is possible that I am missing something. If so, please, educate me.

2) if the kernel implements this API, I'm going to use it in tcmalloc.

P.S. benchmark results also seem to indicate that tcmalloc could do something
to explicitly enable THP and maybe better adapt to its presence. Perhaps with
some collaboration with the kernel, i.e. to prevent that famous delay-ful-ness
which causes people to disable THP.


* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-22  6:06         ` Aliaksey Kandratsenka
@ 2015-03-22  7:22               ` Daniel Micay
  2015-03-23  5:17           ` Shaohua Li
  1 sibling, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-22  7:22 UTC (permalink / raw)
  To: Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw

> Yes, that might be useful feature. (Assuming I understood it correctly)
> I believe
> tcmalloc would likely use:
> 
> mremap(old_ptr, move_size, move_size,
>        MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
>        new_ptr);
> 
> as optimized equivalent of:
> 
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);

Yeah, it's essentially an optimized memcpy for when you don't need the
source allocation anymore.

> a) what is the smallest size where mremap is going to be faster ?

There are probably a lot of variables here like the CPU design and the
speed of system calls (syscall auditing makes them much slower!) in
addition to the stuff you've pointed out.

> My initial thinking was that we'd likely use mremap in all cases where
> we know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
> always when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
> 
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate
> "teleport ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
> for write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.

I think it's doable but it would pessimize the case where the dest VMA
isn't reusable. It would need to optimistically take the reader lock to
find out and then drop it. However, userspace knows when this is surely
going to work and could give it a hint.

I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
case would be. It would be extremely specific though... they want the
kernel to move pages from a source VMA to a destination VMA where both
are anon/private with identical flags so only the reader lock is
necessary. On top of that, they really want to keep around as many
destination pages as possible, maybe by swapping as many as possible
back to the source.

That's *extremely* specific though and I now think the best way to get
there is by landing this feature and then extending it as necessary down
the road. An allocator may actually want to manage other kinds of
mappings itself and it would want the mmap_sem optimization to be an
optional hint.
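
Purely as illustration, the "ideal" interface described above might look
something like this. No such syscall exists; the name, flag and signature
are invented:

#include <stddef.h>

/* HYPOTHETICAL: no such syscall exists; this only sketches the "ideal"
 * allocator interface described above. Move the pages backing
 * [src, src + len) to [dst, dst + len), where both ranges are anonymous
 * private mappings with identical flags (so only the mmap_sem reader lock
 * would be needed), optionally moving existing destination pages back to
 * the source instead of dropping them. */
#define MOVE_PAGES_SWAP_BACK 0x1

int move_pages_between_vmas(void *dst, void *src, size_t len,
                            unsigned long flags);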

> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple
> weeks or so
> ago from dev branch, so it might simply have bugs.

I submitted patches teaching jemalloc to expand/shrink huge allocations
in-place, so it's hitting the in-place resize path after the initial
iteration on a repeated reallocation benchmark that's not doing any
other allocations.

In jemalloc, everything is allocated via naturally aligned chunks (4M
before, recently down to 256k in master) so if you want to block
in-place huge reallocation you'll either need to force a new non-huge
chunk to be allocated or make one that's at least as large as the chunk
size.
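
As a concrete illustration of blocking the in-place path in the benchmark
quoted earlier, one could keep a live allocation at least as large as the
chunk size across the realloc. This is an untested sketch, and whether it
actually defeats a given jemalloc version is not guaranteed:

#include <stdlib.h>

#define ASSUMED_CHUNK_SIZE (4UL << 20)   /* 4M, the older jemalloc default */

/* Grow old to new_size while trying to force the allocator to move it:
 * keeping a live allocation at least as large as the chunk size makes
 * in-place expansion of a huge allocation much less likely. */
void *realloc_forcing_move(void *old, size_t new_size)
{
        void *blocker = malloc(ASSUMED_CHUNK_SIZE);
        void *moved = realloc(old, new_size);
        free(blocker);
        return moved;
}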

I don't think in-place reallocation is very common in long-running
programs. It's probably more common now that jemalloc is experimenting
with first-fit for chunk/huge allocation rather than address-ordered
best-fit. The best-fit algorithm is designed to keep the opportunity for
in-place reallocation to a minimum, although address ordering does
counter it :).

> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts
> and thus
> on runtime.
> 
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what
> glibc is
> doing in this benchmark.

When I taught jemalloc to use the MREMAP_RETAIN flag it was getting
significant wins over glibc, so this might be caused by the time spent
managing metadata, etc.

> THP changes things however. Where apparently minor page faults become a lot
> cheaper. Which makes glibc case a lot faster than even tcmalloc+mlock
> case. So
> in THP case, cost of page faults is smaller than cost of large memcpy.
> 
> So results are somewhat mixed, but overall I'm not sure that I'm able to see
> very convincing story for MREMAP_HOLE yet. However:
> 
> 1) it is possible that I am missing something. If so, please, educate me.
> 
> 2) if kernel implements this API, I'm going to use it in tcmalloc.
> 
> P.S. benchmark results also seem to indicate that tcmalloc could do
> something to
> explicitly enable THP and maybe better adapt to it's presence. Perhaps
> with some
> collaboration with kernel, i.e. to prevent that famous delay-ful-ness which
> causes people to disable THP.

BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
madvise purging. The part where khugepaged assigns huge pages to dense
spans of pages is *great*. The part where the kernel hands out a huge
page for a fault in a 2M span can be awful. It causes the model
inside the allocator of uncommitted vs. committed pages to break down.

For example, the allocator might use 1M of a huge page and then start
purging. The purging will split it into 4k pages, so there will be 1M of
zeroed 4k pages that are considered purged by the allocator. Over time,
this can cripple purging. Search for "jemalloc huge pages" and you'll
find lots of horror stories about this.
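
A minimal way to reproduce that pattern, assuming THP is set to "always"
and a 2M huge page size:

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define HUGE_SZ (2UL << 20)   /* assumed THP size of 2M */

int main(void)
{
        /* Over-allocate so a 2M-aligned span can be carved out of the mapping. */
        char *raw = mmap(NULL, 2 * HUGE_SZ, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (raw == MAP_FAILED)
                return 1;
        char *span = (char *)(((uintptr_t)raw + HUGE_SZ - 1) & ~(HUGE_SZ - 1));

        /* With THP "always", touching the span can fault in a whole 2M huge
         * page even though the allocator only intends to use half of it. */
        memset(span, 0xff, HUGE_SZ / 2);

        /* Allocator-style purge of the half it never used: this splits the
         * huge page, and the allocator's picture of committed vs. purged
         * pages starts to drift from reality, as described above. */
        madvise(span + HUGE_SZ / 2, HUGE_SZ / 2, MADV_DONTNEED);
        return 0;
}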

I think a THP implementation that played well with purging would
need to drop the page fault heuristic and rely on a significantly better
khugepaged. This would mean faulting in a span of memory would no longer
be faster. Having a flag to populate a range with madvise would help a
lot though, since the allocator knows exactly how much it's going to
clobber with the memcpy. There will still be a threshold where mremap
gets significantly faster, but it would move it higher.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-22  6:06         ` Aliaksey Kandratsenka
       [not found]           ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-03-23  5:17           ` Shaohua Li
  2015-03-24  5:25             ` Aliaksey Kandratsenka
  1 sibling, 1 reply; 44+ messages in thread
From: Shaohua Li @ 2015-03-23  5:17 UTC (permalink / raw)
  To: Aliaksey Kandratsenka
  Cc: Daniel Micay, Andrew Morton, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

On Sat, Mar 21, 2015 at 11:06:14PM -0700, Aliaksey Kandratsenka wrote:
> On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com>
> wrote:
> >
> > On 18/03/15 06:31 PM, Andrew Morton wrote:
> > > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> > >
> > >> There was a similar patch posted before, but it doesn't get merged.
> I'd like
> > >> to try again if there are more discussions.
> > >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> > >>
> > >> mremap can be used to accelerate realloc. The problem is mremap will
> > >> punch a hole in original VMA, which makes specific memory allocator
> > >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > >> chunks. mremap a range of the chunk will punch a hole, which other
> > >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > >> can't handle it.
> > >
> > > Daniel's changelog had additional details regarding the userspace
> > > allocators' behaviour.  It would be best to incorporate that into your
> > > changelog.
> > >
> > > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > > Can you please do this?
> > >
> > > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > > confirmation that this patch will be useful for tcmalloc.  Has anyone
> > > tried it, or sought input from tcmalloc developers?
> >
> > TCMalloc and jemalloc are currently equally slow in this benchmark, as
> > neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> > the currently most active TCMalloc developer so they can give input
> > into whether this patch would let them use it.
> 
> 
> Hi.
> 
> Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).
> 
> Yes, that might be useful feature. (Assuming I understood it correctly) I
> believe
> tcmalloc would likely use:
> 
> mremap(old_ptr, move_size, move_size,
>        MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
>        new_ptr);
> 
> as optimized equivalent of:
> 
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);
> 
> And btw I find MREMAP_RETAIN name from original patch to be slightly more
> intuitive than MREMAP_NOHOLE. In my humble opinion the later name does not
> reflect semantic of this feature at all (assuming of course I correctly
> understood what the patch does).
> 
> I do have a couple of questions about this approach however. Please feel
> free to
> educate me on them.
> 
> a) what is the smallest size where mremap is going to be faster ?
> 
> My initial thinking was that we'd likely use mremap in all cases where we
> know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also always
> when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
> 
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate "teleport
> ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
> write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.

Is mmap_sem contended in your workload? Otherwise, there is no big
difference between the read and write lock. memcpy to a new allocation
could trigger page faults, new page allocation overhead, and so on.
 
> Another aspect where I am similarly illiterate is performance effect of tlb
> flushes needed for such operation.

MADV_DONTNEED does tlb flush too.

> We can certainly experiment and find that limit. But if mremap threshold is
> going to be large, then perhaps this kernel feature is not as useful as we
> may
> hope.

There are a lot of factors here.
For mremap, the overhead is:
- mmap_sem write lock
- tlb flush

For memcpy + madvise, the overhead is:
- memcpy
- the new address triggers page faults (allocate new pages, handle the faults)
- MADV_DONTNEED on the old address, if used (tlb flush)

I thought that unless the allocator only uses memcpy for small sizes
(without the madvise, in which case it uses more memory than necessary,
but memcpy of a small size is faster than a tlb flush), mremap is a win.
We could probably measure the memcpy size below which its overhead is
smaller than that of a tlb flush.
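
A rough harness along those lines is sketched below. MREMAP_NOHOLE is only
available on a kernel with this patch applied, and the numbers it prints
illustrate the method rather than being results:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/mman.h>

#ifndef MREMAP_NOHOLE
#define MREMAP_NOHOLE 4     /* value from this patch; not in mainline headers */
#endif

static double now_sec(void)
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
}

int main(void)
{
        for (size_t size = 4096; size <= (64UL << 20); size *= 2) {
                char *src = mmap(NULL, size, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                char *dst = mmap(NULL, size, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (src == MAP_FAILED || dst == MAP_FAILED)
                        return 1;

                memset(src, 0xff, size);                /* populate the source */
                double t0 = now_sec();
                memcpy(dst, src, size);                 /* copy path ... */
                madvise(src, size, MADV_DONTNEED);      /* ... plus the purge */
                double t1 = now_sec();

                memset(src, 0xff, size);                /* repopulate for the move path */
                double t2 = now_sec();
                mremap(src, size, size,                 /* needs a patched kernel */
                       MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE, dst);
                double t3 = now_sec();

                printf("%8zu KiB: memcpy+madvise %8.1f us, mremap %8.1f us\n",
                       size >> 10, (t1 - t0) * 1e6, (t3 - t2) * 1e6);
                munmap(src, size);
                munmap(dst, size);
        }
        return 0;
}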

> b) is that optimization worth having at all ?
> 
> After all, memcpy is actually known to be fast. I understand that copying
> memory
> in user space can be slowed down by minor page faults (results below seem to
> confirm that). But this is something where either allocator may retain
> populated
> pages a bit longer or where kernel could help. E.g. maybe by exposing
> something
> similar to MAP_POPULATE in madvise, or even doing some safe combination of
> madvise and MAP_UNINITIALIZED.

This option will make the allocator use more memory than expected.
Eventually that memory must be reclaimed, which has a big overhead too.
 
> I've played with Daniel's original benchmark (copied from
> http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
> modifications:
> 
> #include <string.h>
> #include <stdlib.h>
> #include <stdio.h>
> #include <sys/mman.h>
> 
> int main(int argc, char **argv)
> {
>         if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
>                 int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
>                 if (rv) {
>                         perror("mlockall");
>                         abort();
>                 }
>                 puts("mlocked!");
>         }
> 
>         for (size_t i = 0; i < 64; i++) {
>                 void *ptr = NULL;
>                 size_t old_size = 0;
>                 for (size_t size = 4; size < (1 << 30); size *= 2) {
>                         /*
>                          * void *hole = malloc(1 << 20);
>                          * if (!hole) {
>                          *      perror("malloc");
>                          *      abort();
>                          * }
>                          */
>                         ptr = realloc(ptr, size);
>                         if (!ptr) {
>                                 perror("realloc");
>                                 abort();
>                         }
>                         /* free(hole); */
>                         memset(ptr + old_size, 0xff, size - old_size);
>                         old_size = size;
>                 }
>                 free(ptr);
>         }
> }
> 
> I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
> important applications or not. It can be argued that apps that care about
> such
> large vectors can do mremap themselves.
> 
> On the other hand, I believe that this micro benchmark could be plausibly
> changed to grow vector by smaller factor (i.e. see
> https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
> And
> with smaller growth factor, is seems reasonable to expect larger overhead
> from
> memcpy and smaller overhead from mremap. And thus favor mremap more.
> 
> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple weeks
> or so
> ago from dev branch, so it might simply have bugs.
> 
> Results also vary greatly depending in transparent huge pages setting.
> Here's
> what I've got:
> 
> allocator |   mode    | time  | sys time | pgfaults |             extra
> ----------+-----------+-------+----------+----------+-------------------------------
> glibc     |           | 10.75 |     8.44 |  8388770 |
> glibc     |    thp    |  5.67 |     3.44 |   310882 |
> glibc     |   mlock   | 13.22 |     9.41 |  8388821 |
> glibc     | thp+mlock |  8.43 |     4.63 |   310933 |
> tcmalloc  |           | 11.46 |     2.00 |  2104826 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |    thp    | 10.61 |     0.89 |   386206 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |   mlock   | 10.11 |     0.27 |   264721 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  | thp+mlock | 10.28 |     0.17 |    46011 | TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |           | 23.63 |    17.16 | 16770107 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  |    thp    | 11.82 |     5.14 |   352477 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  |   mlock   | 10.10 |     0.28 |   264724 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  | thp+mlock | 10.30 |     0.17 |    49168 | TCMALLOC_AGGRESSIVE_DECOMMIT=t
> jemalloc1 |           | 23.71 |    17.33 | 16744572 |
> jemalloc1 |    thp    | 11.65 |     4.68 |    64988 |
> jemalloc1 |   mlock   | 10.13 |     0.29 |   263305 |
> jemalloc1 | thp+mlock | 10.05 |     0.17 |    50217 |
> jemalloc2 |           | 10.87 |     8.64 |  8521796 |
> jemalloc2 |    thp    |  4.64 |     2.32 |    56060 |
> jemalloc2 |   mlock   |  4.22 |     0.28 |   263181 |
> jemalloc2 | thp+mlock |  4.12 |     0.19 |    50411 |
> ----------+-----------+-------+----------+----------+-------------------------------
> 
> NOTE: usual disclaimer applies about possibility of screwing something up
> and
> getting invalid benchmark results without being able to see it. I apologize
> in
> advance.
> 
> NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is
> home-built snapshot of upcoming jemalloc 4.0.
> 
> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts and
> thus
> on runtime.
> 
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
> is
> doing in this benchmark.

mlock disables MADV_DONTNEED, so this is an unfair comparison. With it, the
allocator will use more memory than expected.

I'm kind of confused about why we are talking about THP and mlock here.
When an application uses an allocator, it doesn't need to be forced to use
THP or mlock. Can we focus on the normal case?

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-22  7:22               ` Daniel Micay
@ 2015-03-24  4:36                   ` Aliaksey Kandratsenka
  -1 siblings, 0 replies; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-24  4:36 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw

Hi.

First of all, I'd like to apologize for messing up formatting of my
past email. I've learned my lesson.

On Sun, Mar 22, 2015 at 12:22 AM, Daniel Micay <danielmicay-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> My initial thinking was that we'd likely use mremap in all cases where
>> we know
>> that touching destination would cause minor page faults (i.e. when
>> destination
>> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also
>> always when
>> size is large enough, i.e. because "teleporting" large count of pages is
>> likely
>> to be faster than copying them.
>>
>> But now I realize that it is more interesting than that. I.e. because as
>> Daniel
>> pointed out, mremap holds mmap_sem exclusively, while page faults are
>> holding it
>> for read. That could be optimized of course. Either by separate
>> "teleport ptes"
>> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem
>> for write
>> and retaking it for read for "moving pages" part of work. Being not really
>> familiar with kernel code I have no idea if that's doable or not. But it
>> looks
>> like it might be quite important.
>
> I think it's doable but it would pessimize the case where the dest VMA
> isn't reusable. It would need to optimistically take the reader lock to
> find out and then drop it. However, userspace knows when this is surely
> going to work and could give it a hint.
>
> I have a good idea about what the *ideal* API for the jemalloc/tcmalloc
> case would be. It would be extremely specific though... they want the
> kernel to move pages from a source VMA to a destination VMA where both
> are anon/private with identical flags so only the reader lock is
> necessary. On top of that, they really want to keep around as many
> destination pages as possible, maybe by swapping as many as possible
> back to the source.
>
> That's *extremely* specific though and I now think the best way to get
> there is by landing this feature and then extending it as necessary down
> the road. An allocator may actually want to manage other kinds of
> mappings itself and it would want the mmap_sem optimization to be an
> optional hint.

Interesting. But what might be the other users of MREMAP_NOHOLE/MREMAP_RETAIN?

I believe it can be argued that an "exchange vmas/pages" operation as a
separate syscall is actually more general and thus possibly a more useful
thing to have, regardless of locking. And the MREMAP_NOHOLE/MREMAP_RETAIN
functionality could be built on top of that syscall in userspace if needed
(with more than one syscall, naturally, but maybe still with relatively
small overhead).

I'm not saying this is a good idea, I'm just asking.

And here is another observation just to make sure that more options
are considered.

Given that mremap is holding mmap_sem exclusively, how about the userspace
malloc implementation taking some exclusive malloc lock and doing a normal
mremap followed by an mmap with MAP_FIXED to fill the hole? It might end up
having largely the same overhead, modulo some extra TLB flushing. But
arguably, reducing TLB flushes for a sequence of page table updates could be
usefully addressed separately (e.g. maybe by matching those syscalls, maybe
via syslets).
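
Spelled out, the proposed sequence would be roughly the following
(illustrative sketch; error handling is minimal, and the MAP_FIXED refill
assumes nothing else in the process maps memory outside the allocator's
lock):

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

/* Sketch of the sequence proposed above: a plain mremap to move the pages,
 * then a fresh anonymous mapping to fill the hole left in the source chunk.
 * Assumes the allocator's own lock is held; note that MAP_FIXED still races
 * with any other thread calling mmap directly. */
void *move_and_refill(void *old_ptr, size_t size, void *new_ptr)
{
        void *moved = mremap(old_ptr, size, size,
                             MREMAP_MAYMOVE | MREMAP_FIXED, new_ptr);
        if (moved == MAP_FAILED)
                return MAP_FAILED;

        /* Refill the hole so the source chunk stays fully mapped. */
        mmap(old_ptr, size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
        return moved;
}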

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-23  5:17           ` Shaohua Li
@ 2015-03-24  5:25             ` Aliaksey Kandratsenka
       [not found]               ` <CADpJO7zk8J3q7Bw9NibV9CzLarO+YkfeshyFTTq=XeS5qziBiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 44+ messages in thread
From: Aliaksey Kandratsenka @ 2015-03-24  5:25 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Daniel Micay, Andrew Morton, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

Hi.

On Sun, Mar 22, 2015 at 10:17 PM, Shaohua Li <shli@fb.com> wrote:
> On Sat, Mar 21, 2015 at 11:06:14PM -0700, Aliaksey Kandratsenka wrote:

>> But now I realize that it is more interesting than that. I.e. because as
>> Daniel
>> pointed out, mremap holds mmap_sem exclusively, while page faults are
>> holding it
>> for read. That could be optimized of course. Either by separate "teleport
>> ptes"
>> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
>> write
>> and retaking it for read for "moving pages" part of work. Being not really
>> familiar with kernel code I have no idea if that's doable or not. But it
>> looks
>> like it might be quite important.
>
> Does mmap_sem contend in your workload? Otherwise, there is no big
> difference of read or write lock. memcpy to new allocation could trigger
> page fault, new page allocation overhead and etc.

Well, I don't have any workloads. I'm just maintaining a library that
others run various workloads on. Part of the problem is the lack of good
and varied malloc benchmarks which could allow us to prevent regressions.
So this makes me a bit more cautious on performance matters.

But I see your point. Indeed I have no evidence at all that exclusive
locking might cause observable performance difference.

>> b) is that optimization worth having at all ?
>>
>> After all, memcpy is actually known to be fast. I understand that copying
>> memory
>> in user space can be slowed down by minor page faults (results below seem to
>> confirm that). But this is something where either allocator may retain
>> populated
>> pages a bit longer or where kernel could help. E.g. maybe by exposing
>> something
>> similar to MAP_POPULATE in madvise, or even doing some safe combination of
>> madvise and MAP_UNINITIALIZED.
>
> This option will make allocator use more memory than expected.
> Eventually the memory must be reclaimed, which has big overhead too.
>
>> I've played with Daniel's original benchmark (copied from
>> http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
>> modifications:
>>

...

>> Another notable thing is how mlock effectively disables MADV_DONTNEED for
>> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
>> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
>> slightly better on runtime to glibc. The later spends a ton of time in
>> kernel,
>> probably handling minor page faults, and the former burns cpu in user space
>> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
>> is
>> doing in this benchmark.
>
> mlock disables MADV_DONTNEED, so this is an unfair comparsion. With it,
> allocator will use more memory than expected.

I do not agree that it is unfair. I'm actually hoping for MADV_FREE to
provide most if not all of the benefits of mlock in this benchmark. I
believe that's not too unreasonable an expectation.

>
> I'm kind of confused why we talk about THP, mlock here. When application
> uses allocator, it doesn't need to be forced to use THP or mlock. Can we
> forcus on normal case?

See my note on mlock above.

THP is actually "normal". I know for certain that many production
workloads are run on boxes with THP enabled. Red Hat famously ships its
distros with THP set to "always". And I also know that many other
production workloads are run on boxes with THP disabled. Also, as seen
above, "teleporting" pages is more efficient with THP due to the much
smaller overhead of moving those pages. So I felt it was important not
to omit THP from my runs.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-24  5:25             ` Aliaksey Kandratsenka
@ 2015-03-24 14:39                   ` Daniel Micay
  0 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-24 14:39 UTC (permalink / raw)
  To: Aliaksey Kandratsenka, Shaohua Li
  Cc: Andrew Morton, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw

[-- Attachment #1: Type: text/plain, Size: 3787 bytes --]

On 24/03/15 01:25 AM, Aliaksey Kandratsenka wrote:
> 
> Well, I don't have any workloads. I'm just maintaining a library that
> others run various workloads on. Part of the problem is lack of good
> and varied malloc benchmarks which could allow us that prevent
> regression. So this makes me a bit more cautious on performance
> matters.
> 
> But I see your point. Indeed I have no evidence at all that exclusive
> locking might cause observable performance difference.

I'm sure it matters but I expect you'd need *many* cores running many
threads before it started to outweigh the benefit of copying pages
instead of data.

Thinking about it a bit more, it would probably make sense for mremap to
start with the optimistic assumption that the reader lock is enough here
when using MREMAP_NOHOLE|MREMAP_FIXED. It only needs the writer lock if
the destination mapping is incomplete or doesn't match, which is an edge
case as holes would mean thread unsafety.

An ideal allocator will toggle on PROT_NONE when overcommit is disabled
so this assumption would be wrong. The heuristic could just be adjusted
to assume the dest VMA will match with MREMAP_NOHOLE|MREMAP_FIXED when
full memory accounting isn't enabled. The fallback would never end up
being needed in existing use cases that I'm aware of, and would just add
the overhead of a quick lock, O(log n) check and unlock with the reader
lock held anyway. Another flag isn't really necessary.

>>> Another notable thing is how mlock effectively disables MADV_DONTNEED for
>>> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
>>> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
>>> slightly better on runtime to glibc. The later spends a ton of time in
>>> kernel,
>>> probably handling minor page faults, and the former burns cpu in user space
>>> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
>>> is
>>> doing in this benchmark.
>>
>> mlock disables MADV_DONTNEED, so this is an unfair comparsion. With it,
>> allocator will use more memory than expected.
> 
> Do not agree with unfair. I'm actually hoping MADV_FREE to provide
> most if not all of benefits of mlock in this benchmark. I believe it's
> not too unreasonable expectation.

MADV_FREE will still result in as many page faults, just no zeroing.

I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
3.5 *million* requests/s. It has a similar impact with TCMalloc.

>> I'm kind of confused why we talk about THP, mlock here. When application
>> uses allocator, it doesn't need to be forced to use THP or mlock. Can we
>> forcus on normal case?
> 
> See my note on mlock above.
> 
> THP it is actually "normal". I know for certain, that many production
> workloads are run on boxes with THP enabled. Red Hat famously ships
> it's distros with THP set to "always". And I also know that some other
> many production workloads are run on boxes with THP disabled. Also, as
> seen above, "teleporting" pages is more efficient with THP due to much
> smaller overhead of moving those pages. So I felt it was important not
> to omit THP in my runs.

Yeah, it's quite normal for it to be enabled. Allocators might as well
give up on fine-grained purging when it is though :P. I think it only
really makes sense to purge at 2M boundaries in multiples of 2M if it's
going to end up breaking any other purging over the long-term.

I was originally only testing with THP since Arch uses "always" but I
realized it had an enormous impact and started testing without it too.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-24  4:36                   ` Aliaksey Kandratsenka
  (?)
@ 2015-03-24 14:54                   ` Daniel Micay
  -1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-24 14:54 UTC (permalink / raw)
  To: Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

[-- Attachment #1: Type: text/plain, Size: 1840 bytes --]

> Given that mremap is holding mmap_sem exclusively, how about userspace
> malloc implementation taking some exclusive malloc lock and doing
> normal mremap followed by mmap with MAP_FIXED to fill the hole ? It
> might end up having largely same overhead. Well, modulo some extra TLB
> flushing. But arguably, reducing TLB flushes for sequence of page
> table updates could be usefully addressed separately (e.g. maybe by
> matching those syscalls, maybe via syslets).

You can't use MAP_FIXED because it has a race with other users of mmap.

The address hint will *usually* work, but you need to deal with the case
where it fails and then cope with the fallout of the fragmentation.

PaX ASLR ignores address hints so that's something else to consider if
you care about running on PaX/Grsecurity patched kernels.

I'm doing this in my own allocator that's heavily based on the jemalloc
design. It just unmaps the memory given by the hinted mmap call if it
fails to get back the hole:

https://github.com/thestinger/allocator/blob/e80d2d0c2863c490b650ecffeb33beaccfcfdc46/huge.c#L167-L180
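
A simplified sketch of that pattern (not the code from the linked
allocator): request the old address as a hint, and give the mapping back if
the kernel places it elsewhere:

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

/* Try to reclaim a hole at `hole` of length `size` without MAP_FIXED: pass
 * the old address as a hint, and if the kernel (or PaX ASLR, which ignores
 * hints) places the mapping elsewhere, hand it back and report failure so
 * the caller can cope with the fragmentation instead of clobbering another
 * mapping. */
int try_refill_hole(void *hole, size_t size)
{
        void *p = mmap(hole, size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return -1;
        if (p != hole) {
                munmap(p, size);
                return -1;
        }
        return 0;
}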

On 64-bit, it relies on 1TiB of reserved address space (works even with
overcommit disabled) to do per-CPU allocation for chunks and huge (>=
chunk size) allocations via address range checks so it also needs this
ugly workaround too:

https://github.com/thestinger/allocator/blob/e80d2d0c2863c490b650ecffeb33beaccfcfdc46/huge.c#L67-L75

I'm convinced that the mmap_sem writer lock can be avoided for the case
with MREMAP_FIXED via a good heuristic though. It just needs to check
that dst is a single VMA that matches the src properties and fall back
to the writer lock if that's not the case. This will have the same
performance as a separate syscall to move pages in all the cases where
that syscall would work.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-24 14:39                   ` Daniel Micay
  (?)
@ 2015-03-25  5:02                   ` Shaohua Li
  -1 siblings, 0 replies; 44+ messages in thread
From: Shaohua Li @ 2015-03-25  5:02 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Aliaksey Kandratsenka, Andrew Morton, linux-mm, linux-api,
	Rik van Riel, Hugh Dickins, Mel Gorman, Johannes Weiner,
	Michal Hocko, Andy Lutomirski, google-perftools

On Tue, Mar 24, 2015 at 10:39:32AM -0400, Daniel Micay wrote:
> On 24/03/15 01:25 AM, Aliaksey Kandratsenka wrote:
> > 
> > Well, I don't have any workloads. I'm just maintaining a library that
> > others run various workloads on. Part of the problem is lack of good
> > and varied malloc benchmarks which could allow us that prevent
> > regression. So this makes me a bit more cautious on performance
> > matters.
> > 
> > But I see your point. Indeed I have no evidence at all that exclusive
> > locking might cause observable performance difference.
> 
> I'm sure it matters but I expect you'd need *many* cores running many
> threads before it started to outweigh the benefit of copying pages
> instead of data.
> 
> Thinking about it a bit more, it would probably make sense for mremap to
> start with the optimistic assumption that the reader lock is enough here
> when using MREMAP_NOHOLE|MREMAP_FIXED. It only needs the writer lock if
> the destination mapping is incomplete or doesn't match, which is an edge
> case as holes would mean thread unsafety.
> 
> An ideal allocator will toggle on PROT_NONE when overcommit is disabled
> so this assumption would be wrong. The heuristic could just be adjusted
> to assume the dest VMA will match with MREMAP_NOHOLE|MREMAP_FIXED when
> full memory accounting isn't enabled. The fallback would never ended up
> being needed in existing use cases that I'm aware of, and would just add
> the overhead of a quick lock, O(log n) check and unlock with the reader
> lock held anyway. Another flag isn't really necessary.
> 
> >>> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> >>> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> >>> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> >>> slightly better on runtime to glibc. The later spends a ton of time in
> >>> kernel,
> >>> probably handling minor page faults, and the former burns cpu in user space
> >>> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
> >>> is
> >>> doing in this benchmark.
> >>
> >> mlock disables MADV_DONTNEED, so this is an unfair comparsion. With it,
> >> allocator will use more memory than expected.
> > 
> > Do not agree with unfair. I'm actually hoping MADV_FREE to provide
> > most if not all of benefits of mlock in this benchmark. I believe it's
> > not too unreasonable expectation.
> 
> MADV_FREE will still result in as many page faults, just no zeroing.
> 
> I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
> dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
> IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
> 3.5 *million* requests/s. It has a similar impact with TCMalloc.

MADV_FREE has a side effect (much like using mlock): it causes more memory
to be used. It is lazy memory freeing, so if there is no memory pressure
you can think of MADV_FREE as a no-op. No doubt you will see an improvement
in that case. But if there is memory pressure, it is a completely different
story.
 
> >> I'm kind of confused why we talk about THP, mlock here. When application
> >> uses allocator, it doesn't need to be forced to use THP or mlock. Can we
> >> forcus on normal case?
> > 
> > See my note on mlock above.
> > 
> > THP it is actually "normal". I know for certain, that many production
> > workloads are run on boxes with THP enabled. Red Hat famously ships
> > it's distros with THP set to "always". And I also know that some other
> > many production workloads are run on boxes with THP disabled. Also, as
> > seen above, "teleporting" pages is more efficient with THP due to much
> > smaller overhead of moving those pages. So I felt it was important not
> > to omit THP in my runs.
> 
> Yeah, it's quite normal for it to be enabled. Allocators might as well
> give up on fine-grained purging when it is though :P. I think it only
> really makes sense to purge at 2M boundaries in multiples of 2M if it's
> going to end up breaking any other purging over the long-term.
> 
> I was originally only testing with THP since Arch uses "always" but I
> realized it had an enormous impact and started testing without it too.

Hmm, I didn't intend to ignore THP, I just can't understand why it
matters. There is extra overhead when purging or mremap-ing THP pages (if
the range isn't 2M aligned and a multiple of 2M), but other than that
there is no other difference that I can see; your test results don't
suggest this, though. I guess we should understand why THP makes such a
big difference.

Thanks,
Shaohua


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-22  7:22               ` Daniel Micay
@ 2015-03-25 16:22                   ` Vlastimil Babka
  -1 siblings, 0 replies; 44+ messages in thread
From: Vlastimil Babka @ 2015-03-25 16:22 UTC (permalink / raw)
  To: Daniel Micay, Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw

On 03/22/2015 08:22 AM, Daniel Micay wrote:
> BTW, THP currently interacts very poorly with the jemalloc/tcmalloc
> madvise purging. The part where khugepaged assigns huge pages to dense
> spans of pages is *great*. The part where the kernel hands out a huge
> page for a fault in a 2M span can be awful. It causes the model
> inside the allocator of uncommitted vs. committed pages to break down.
>
> For example, the allocator might use 1M of a huge page and then start
> purging. The purging will split it into 4k pages, so there will be 1M of
> zeroed 4k pages that are considered purged by the allocator. Over time,
> this can cripple purging. Search for "jemalloc huge pages" and you'll
> find lots of horror stories about this.

I'm not sure I get your description right. The problem I know about is
where "purging" means madvise(MADV_DONTNEED) and khugepaged later
collapses a new hugepage that will repopulate the purged parts,
increasing the memory usage. One can limit this via
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none. That
setting doesn't affect the page fault THP allocations, which, however,
happen only in newly accessed hugepage-sized areas and not in partially
purged ones.

> I think a THP implementation playing that played well with purging would
> need to drop the page fault heuristic and rely on a significantly better
> khugepaged.

See here http://lwn.net/Articles/636162/ (the "Compaction" part)

The objection is that some short-lived workloads like gcc have to map 
hugepages immediately if they are to benefit from them. I still plan to 
improve khugepaged and allow admins to say that they don't want THP page 
faults (and rely solely on khugepaged which has more information to 
judge additional memory usage), but I'm not sure if it would be an 
acceptable default behavior.
One workaround in the current state for jemalloc and friends could be to 
use madvise(MADV_NOHUGEPAGE) on hugepage-sized/aligned areas where it 
wants to purge parts of them via madvise(MADV_DONTNEED). It could mean 
overhead of another syscall and tracking of where this was applied and 
when it makes sense to undo this and allow THP to be collapsed again, 
though, and it would also split vma's.
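
In code, that workaround would look roughly like the following sketch; the
bookkeeping for when to undo MADV_NOHUGEPAGE is the part left out here:

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

/* Purge part of a hugepage-sized, hugepage-aligned run while telling the
 * kernel not to collapse it back into a THP behind the allocator's back.
 * The extra syscall and the resulting vma split are the costs noted above;
 * a real allocator would also need to track when to undo MADV_NOHUGEPAGE. */
void purge_without_thp(void *run, size_t run_size,
                       void *part, size_t part_size)
{
        madvise(run, run_size, MADV_NOHUGEPAGE);
        madvise(part, part_size, MADV_DONTNEED);
}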

> This would mean faulting in a span of memory would no longer
> be faster. Having a flag to populate a range with madvise would help a

If it's newly mapped memory, there's mmap(MAP_POPULATE). There is also
madvise(MADV_WILLNEED), which sounds like what you want, but I don't know
what the implementation does exactly - it was apparently added for paging
in ahead, and maybe it ignores unpopulated anonymous areas, but it would
probably be well in the spirit of the flag to make it prepopulate those.
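
For reference, the two existing knobs mentioned here, in a minimal sketch;
whether MADV_WILLNEED prefaults unpopulated anonymous memory is exactly the
open question above:

#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

/* New mapping, prefaulted at mmap time. */
void *map_populated(size_t size)
{
        return mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
}

/* Existing mapping: hint that the range will be needed soon. */
void hint_will_need(void *addr, size_t size)
{
        madvise(addr, size, MADV_WILLNEED);
}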

> lot though, since the allocator knows exactly how much it's going to
> clobber with the memcpy. There will still be a threshold where mremap
> gets significantly faster, but it would move it higher.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-25 16:22                   ` Vlastimil Babka
  (?)
@ 2015-03-25 20:49                   ` Daniel Micay
  2015-03-25 20:54                     ` Daniel Micay
       [not found]                     ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  -1 siblings, 2 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-25 20:49 UTC (permalink / raw)
  To: Vlastimil Babka, Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

[-- Attachment #1: Type: text/plain, Size: 3719 bytes --]

On 25/03/15 12:22 PM, Vlastimil Babka wrote:
> 
> I'm not sure I get your description right. The problem I know about is
> where "purging" means madvise(MADV_DONTNEED) and khugepaged later
> collapses a new hugepage that will repopulate the purged parts,
> increasing the memory usage. One can limit this via
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
> setting doesn't affect the page fault THP allocations, which however
> happen only in newly accessed hugepage-sized areas and not partially
> purged ones, though.

Since jemalloc doesn't unmap memory but instead does recycling itself in
userspace, it ends up with large spans of free virtual memory and gets
*lots* of huge pages from the page fault heuristic. It keeps track of
active vs. dirty (not purged) vs. clean (purged / untouched) ranges
everywhere, and will purge dirty ranges as they build up.

The THP allocation on page faults means it ends up with memory that's
supposed to be clean but is really not.

A worst case example with the (up until recently) default chunk size of
4M is allocating a bunch of 2.1M allocations. Chunks are naturally
aligned, so each one can be represented as 2 huge pages. It increases
memory usage by nearly *50%*. The allocator thinks the tail is clean
memory, but it's not. When the allocations are freed, it will purge the
2.1M at the head (once enough dirty memory builds up) but all of the
tail memory will be leaked until something else is allocated there and
then freed.
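
That part is easy to reproduce with a small test (a rough sketch, assuming
x86_64 with 4k base pages, 2M THP and THP enabled for the mapping; the
numbers will differ elsewhere):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	size_t huge = 2UL << 20;
	unsigned char vec[512];		/* 2M / 4k base pages */
	size_t resident = 0, i;

	/* Over-map so we can pick a 2M-aligned span inside the mapping;
	 * the fault-time huge page is only used for aligned 2M ranges. */
	char *raw = mmap(NULL, 2 * huge, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	char *p = (char *)(((uintptr_t)raw + huge - 1) & ~(huge - 1));

	madvise(p, huge, MADV_HUGEPAGE);	/* needed if THP is set to "madvise" */
	p[0] = 1;				/* a single one-byte fault */

	mincore(p, huge, vec);
	for (i = 0; i < 512; i++)
		resident += vec[i] & 1;

	/* With a huge page handed out at fault this typically prints 512,
	 * i.e. the whole 2M became resident; with THP off it prints 1. */
	printf("%zu of 512 pages resident\n", resident);
	return 0;
}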

>> I think a THP implementation that played well with purging would
>> need to drop the page fault heuristic and rely on a significantly better
>> khugepaged.
> 
> See here http://lwn.net/Articles/636162/ (the "Compaction" part)
> 
> The objection is that some short-lived workloads like gcc have to map
> hugepages immediately if they are to benefit from them. I still plan to
> improve khugepaged and allow admins to say that they don't want THP page
> faults (and rely solely on khugepaged which has more information to
> judge additional memory usage), but I'm not sure if it would be an
> acceptable default behavior.
> One workaround in the current state for jemalloc and friends could be to
> use madvise(MADV_NOHUGEPAGE) on hugepage-sized/aligned areas where it
> wants to purge parts of them via madvise(MADV_DONTNEED). It could mean
> the overhead of another syscall, plus tracking where this was applied and
> when it makes sense to undo it and allow THP to be collapsed again, and
> it would also split VMAs.

Huge pages do significantly help performance though, and this would
pretty much mean no huge pages. The overhead of toggling it on and off
based on whether it's a < chunk size allocation or a >= chunk size one
is too high.

The page fault heuristic is just way too aggressive because there's no
indication of how much memory will be used. I don't think it makes sense
to do it without an explicit MADV_NOHUGEPAGE. Collapsing only dense
ranges doesn't have the same risk.

>> This would mean faulting in a span of memory would no longer
>> be faster. Having a flag to populate a range with madvise would help a
> 
> If it's newly mapped memory, there's mmap(MAP_POPULATE). There is also
> madvise(MADV_WILLNEED), which sounds like what you want, but I don't
> know exactly what the implementation does - it was apparently added for
> read-ahead paging, and maybe it ignores unpopulated anonymous areas, but
> it would probably be well in the spirit of the flag to make it prepopulate
> those.

It doesn't seem to do anything for anon mappings atm but I do see a
patch from 2008 for that. I guess it never landed.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-25 20:49                   ` Daniel Micay
@ 2015-03-25 20:54                     ` Daniel Micay
       [not found]                     ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  1 sibling, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-25 20:54 UTC (permalink / raw)
  To: Vlastimil Babka, Aliaksey Kandratsenka
  Cc: Andrew Morton, Shaohua Li, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools

[-- Attachment #1: Type: text/plain, Size: 304 bytes --]

> The page fault heuristic is just way too aggressive because there's no
> indication of how much memory will be used. I don't think it makes sense
> to do it without an explicit MADV_NOHUGEPAGE. Collapsing only dense
> ranges doesn't have the same risk.

Er, without an explicit MADV_HUGEPAGE*.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-25 20:49                   ` Daniel Micay
@ 2015-03-26  0:19                         ` David Rientjes
       [not found]                     ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  1 sibling, 0 replies; 44+ messages in thread
From: David Rientjes @ 2015-03-26  0:19 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton,
	Shaohua Li, linux-mm, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools

On Wed, 25 Mar 2015, Daniel Micay wrote:

> > I'm not sure I get your description right. The problem I know about is
> > where "purging" means madvise(MADV_DONTNEED) and khugepaged later
> > collapses a new hugepage that will repopulate the purged parts,
> > increasing the memory usage. One can limit this via
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
> > setting doesn't affect the page fault THP allocations, which however
> > happen only in newly accessed hugepage-sized areas and not partially
> > purged ones, though.
> 
> Since jemalloc doesn't unmap memory but instead does recycling itself in
> userspace, it ends up with large spans of free virtual memory and gets
> *lots* of huge pages from the page fault heuristic. It keeps track of
> active vs. dirty (not purged) vs. clean (purged / untouched) ranges
> everywhere, and will purge dirty ranges as they build up.
> 
> The THP allocation on page faults means it ends up with memory that's
> supposed to be clean but is really not.
> 
> A worst case example with the (up until recently) default chunk size of
> 4M is allocating a bunch of 2.1M allocations. Chunks are naturally
> aligned, so each one can be represented as 2 huge pages. It increases
> memory usage by nearly *50%*. The allocator thinks the tail is clean
> memory, but it's not. When the allocations are freed, it will purge the
> 2.1M at the head (once enough dirty memory builds up) but all of the
> tail memory will be leaked until something else is allocated there and
> then freed.
> 

With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges 
for size classes <= 2MB, allocate its own metadata from an arena that is 
also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the 
longest span on the freelist when it does periodic memory freeing back to 
the kernel, and even better if the freed memory splits at most one 
hugepage.  When memory is pulled from the freelist of memory that has 
already been returned to the kernel, you can return a span that will make 
it eligible to be collapsed into a hugepage based on your setting of 
max_ptes_none, trying to consolidate the memory as much as possible.  If 
your malloc is implemented in a way to understand the benefit of 
hugepages, and how much memory you're willing to sacrifice (max_ptes_none) 
for it, then you should _never_ be increasing memory usage by 50%.
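
A minimal sketch of the "expand in 2MB ranges" part (grow_heap_span() is a
hypothetical helper, not tcmalloc's code; since mmap() gives no alignment
guarantee, the usual trick is to over-map and trim):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdint.h>
#include <stddef.h>

#define SPAN (2UL << 20)	/* one hugepage-sized span */

/* Return a 2M-aligned, 2M-sized anonymous span, eligible for THP. */
static void *grow_heap_span(void)
{
	size_t len = 2 * SPAN;
	char *raw, *aligned;

	raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return NULL;

	aligned = (char *)(((uintptr_t)raw + SPAN - 1) & ~(SPAN - 1));

	/* Trim the unaligned head and tail so only the aligned 2M remains. */
	if (aligned > raw)
		munmap(raw, aligned - raw);
	if (aligned + SPAN < raw + len)
		munmap(aligned + SPAN, raw + len - (aligned + SPAN));
	return aligned;
}

Every span handed out this way is collapse-friendly for khugepaged and can
be given back with a single madvise(MADV_DONTNEED) on the whole span.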

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  0:19                         ` David Rientjes
  (?)
@ 2015-03-26  0:24                         ` Daniel Micay
       [not found]                           ` <551351CA.3090803-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  -1 siblings, 1 reply; 44+ messages in thread
From: Daniel Micay @ 2015-03-26  0:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton,
	Shaohua Li, linux-mm, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools

[-- Attachment #1: Type: text/plain, Size: 2727 bytes --]

On 25/03/15 08:19 PM, David Rientjes wrote:
> On Wed, 25 Mar 2015, Daniel Micay wrote:
> 
>>> I'm not sure I get your description right. The problem I know about is
>>> where "purging" means madvise(MADV_DONTNEED) and khugepaged later
>>> collapses a new hugepage that will repopulate the purged parts,
>>> increasing the memory usage. One can limit this via
>>> /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none . That
>>> setting doesn't affect the page fault THP allocations, which however
>>> happen only in newly accessed hugepage-sized areas and not partially
>>> purged ones, though.
>>
>> Since jemalloc doesn't unmap memory but instead does recycling itself in
>> userspace, it ends up with large spans of free virtual memory and gets
>> *lots* of huge pages from the page fault heuristic. It keeps track of
>> active vs. dirty (not purged) vs. clean (purged / untouched) ranges
>> everywhere, and will purge dirty ranges as they build up.
>>
>> The THP allocation on page faults means it ends up with memory that's
>> supposed to be clean but is really not.
>>
>> A worst case example with the (up until recently) default chunk size of
>> 4M is allocating a bunch of 2.1M allocations. Chunks are naturally
>> aligned, so each one can be represented as 2 huge pages. It increases
>> memory usage by nearly *50%*. The allocator thinks the tail is clean
>> memory, but it's not. When the allocations are freed, it will purge the
>> 2.1M at the head (once enough dirty memory builds up) but all of the
>> tail memory will be leaked until something else is allocated there and
>> then freed.
>>
> 
> With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges 
> for size classes <= 2MB, allocate its own metadata from an arena that is 
> also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the 
> longest span on the freelist when it does periodic memory freeing back to 
> the kernel, and even better if the freed memory splits at most one 
> hugepage.  When memory is pulled from the freelist of memory that has 
> already been returned to the kernel, you can return a span that will make 
> it eligible to be collapsed into a hugepage based on your setting of 
> max_ptes_none, trying to consolidate the memory as much as possible.  If 
> your malloc is implemented in a way to understand the benefit of 
> hugepages, and how much memory you're willing to sacrifice (max_ptes_none) 
> for it, then you should _never_ be increasing memory usage by 50%.

If khugepaged was the only source of huge pages, sure. The primary
source of huge pages is the heuristic handing out an entire 2M page on
the first page fault in a 2M range.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-24 14:39                   ` Daniel Micay
  (?)
  (?)
@ 2015-03-26  0:50                   ` Minchan Kim
  2015-03-26  1:21                       ` Daniel Micay
  -1 siblings, 1 reply; 44+ messages in thread
From: Minchan Kim @ 2015-03-26  0:50 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Aliaksey Kandratsenka, Shaohua Li, Andrew Morton, linux-mm,
	linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski, google-perftools

Hello Daniel,

On Tue, Mar 24, 2015 at 10:39:32AM -0400, Daniel Micay wrote:
> On 24/03/15 01:25 AM, Aliaksey Kandratsenka wrote:
> > 
> > Well, I don't have any workloads. I'm just maintaining a library that
> > others run various workloads on. Part of the problem is lack of good
> > and varied malloc benchmarks which could allow us that prevent
> > regression. So this makes me a bit more cautious on performance
> > matters.
> > 
> > But I see your point. Indeed I have no evidence at all that exclusive
> > locking might cause observable performance difference.
> 
> I'm sure it matters but I expect you'd need *many* cores running many
> threads before it started to outweigh the benefit of copying pages
> instead of data.
> 
> Thinking about it a bit more, it would probably make sense for mremap to
> start with the optimistic assumption that the reader lock is enough here
> when using MREMAP_NOHOLE|MREMAP_FIXED. It only needs the writer lock if
> the destination mapping is incomplete or doesn't match, which is an edge
> case as holes would mean thread unsafety.
> 
> An ideal allocator will toggle on PROT_NONE when overcommit is disabled
> so this assumption would be wrong. The heuristic could just be adjusted
> to assume the dest VMA will match with MREMAP_NOHOLE|MREMAP_FIXED when
> full memory accounting isn't enabled. The fallback would never end up
> being needed in existing use cases that I'm aware of, and would just add
> the overhead of a quick lock, O(log n) check and unlock with the reader
> lock held anyway. Another flag isn't really necessary.
> 
> >>> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> >>> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> >>> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> >>> slightly better on runtime to glibc. The later spends a ton of time in
> >>> kernel,
> >>> probably handling minor page faults, and the former burns cpu in user space
> >>> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
> >>> is
> >>> doing in this benchmark.
> >>
> >> mlock disables MADV_DONTNEED, so this is an unfair comparison. With it,
> >> allocator will use more memory than expected.
> > 
> > Do not agree with unfair. I'm actually hoping MADV_FREE to provide
> > most if not all of benefits of mlock in this benchmark. I believe it's
> > not too unreasonable expectation.
> 
> MADV_FREE will still result in as many page faults, just no zeroing.

I didn't follow this thread, but since you mentioned MADV_FREE will
cause many page faults, I'm jumping in here.
One of the benefits of MADV_FREE in the current implementation is to
avoid page faults as well as zeroing.
Why did you see many page faults?


> 
> I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
> dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
> IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
> 3.5 *million* requests/s. It has a similar impact with TCMalloc.

When I tested MADV_FREE with ebizzy, I saw a similar result: two or three
times faster than MADV_DONTNEED. But it's not free. MADV_FREE has a cost of
its own (ie, enumerating all of the page tables in the range, clearing the
dirty bits and flushing the TLB). Of course, it takes mmap_sem with the
read-side lock. If you see a great improvement when you disable purging, I
guess it's mainly because mmap_sem isn't taken, so some threads can allocate
while other threads handle page faults. The reason I think so is that I saw
a similar result when I implemented the vrange syscall, which holds the
mmap_sem read-side lock for a very short time (ie, marking the vma volatile,
ie O(1), while MADV_FREE holds the lock while enumerating all of the pages
in the range, ie O(N)).

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  0:50                   ` Minchan Kim
@ 2015-03-26  1:21                       ` Daniel Micay
  0 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-26  1:21 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Aliaksey Kandratsenka, Shaohua Li, Andrew Morton, linux-mm,
	linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski, google-perftools

[-- Attachment #1: Type: text/plain, Size: 1865 bytes --]

> I didn't follow this thread. However, as you mentioned MADV_FREE will
> make many page fault, I jump into here.
> One of the benefit with MADV_FREE in current implementation is to
> avoid page fault as well as no zeroing.
> Why did you see many page fault?

I think I just misunderstood why it was still so much slower than not
using purging at all.

>> I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
>> dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
>> IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
>> 3.5 *million* requests/s. It has a similar impact with TCMalloc.
> 
> When I tested MADV_FREE with ebizzy, I saw similar result two or three
> times fater than MADV_DONTNEED. But It's no free cost. It incurs MADV_FREE
> cost itself*(ie, enumerating all of page table in the range and clear
> dirty bit and tlb flush). Of course, it has mmap_sem with read-side lock.
> If you see great improve when you disable purging, I guess mainly it's
> caused by no lock of mmap_sem so some threads can allocate while other
> threads can do page fault. The reason I think so is I saw similar result
> when I implemented vrange syscall which hold mmap_sem read-side lock
> during very short time(ie, marking the volatile into vma, ie O(1) while
> MADV_FREE holds a lock during enumerating all of pages in the range, ie O(N))

It stops doing mmap after getting warmed up since it never unmaps so I
don't think mmap_sem is a contention issue. It could just be caused by
the cost of the system call itself and TLB flush. I found perf to be
fairly useless in identifying where the time was being spent.

It might be much more important to purge very large ranges in one go
with MADV_FREE. It's a different direction than the current compromises
forced by MADV_DONTNEED.
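
Concretely, the purge helper I have in mind is something like this (a
sketch only; the #ifdef just guards against headers that predate the
MADV_FREE patch set, and the fallback keeps old kernels working):

#include <sys/mman.h>
#include <errno.h>
#include <stddef.h>

/* Purge one large coalesced range in a single call, preferring MADV_FREE
 * where the kernel supports it and falling back to MADV_DONTNEED. */
static void purge(void *addr, size_t len)
{
#ifdef MADV_FREE
	if (madvise(addr, len, MADV_FREE) == 0 || errno != EINVAL)
		return;
#endif
	madvise(addr, len, MADV_DONTNEED);
}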


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  0:24                         ` Daniel Micay
@ 2015-03-26  2:31                               ` David Rientjes
  0 siblings, 0 replies; 44+ messages in thread
From: David Rientjes @ 2015-03-26  2:31 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton,
	Shaohua Li, linux-mm, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools

On Wed, 25 Mar 2015, Daniel Micay wrote:

> > With tcmalloc, it's simple to always expand the heap by mmaping 2MB ranges 
> > for size classes <= 2MB, allocate its own metadata from an arena that is 
> > also expanded in 2MB range, and always do madvise(MADV_DONTNEED) for the 
> > longest span on the freelist when it does periodic memory freeing back to 
> > the kernel, and even better if the freed memory splits at most one 
> > hugepage.  When memory is pulled from the freelist of memory that has 
> > already been returned to the kernel, you can return a span that will make 
> > it eligible to be collapsed into a hugepage based on your setting of 
> > max_ptes_none, trying to consolidate the memory as much as possible.  If 
> > your malloc is implemented in a way to understand the benefit of 
> > hugepages, and how much memory you're willing to sacrifice (max_ptes_none) 
> > for it, then you should _never_ be increasing memory usage by 50%.
> 
> If khugepaged was the only source of huge pages, sure. The primary
> source of huge pages is the heuristic handing out an entire 2M page on
> the first page fault in a 2M range.
> 

The behavior is a property of what you brk() or mmap() to expand your 
heap; you can intentionally require it to fault hugepages or not fault 
hugepages without any special madvise().

With the example above, the implementation I wrote specifically tries to 
sbrk() in 2MB regions and hands out allocator metadata via a memory arena 
doing the same thing.  Memory is treated as being on a normal freelist so 
that it is considered resident, i.e. the same as faulting 4KB, freeing it, 
before tcmalloc does madvise(MADV_DONTNEED), and we naturally prefer to 
hand that out before going to the returned freelist or mmap() as fallback.  
There will always be fragmentation in your normal freelist spans, so 
there's always wasted memory (with or without thp).  There should never be 
a case where you're always mapping 2MB-aligned regions and then only 
touching a small portion of them; for >2MB size classes you could easily map 
only the size required and you would never get an excess of memory due to 
thp at fault.

I think this may be tangential to the thread, though, since this has 
nothing to do with mremap() or any new mremap() flag.

If the thp faulting behavior is going to be changed, then it would need to 
be something that is opted into and not by any system tunable or madvise() 
flag.  It would probably need to be a prctl(), like PR_SET_THP_DISABLE, 
that would control only fault behavior.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  2:31                               ` David Rientjes
@ 2015-03-26  3:24                                   ` Daniel Micay
  -1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-26  3:24 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton,
	Shaohua Li, linux-mm, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools

[-- Attachment #1: Type: text/plain, Size: 2017 bytes --]

It's all well and good to say that you shouldn't do that, but it's the
basis of the design in jemalloc and other zone-based arena allocators.

There's a chosen chunk size and chunks are naturally aligned. An
allocation is either a span of chunks (chunk-aligned) or has metadata
stored in the chunk header. This also means chunks can be assigned to
arenas for a high level of concurrency. Thread caching is then only
necessary for batching operations to amortize the cost of locking rather
than to reduce contention. Per-CPU arenas can be implemented quite well
by using sched_getcpu() to move threads around whenever it detects that
another thread allocated from the arena.
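
A rough sketch of the sched_getcpu() side of that (arena_t and NARENAS are
placeholders here, not jemalloc's real structures):

#define _GNU_SOURCE
#include <sched.h>

#define NARENAS 64	/* assume one arena per possible CPU, capped */

typedef struct arena {
	/* bins, runs, lock, ... */
	int dummy;
} arena_t;

static arena_t arenas[NARENAS];

/* Key the arena choice off the CPU the thread is currently running on,
 * so each arena tends to be used from a single CPU at a time. */
static arena_t *choose_arena(void)
{
	int cpu = sched_getcpu();

	if (cpu < 0)
		cpu = 0;	/* fall back if sched_getcpu() fails */
	return &arenas[cpu % NARENAS];
}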

With >= 2M chunks, madvise purging works very well at the chunk level
but there's also fine-grained purging within chunks and it completely
breaks down from THP page faults.

The allocator packs memory towards low addresses (address-ordered
best-fit and first-fit can both be done in O(log n) time) so swings in
memory usage will tend to clear large spans of memory which will then
fault in huge pages no matter how it was mapped. Once MADV_FREE can be
used rather than MADV_DONTNEED, this would only happen after memory
pressure... but that's not very comforting.

I don't find it acceptable that programs can have huge (up to ~30% in
real programs) amounts of memory leaked over time due to THP page
faults. This is a very real problem impacting projects like Redis,
MariaDB and Firefox because they all use jemalloc.

https://shk.io/2015/03/22/transparent-huge-pages/
https://www.percona.com/blog/2014/07/23/why-tokudb-hates-transparent-hugepages/
http://dev.nuodb.com/techblog/linux-transparent-huge-pages-jemalloc-and-nuodb
https://bugzilla.mozilla.org/show_bug.cgi?id=770612

Bionic (Android's libc) switched over to jemalloc too.

The only reason you don't hear about this with glibc is because it
doesn't have aggressive, fine-grained purging and a low fragmentation
design in the first place.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  3:24                                   ` Daniel Micay
@ 2015-03-26  3:36                                       ` Daniel Micay
  -1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-26  3:36 UTC (permalink / raw)
  To: David Rientjes
  Cc: Vlastimil Babka, Aliaksey Kandratsenka, Andrew Morton,
	Shaohua Li, linux-mm, linux-api, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools

[-- Attachment #1: Type: text/plain, Size: 501 bytes --]

jemalloc doesn't really use free lists or sbrk for user allocations much
at all: thread caches are arrays of pointers (easier to flush and no
need to deref stale memory), red-black trees manage chunks and runs
within chunks, and runs use bitmaps. It can use sbrk as an alternate
source of chunks, but it defaults to using mmap and there's no real
advantage to switching it.

THP currently seems to be designed around the assumption that all
userspace allocators are variants of dlmalloc...


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  1:21                       ` Daniel Micay
@ 2015-03-26  7:02                           ` Minchan Kim
  -1 siblings, 0 replies; 44+ messages in thread
From: Minchan Kim @ 2015-03-26  7:02 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Aliaksey Kandratsenka, Shaohua Li, Andrew Morton, linux-mm,
	linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski, google-perftools

On Wed, Mar 25, 2015 at 09:21:10PM -0400, Daniel Micay wrote:
> > I didn't follow this thread. However, as you mentioned MADV_FREE will
> > make many page fault, I jump into here.
> > One of the benefit with MADV_FREE in current implementation is to
> > avoid page fault as well as no zeroing.
> > Why did you see many page fault?
> 
> I think I just misunderstood why it was still so much slower than not
> using purging at all.
> 
> >> I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
> >> dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
> >> IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
> >> 3.5 *million* requests/s. It has a similar impact with TCMalloc.
> > 
> > When I tested MADV_FREE with ebizzy, I saw similar result two or three
> > times fater than MADV_DONTNEED. But It's no free cost. It incurs MADV_FREE
> > cost itself*(ie, enumerating all of page table in the range and clear
> > dirty bit and tlb flush). Of course, it has mmap_sem with read-side lock.
> > If you see great improve when you disable purging, I guess mainly it's
> > caused by no lock of mmap_sem so some threads can allocate while other
> > threads can do page fault. The reason I think so is I saw similar result
> > when I implemented vrange syscall which hold mmap_sem read-side lock
> > during very short time(ie, marking the volatile into vma, ie O(1) while
> > MADV_FREE holds a lock during enumerating all of pages in the range, ie O(N))
> 
> It stops doing mmap after getting warmed up since it never unmaps so I
> don't think mmap_sem is a contention issue. It could just be caused by
> the cost of the system call itself and TLB flush. I found perf to be
> fairly useless in identifying where the time was being spent.
> 
> It might be much more important to purge very large ranges in one go
> with MADV_FREE. It's a different direction than the current compromises
> forced by MADV_DONTNEED.
> 

I tested ebizzy + recent jemalloc in my KVM guest.

Apparently, no purging was best (ie, 4925 records/s) while purging with
MADV_DONTNEED was worst (ie, 1814 records/s).
However, on my machine, purging with MADV_FREE was not as bad as yours:

        4338 records/s vs 4925 records/s.

No purging still wins, but if we consider the number of madvise syscalls
between no purging and MADV_FREE purging, it looks better than the current
situation:

        0 vs 43724

One thing I am wondering is why the madvise syscall count increases when
we turn on MADV_FREE compared to MADV_DONTNEED. Maybe it's the aggressive
dirty purging rule inside jemalloc?

Anyway, my point is that the gap between MADV_FREE and no purging on my
machine is not as big as what you saw.

********
#> lscpu

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                12
On-line CPU(s) list:   0-11
Thread(s) per core:    2
Core(s) per socket:    6
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 45
Stepping:              7
CPU MHz:               1200.000
BogoMIPS:              6399.71
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              12288K
NUMA node0 CPU(s):     0-11

*****

ebizzy 0.2 
(C) 2006-7 Intel Corporation
(C) 2007 Valerie Henson <val@nmt.edu>
always_mmap 0
never_mmap 0
chunks 10
prevent coalescing using permissions 0
prevent coalescing using holes 0
random_size 0
chunk_size 5242880
seconds 10
threads 24
verbose 1
linear 0
touch_pages 0
page size 4096
Allocated memory
Wrote memory
Threads starting
Threads finished

******

jemalloc git head
commit 65db63cf3f0c5dd5126a1b3786756486eaf931ba
Author: Jason Evans <je@fb.com>
Date:   Wed Mar 25 18:56:55 2015 -0700

    Fix in-place shrinking huge reallocation purging bugs.


******
1) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.dontneed" strace -c -f ./ebizzy -s $((5<<20))

1814 records/s
real 10.00 s
user 28.18 s
sys  90.08 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 90.78   99.368420        5469     18171           madvise
  9.14   10.001131    10001131         1           nanosleep
  0.05    0.050037         807        62        10 futex
  0.03    0.031721         291       109           mmap
  0.00    0.004455         178        25           set_robust_list
  0.00    0.000129           5        24           clone
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0        35           munmap
  0.00    0.000000           0         2           brk
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
------ ----------- ----------- --------- --------- ----------------
100.00  109.455893                 18501        14 total

2) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.dontneed" MALLOC_CONF=lg_dirty_mult:-1 strace -c -f ./ebizzy -s $((5<<20))

4925 records/s
real 10.00 s
user 119.83 s
sys   0.16 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 82.73    0.821804       15804        52         6 futex
 15.70    0.156000      156000         1           nanosleep
  1.53    0.015186         115       132           mmap
  0.04    0.000349           4        87           munmap
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0         2           brk
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0        24           madvise
  0.00    0.000000           0        24           clone
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0        25           set_robust_list
------ ----------- ----------- --------- --------- ----------------
100.00    0.993339                   419        10 total

3) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.free" strace -c -f ./ebizzy -s $((5<<20))

4338 records/s
real 10.00 s
user 91.40 s
sys  12.58 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 78.39   36.433483         839     43408           madvise
 21.53   10.004889    10004889         1           nanosleep
  0.04    0.020472         394        52        15 futex
  0.03    0.015464         145       107           mmap
  0.00    0.000041           2        24           clone
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0        33           munmap
  0.00    0.000000           0         2           brk 
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0        25           set_robust_list
------ ----------- ----------- --------- --------- ----------------
100.00   46.474349                 43724        19 total

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
@ 2015-03-26  7:02                           ` Minchan Kim
  0 siblings, 0 replies; 44+ messages in thread
From: Minchan Kim @ 2015-03-26  7:02 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Aliaksey Kandratsenka, Shaohua Li, Andrew Morton, linux-mm,
	linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski, google-perftools

On Wed, Mar 25, 2015 at 09:21:10PM -0400, Daniel Micay wrote:
> > I didn't follow this thread. However, as you mentioned MADV_FREE will
> > make many page fault, I jump into here.
> > One of the benefit with MADV_FREE in current implementation is to
> > avoid page fault as well as no zeroing.
> > Why did you see many page fault?
> 
> I think I just misunderstood why it was still so much slower than not
> using purging at all.
> 
> >> I get ~20k requests/s with jemalloc on the ebizzy benchmark with this
> >> dual core ivy bridge laptop. It jumps to ~60k requests/s with MADV_FREE
> >> IIRC, but disabling purging via MALLOC_CONF=lg_dirty_mult:-1 leads to
> >> 3.5 *million* requests/s. It has a similar impact with TCMalloc.
> > 
> > When I tested MADV_FREE with ebizzy, I saw similar result two or three
> > times fater than MADV_DONTNEED. But It's no free cost. It incurs MADV_FREE
> > cost itself*(ie, enumerating all of page table in the range and clear
> > dirty bit and tlb flush). Of course, it has mmap_sem with read-side lock.
> > If you see great improve when you disable purging, I guess mainly it's
> > caused by no lock of mmap_sem so some threads can allocate while other
> > threads can do page fault. The reason I think so is I saw similar result
> > when I implemented vrange syscall which hold mmap_sem read-side lock
> > during very short time(ie, marking the volatile into vma, ie O(1) while
> > MADV_FREE holds a lock during enumerating all of pages in the range, ie O(N))
> 
> It stops doing mmap after getting warmed up since it never unmaps so I
> don't think mmap_sem is a contention issue. It could just be caused by
> the cost of the system call itself and TLB flush. I found perf to be
> fairly useless in identifying where the time was being spent.
> 
> It might be much more important to purge very large ranges in one go
> with MADV_FREE. It's a different direction than the current compromises
> forced by MADV_DONTNEED.
> 

I tested ebizzy + recent jemalloc in my KVM guest.

Apparently, no purging was best(ie, 4925 records/s) while purging with
MADV_DONTNEED was worst(ie, 1814 records/s).
However, in my machine, purging with MADV_FREE was not bad as yourr.

        4338 records/s vs 4925 records/s.

Still, no purging was win but if we consider the num of madvise syscall
between no purging and MADV_FREE purging, it would be better than now.

        0 vs 43724

One thing I am wondering is why the madvise syscall count is increased
when we turns on MADV_FREE compared to MADV_DONTNEED. It might be
aggressive dirty puring rule in jemalloc internal?

Anyway, my point is gap between MADV_FREE and no puring in my machine
is not much like you said.

********
#> lscpu

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                12
On-line CPU(s) list:   0-11
Thread(s) per core:    2
Core(s) per socket:    6
Socket(s):             1
NUMA node(s):          1
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 45
Stepping:              7
CPU MHz:               1200.000
BogoMIPS:              6399.71
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              12288K
NUMA node0 CPU(s):     0-11

*****

ebizzy 0.2 
(C) 2006-7 Intel Corporation
(C) 2007 Valerie Henson <val@nmt.edu>
always_mmap 0
never_mmap 0
chunks 10
prevent coalescing using permissions 0
prevent coalescing using holes 0
random_size 0
chunk_size 5242880
seconds 10
threads 24
verbose 1
linear 0
touch_pages 0
page size 4096
Allocated memory
Wrote memory
Threads starting
Threads finished

******

jemalloc git head
commit 65db63cf3f0c5dd5126a1b3786756486eaf931ba
Author: Jason Evans <je@fb.com>
Date:   Wed Mar 25 18:56:55 2015 -0700

    Fix in-place shrinking huge reallocation purging bugs.


******
1) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.dontneed" strace -c -f ./ebizzy -s $((5<<20))

1814 records/s
real 10.00 s
user 28.18 s
sys  90.08 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 90.78   99.368420        5469     18171           madvise
  9.14   10.001131    10001131         1           nanosleep
  0.05    0.050037         807        62        10 futex
  0.03    0.031721         291       109           mmap
  0.00    0.004455         178        25           set_robust_list
  0.00    0.000129           5        24           clone
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0        35           munmap
  0.00    0.000000           0         2           brk
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
------ ----------- ----------- --------- --------- ----------------
100.00  109.455893                 18501        14 total

2) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.dontneed" MALLOC_CONF=lg_dirty_mult:-1 strace -c -f ./ebizzy -s $((5<<20))

4925 records/s
real 10.00 s
user 119.83 s
sys   0.16 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 82.73    0.821804       15804        52         6 futex
 15.70    0.156000      156000         1           nanosleep
  1.53    0.015186         115       132           mmap
  0.04    0.000349           4        87           munmap
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0         2           brk
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0        24           madvise
  0.00    0.000000           0        24           clone
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0        25           set_robust_list
------ ----------- ----------- --------- --------- ----------------
100.00    0.993339                   419        10 total

3) LD_PRELOAD="/jemalloc/lib/libjemalloc.so.free" strace -c -f ./ebizzy -s $((5<<20))

4338 records/s
real 10.00 s
user 91.40 s
sys  12.58 s
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 78.39   36.433483         839     43408           madvise
 21.53   10.004889    10004889         1           nanosleep
  0.04    0.020472         394        52        15 futex
  0.03    0.015464         145       107           mmap
  0.00    0.000041           2        24           clone
  0.00    0.000000           0         4           read
  0.00    0.000000           0         1           write
  0.00    0.000000           0         6           open
  0.00    0.000000           0         6           close
  0.00    0.000000           0         6           fstat
  0.00    0.000000           0        32           mprotect
  0.00    0.000000           0        33           munmap
  0.00    0.000000           0         2           brk 
  0.00    0.000000           0         3           rt_sigaction
  0.00    0.000000           0         3           rt_sigprocmask
  0.00    0.000000           0         4         3 access
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 readlink
  0.00    0.000000           0         1           getrlimit
  0.00    0.000000           0         2           getrusage
  0.00    0.000000           0         1           arch_prctl
  0.00    0.000000           0         1           set_tid_address
  0.00    0.000000           0        25           set_robust_list
------ ----------- ----------- --------- --------- ----------------
100.00   46.474349                 43724        19 total

-- 
Kind regards,
Minchan Kim


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26  3:24                                   ` Daniel Micay
@ 2015-03-26 17:25                                       ` Vlastimil Babka
  -1 siblings, 0 replies; 44+ messages in thread
From: Vlastimil Babka @ 2015-03-26 17:25 UTC (permalink / raw)
  To: Daniel Micay, David Rientjes
  Cc: Aliaksey Kandratsenka, Andrew Morton, Shaohua Li,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Rik van Riel, Hugh Dickins,
	Mel Gorman, Johannes Weiner, Michal Hocko, Andy Lutomirski,
	google-perftools-/JYPxA39Uh5TLH3MbocFFw

On 03/26/2015 04:24 AM, Daniel Micay wrote:
> It's all well and good to say that you shouldn't do that, but it's the
> basis of the design in jemalloc and other zone-based arena allocators.
>
> There's a chosen chunk size and chunks are naturally aligned. An
> allocation is either a span of chunks (chunk-aligned) or has metadata
> stored in the chunk header. This also means chunks can be assigned to
> arenas for a high level of concurrency. Thread caching is then only
> necessary for batching operations to amortize the cost of locking rather
> than to reduce contention. Per-CPU arenas can be implemented quite well
> by using sched_getcpu() to move threads around whenever it detects that
> another thread allocated from the arena.
>
> With >= 2M chunks, madvise purging works very well at the chunk level
> but there's also fine-grained purging within chunks and it completely
> breaks down from THP page faults.
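
(For context, the chunk lookup that this design relies on is plain pointer
masking; an illustrative sketch with made-up names and a hypothetical chunk
size, not jemalloc's actual definitions:)

#include <stdint.h>

#define CHUNK_SIZE	(4UL << 20)		/* hypothetical chunk size */
#define CHUNK_MASK	(CHUNK_SIZE - 1)

struct chunk_header {
	unsigned int arena_index;		/* illustrative per-chunk metadata */
};

/* Chunks are naturally aligned, so masking the low bits of any interior
 * pointer recovers the chunk header stored at the chunk base. */
static inline struct chunk_header *ptr_to_chunk(const void *p)
{
	return (struct chunk_header *)((uintptr_t)p & ~(uintptr_t)CHUNK_MASK);
}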

Are you sure it's due to page faults and not khugepaged + high value 
(such as the default 511) of max_ptes_none? As reported here?

https://bugzilla.kernel.org/show_bug.cgi?id=93111

Once you have faulted in a THP, and then purged part of it and split it, 
I don't think page faults in the purged part can lead to a new THP 
collapse, only khugepaged can do that AFAIK.
And if you mmap smaller than 2M areas (i.e. your 256K chunks), that 
should prevent THP page faults on the first fault within the chunk as well.
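
(If it is khugepaged, the knob in question is
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none. A possible
allocator-side workaround, sketched here without testing, is to opt a chunk
out of THP once it has been partially purged; MADV_NOHUGEPAGE stops both
THP page faults and khugepaged collapse for the marked range:)

#include <stddef.h>
#include <sys/mman.h>

/* Hypothetical hook called after fine-grained purging inside a chunk. */
static void exclude_chunk_from_thp(void *chunk, size_t chunk_size)
{
	madvise(chunk, chunk_size, MADV_NOHUGEPAGE);
}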

^ permalink raw reply	[flat|nested] 44+ messages in thread


* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
  2015-03-26 17:25                                       ` Vlastimil Babka
  (?)
@ 2015-03-26 20:45                                       ` Daniel Micay
  -1 siblings, 0 replies; 44+ messages in thread
From: Daniel Micay @ 2015-03-26 20:45 UTC (permalink / raw)
  To: Vlastimil Babka, David Rientjes
  Cc: Aliaksey Kandratsenka, Andrew Morton, Shaohua Li, linux-mm,
	linux-api, Rik van Riel, Hugh Dickins, Mel Gorman,
	Johannes Weiner, Michal Hocko, Andy Lutomirski, google-perftools

[-- Attachment #1: Type: text/plain, Size: 1025 bytes --]

> Are you sure it's due to page faults and not khugepaged + high value
> (such as the default 511) of max_ptes_none? As reported here?
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=93111
> 
> Once you have faulted in a THP, and then purged part of it and split it,
> I don't think page faults in the purged part can lead to a new THP
> collapse, only khugepaged can do that AFAIK.
> And if you mmap smaller than 2M areas (i.e. your 256K chunks), that
> should prevent THP page faults on the first fault within the chunk as well.

Hm, that's probably it. The page faults would still be an issue when
reserving ranges on 64-bit for parallel chunk allocation and for making
sure the lowest-address chunks are the oldest from the start, which is
likely to happen down the road.

A nice property of 2M chunks is that mremap doesn't need to split huge
pages and neither does purging at the chunk level. I'd expect that to be
a *good thing* rather than something that needs to be avoided due to an
aggressive heuristic.
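
(Roughly what I mean by reserving ranges -- an untested sketch with
hypothetical sizes, not tied to jemalloc's actual code: reserve a large
region up front with no memory committed, then commit 2M chunks on demand.)

#include <stddef.h>
#include <sys/mman.h>

#define CHUNK_SIZE	(2UL << 20)

/* Reserve address space only: PROT_NONE + MAP_NORESERVE commits nothing.
 * Real code would over-allocate and trim to get chunk alignment. */
static void *reserve_region(size_t nchunks)
{
	return mmap(NULL, nchunks * CHUNK_SIZE, PROT_NONE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
}

/* Commit one chunk; the first touch afterwards still page-faults,
 * which is the cost mentioned above. */
static int commit_chunk(void *chunk)
{
	return mprotect(chunk, CHUNK_SIZE, PROT_READ | PROT_WRITE);
}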


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2015-03-26 20:45 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-17 21:09 [PATCH] mremap: add MREMAP_NOHOLE flag --resend Shaohua Li
2015-03-17 21:09 ` Shaohua Li
     [not found] ` <deaa4139de6e6422a0cec1e3282553aed3495e94.1426626497.git.shli-b10kYP2dOMg@public.gmane.org>
2015-03-18 22:31   ` Andrew Morton
2015-03-18 22:31     ` Andrew Morton
     [not found]     ` <20150318153100.5658b741277f3717b52e42d9-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2015-03-19  5:08       ` Shaohua Li
2015-03-19  5:08         ` Shaohua Li
     [not found]         ` <20150319050826.GA1591708-XA4dbxeItU7BTsLV8vAZyg2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2015-03-19  5:22           ` Andrew Morton
2015-03-19  5:22             ` Andrew Morton
     [not found]             ` <20150318222246.bc608dd0.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2015-03-19 16:38               ` Shaohua Li
2015-03-19 16:38                 ` Shaohua Li
2015-03-19  5:34       ` Daniel Micay
2015-03-19  5:34         ` Daniel Micay
2015-03-22  6:06         ` Aliaksey Kandratsenka
     [not found]           ` <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-03-22  7:22             ` Daniel Micay
2015-03-22  7:22               ` Daniel Micay
     [not found]               ` <550E6D9D.1060507-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-24  4:36                 ` Aliaksey Kandratsenka
2015-03-24  4:36                   ` Aliaksey Kandratsenka
2015-03-24 14:54                   ` Daniel Micay
2015-03-25 16:22                 ` Vlastimil Babka
2015-03-25 16:22                   ` Vlastimil Babka
2015-03-25 20:49                   ` Daniel Micay
2015-03-25 20:54                     ` Daniel Micay
     [not found]                     ` <55131F70.7020503-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26  0:19                       ` David Rientjes
2015-03-26  0:19                         ` David Rientjes
2015-03-26  0:24                         ` Daniel Micay
     [not found]                           ` <551351CA.3090803-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26  2:31                             ` David Rientjes
2015-03-26  2:31                               ` David Rientjes
     [not found]                               ` <alpine.DEB.2.10.1503251914260.16714-X6Q0R45D7oAcqpCFd4KODRPsWskHk0ljAL8bYrjMMd8@public.gmane.org>
2015-03-26  3:24                                 ` Daniel Micay
2015-03-26  3:24                                   ` Daniel Micay
     [not found]                                   ` <55137C06.9020608-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26  3:36                                     ` Daniel Micay
2015-03-26  3:36                                       ` Daniel Micay
2015-03-26 17:25                                     ` Vlastimil Babka
2015-03-26 17:25                                       ` Vlastimil Babka
2015-03-26 20:45                                       ` Daniel Micay
2015-03-23  5:17           ` Shaohua Li
2015-03-24  5:25             ` Aliaksey Kandratsenka
     [not found]               ` <CADpJO7zk8J3q7Bw9NibV9CzLarO+YkfeshyFTTq=XeS5qziBiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-03-24 14:39                 ` Daniel Micay
2015-03-24 14:39                   ` Daniel Micay
2015-03-25  5:02                   ` Shaohua Li
2015-03-26  0:50                   ` Minchan Kim
2015-03-26  1:21                     ` Daniel Micay
2015-03-26  1:21                       ` Daniel Micay
     [not found]                       ` <55135F06.4000906-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-03-26  7:02                         ` Minchan Kim
2015-03-26  7:02                           ` Minchan Kim
