All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: cache largest vma
@ 2013-11-01 20:17 ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-01 20:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Michel Lespinasse, Ingo Molnar, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm

While caching the last used vma already does a nice job avoiding
having to iterate the rbtree in find_vma, we can improve. After
studying the hit rate on a load of workloads and environments,
it was seen that it was around 45-50% - constant for a standard
desktop system (gnome3 + evolution + firefox + a few xterms),
and multiple java related workloads (including Hadoop/terasort),
and aim7, which indicates it's better than the 35% value documented
in the code.

By also caching the largest vma, that is, the one that contains
most addresses, there is a steady 10-15% hit rate gain, putting
it above the 60% region. This improvement comes at a very low
overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
the current logic.

This patch introduces a second mmap_cache pointer, which is just
as racy as the first, but as we already know, doesn't matter in
this context. For documentation purposes, I have also added the
ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
with the reads.

Cc: Hugh Dickins <hughd@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
---
Please note that nommu and unicore32 arch are *untested*.

I also have a patch on top of this one that caches the most 
used vma, which adds another 8-10% hit rate gain, However,
since it does add a counter to the vma structure and we have
to do more logic in find_vma to keep track, I was hesitant about
the overhead. If folks are interested I can send that out as well.


 Documentation/vm/locking                 |  4 +-
 arch/unicore32/include/asm/mmu_context.h |  2 +-
 include/linux/mm.h                       | 13 ++++++
 include/linux/mm_types.h                 | 15 ++++++-
 kernel/debug/debug_core.c                | 17 +++++++-
 kernel/fork.c                            |  2 +-
 mm/mmap.c                                | 68 ++++++++++++++++++++------------
 7 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f61228b..b4e8154 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -42,8 +42,8 @@ The rules are:
    for mm B.
 
 The caveats are:
-1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
-The update of mmap_cache is racy (page stealer can race with other code
+1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
+The updates of mmap_cache is racy (page stealer can race with other code
 that invokes find_vma with mmap_sem held), but that is okay, since it 
 is a hint. This can be fixed, if desired, by having find_vma grab the
 page_table_lock.
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c6..38cc7fc 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -73,7 +73,7 @@ do { \
 		else \
 			mm->mmap = NULL; \
 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
+		vma_clear_caches(mm);			\
 		mm->map_count--; \
 		remove_vma(high_vma); \
 	} \
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55e..2c0f8ed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
 	/* Ignore errors */
 	(void) __mm_populate(addr, len, 1);
 }
+
+static inline void vma_clear_caches(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_VMA_CACHES; i++)
+		mm->mmap_cache[i] = NULL;
+}
 #else
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
+
+static inline void vma_clear_caches(struct mm_struct *mm)
+{
+	mm->mmap_cache = NULL;
+}
 #endif
 
 /* These take the mm semaphore themselves */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851ee..7f92835 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -322,12 +322,23 @@ struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
 
+
+#ifdef CONFIG_MMU
+enum {
+	VMA_LAST_USED, /* last find_vma result */
+	VMA_LARGEST,   /* vma that contains most address */
+	NR_VMA_CACHES
+};
+#endif
+
 struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
-#ifdef CONFIG_MMU
+#ifndef CONFIG_MMU
+	struct vm_area_struct *mmap_cache;      /* last find_vma result */
+#else
+	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d44..d9d72e4 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
  */
 static void kgdb_flush_swbreak_addr(unsigned long addr)
 {
+	struct mm_struct *mm = current->mm;
 	if (!CACHE_FLUSH_IS_SAFE)
 		return;
 
-	if (current->mm && current->mm->mmap_cache) {
-		flush_cache_range(current->mm->mmap_cache,
+#ifdef CONFIG_MMU
+	if (mm) {
+		int i;
+
+		for (i = 0; i < NR_VMA_CACHES; i++)
+			if (mm->mmap_cache[i])
+				flush_cache_range(mm->mmap_cache[i],
+						  addr,
+						  addr + BREAK_INSTR_SIZE);
+	}
+#else
+	if (mm && mm->mmap_cache) {
+		flush_cache_range(mm->mmap_cache,
 				  addr, addr + BREAK_INSTR_SIZE);
 	}
+#endif
 	/* Force flush instruction cache if it was outside the mm */
 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73..7b92666 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
 	mm->map_count = 0;
+	vma_clear_caches(mm);
 	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d54851..29c3fc0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -676,14 +676,17 @@ static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev)
 {
+	int i;
 	struct vm_area_struct *next;
 
 	vma_rb_erase(vma, &mm->mm_rb);
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+
+	for (i = 0; i < NR_VMA_CACHES; i++)
+		if (mm->mmap_cache[i] == vma)
+			mm->mmap_cache[i] = prev;
 }
 
 /*
@@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
+	unsigned long currlen = 0;
+	struct rb_node *rb_node;
 	struct vm_area_struct *vma = NULL;
 
-	/* Check the cache first. */
-	/* (Cache hit rate is typically around 35%.) */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-		struct rb_node *rb_node;
+	/* Check the cache first */
+	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
+	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
+		goto ret;
 
-		rb_node = mm->mm_rb.rb_node;
-		vma = NULL;
+	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
+	if (vma) {
+		if (vma->vm_end > addr && vma->vm_start <= addr)
+			goto ret;
+		currlen = vma->vm_end - vma->vm_start;
+	}
 
-		while (rb_node) {
-			struct vm_area_struct *vma_tmp;
-
-			vma_tmp = rb_entry(rb_node,
-					   struct vm_area_struct, vm_rb);
-
-			if (vma_tmp->vm_end > addr) {
-				vma = vma_tmp;
-				if (vma_tmp->vm_start <= addr)
-					break;
-				rb_node = rb_node->rb_left;
-			} else
-				rb_node = rb_node->rb_right;
-		}
-		if (vma)
-			mm->mmap_cache = vma;
+	/* Bad cache! iterate rbtree */
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
+
+	while (rb_node) {
+		struct vm_area_struct *vma_tmp;
+
+		vma_tmp = rb_entry(rb_node,
+				   struct vm_area_struct, vm_rb);
+
+		if (vma_tmp->vm_end > addr) {
+			vma = vma_tmp;
+			if (vma_tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
+	}
+
+	if (vma) {
+		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
+		if (vma->vm_end - vma->vm_start > currlen)
+			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;
 	}
+ret:
 	return vma;
 }
 
@@ -2371,7 +2387,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+	vma_clear_caches(mm);
 }
 
 /*
-- 
1.8.1.4




^ permalink raw reply related	[flat|nested] 76+ messages in thread

* [PATCH] mm: cache largest vma
@ 2013-11-01 20:17 ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-01 20:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Michel Lespinasse, Ingo Molnar, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm

While caching the last used vma already does a nice job avoiding
having to iterate the rbtree in find_vma, we can improve. After
studying the hit rate on a load of workloads and environments,
it was seen that it was around 45-50% - constant for a standard
desktop system (gnome3 + evolution + firefox + a few xterms),
and multiple java related workloads (including Hadoop/terasort),
and aim7, which indicates it's better than the 35% value documented
in the code.

By also caching the largest vma, that is, the one that contains
most addresses, there is a steady 10-15% hit rate gain, putting
it above the 60% region. This improvement comes at a very low
overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
the current logic.

This patch introduces a second mmap_cache pointer, which is just
as racy as the first, but as we already know, doesn't matter in
this context. For documentation purposes, I have also added the
ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
with the reads.

Cc: Hugh Dickins <hughd@google.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
---
Please note that nommu and unicore32 arch are *untested*.

I also have a patch on top of this one that caches the most 
used vma, which adds another 8-10% hit rate gain, However,
since it does add a counter to the vma structure and we have
to do more logic in find_vma to keep track, I was hesitant about
the overhead. If folks are interested I can send that out as well.


 Documentation/vm/locking                 |  4 +-
 arch/unicore32/include/asm/mmu_context.h |  2 +-
 include/linux/mm.h                       | 13 ++++++
 include/linux/mm_types.h                 | 15 ++++++-
 kernel/debug/debug_core.c                | 17 +++++++-
 kernel/fork.c                            |  2 +-
 mm/mmap.c                                | 68 ++++++++++++++++++++------------
 7 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/Documentation/vm/locking b/Documentation/vm/locking
index f61228b..b4e8154 100644
--- a/Documentation/vm/locking
+++ b/Documentation/vm/locking
@@ -42,8 +42,8 @@ The rules are:
    for mm B.
 
 The caveats are:
-1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
-The update of mmap_cache is racy (page stealer can race with other code
+1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
+The updates of mmap_cache is racy (page stealer can race with other code
 that invokes find_vma with mmap_sem held), but that is okay, since it 
 is a hint. This can be fixed, if desired, by having find_vma grab the
 page_table_lock.
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c6..38cc7fc 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -73,7 +73,7 @@ do { \
 		else \
 			mm->mmap = NULL; \
 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
+		vma_clear_caches(mm);			\
 		mm->map_count--; \
 		remove_vma(high_vma); \
 	} \
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55e..2c0f8ed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
 	/* Ignore errors */
 	(void) __mm_populate(addr, len, 1);
 }
+
+static inline void vma_clear_caches(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_VMA_CACHES; i++)
+		mm->mmap_cache[i] = NULL;
+}
 #else
 static inline void mm_populate(unsigned long addr, unsigned long len) {}
+
+static inline void vma_clear_caches(struct mm_struct *mm)
+{
+	mm->mmap_cache = NULL;
+}
 #endif
 
 /* These take the mm semaphore themselves */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d9851ee..7f92835 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -322,12 +322,23 @@ struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
 };
 
+
+#ifdef CONFIG_MMU
+enum {
+	VMA_LAST_USED, /* last find_vma result */
+	VMA_LARGEST,   /* vma that contains most address */
+	NR_VMA_CACHES
+};
+#endif
+
 struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
-#ifdef CONFIG_MMU
+#ifndef CONFIG_MMU
+	struct vm_area_struct *mmap_cache;      /* last find_vma result */
+#else
+	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d44..d9d72e4 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
  */
 static void kgdb_flush_swbreak_addr(unsigned long addr)
 {
+	struct mm_struct *mm = current->mm;
 	if (!CACHE_FLUSH_IS_SAFE)
 		return;
 
-	if (current->mm && current->mm->mmap_cache) {
-		flush_cache_range(current->mm->mmap_cache,
+#ifdef CONFIG_MMU
+	if (mm) {
+		int i;
+
+		for (i = 0; i < NR_VMA_CACHES; i++)
+			if (mm->mmap_cache[i])
+				flush_cache_range(mm->mmap_cache[i],
+						  addr,
+						  addr + BREAK_INSTR_SIZE);
+	}
+#else
+	if (mm && mm->mmap_cache) {
+		flush_cache_range(mm->mmap_cache,
 				  addr, addr + BREAK_INSTR_SIZE);
 	}
+#endif
 	/* Force flush instruction cache if it was outside the mm */
 	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73..7b92666 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
 	mm->map_count = 0;
+	vma_clear_caches(mm);
 	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d54851..29c3fc0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -676,14 +676,17 @@ static inline void
 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev)
 {
+	int i;
 	struct vm_area_struct *next;
 
 	vma_rb_erase(vma, &mm->mm_rb);
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+
+	for (i = 0; i < NR_VMA_CACHES; i++)
+		if (mm->mmap_cache[i] == vma)
+			mm->mmap_cache[i] = prev;
 }
 
 /*
@@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
+	unsigned long currlen = 0;
+	struct rb_node *rb_node;
 	struct vm_area_struct *vma = NULL;
 
-	/* Check the cache first. */
-	/* (Cache hit rate is typically around 35%.) */
-	vma = ACCESS_ONCE(mm->mmap_cache);
-	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-		struct rb_node *rb_node;
+	/* Check the cache first */
+	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
+	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
+		goto ret;
 
-		rb_node = mm->mm_rb.rb_node;
-		vma = NULL;
+	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
+	if (vma) {
+		if (vma->vm_end > addr && vma->vm_start <= addr)
+			goto ret;
+		currlen = vma->vm_end - vma->vm_start;
+	}
 
-		while (rb_node) {
-			struct vm_area_struct *vma_tmp;
-
-			vma_tmp = rb_entry(rb_node,
-					   struct vm_area_struct, vm_rb);
-
-			if (vma_tmp->vm_end > addr) {
-				vma = vma_tmp;
-				if (vma_tmp->vm_start <= addr)
-					break;
-				rb_node = rb_node->rb_left;
-			} else
-				rb_node = rb_node->rb_right;
-		}
-		if (vma)
-			mm->mmap_cache = vma;
+	/* Bad cache! iterate rbtree */
+	rb_node = mm->mm_rb.rb_node;
+	vma = NULL;
+
+	while (rb_node) {
+		struct vm_area_struct *vma_tmp;
+
+		vma_tmp = rb_entry(rb_node,
+				   struct vm_area_struct, vm_rb);
+
+		if (vma_tmp->vm_end > addr) {
+			vma = vma_tmp;
+			if (vma_tmp->vm_start <= addr)
+				break;
+			rb_node = rb_node->rb_left;
+		} else
+			rb_node = rb_node->rb_right;
+	}
+
+	if (vma) {
+		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
+		if (vma->vm_end - vma->vm_start > currlen)
+			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;
 	}
+ret:
 	return vma;
 }
 
@@ -2371,7 +2387,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else
 		mm->highest_vm_end = prev ? prev->vm_end : 0;
 	tail_vma->vm_next = NULL;
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+	vma_clear_caches(mm);
 }
 
 /*
-- 
1.8.1.4



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 20:17 ` Davidlohr Bueso
@ 2013-11-01 20:38   ` KOSAKI Motohiro
  -1 siblings, 0 replies; 76+ messages in thread
From: KOSAKI Motohiro @ 2013-11-01 20:38 UTC (permalink / raw)
  To: Davidlohr Bueso, Andrew Morton
  Cc: kosaki.motohiro, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm

(11/1/13 4:17 PM), Davidlohr Bueso wrote:
> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
>
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low
> overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> the current logic.

I'm slightly surprised this cache makes 15% hit. Which application
get a benefit? You listed a lot of applications, but I'm not sure
which is highly depending on largest vma.



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-01 20:38   ` KOSAKI Motohiro
  0 siblings, 0 replies; 76+ messages in thread
From: KOSAKI Motohiro @ 2013-11-01 20:38 UTC (permalink / raw)
  To: Davidlohr Bueso, Andrew Morton
  Cc: kosaki.motohiro, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm

(11/1/13 4:17 PM), Davidlohr Bueso wrote:
> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
>
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low
> overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> the current logic.

I'm slightly surprised this cache makes 15% hit. Which application
get a benefit? You listed a lot of applications, but I'm not sure
which is highly depending on largest vma.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 20:38   ` KOSAKI Motohiro
@ 2013-11-01 21:11     ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-01 21:11 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm

On Fri, 2013-11-01 at 16:38 -0400, KOSAKI Motohiro wrote:
> (11/1/13 4:17 PM), Davidlohr Bueso wrote:
> > While caching the last used vma already does a nice job avoiding
> > having to iterate the rbtree in find_vma, we can improve. After
> > studying the hit rate on a load of workloads and environments,
> > it was seen that it was around 45-50% - constant for a standard
> > desktop system (gnome3 + evolution + firefox + a few xterms),
> > and multiple java related workloads (including Hadoop/terasort),
> > and aim7, which indicates it's better than the 35% value documented
> > in the code.
> >
> > By also caching the largest vma, that is, the one that contains
> > most addresses, there is a steady 10-15% hit rate gain, putting
> > it above the 60% region. This improvement comes at a very low
> > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > the current logic.
> 
> I'm slightly surprised this cache makes 15% hit. Which application
> get a benefit? You listed a lot of applications, but I'm not sure
> which is highly depending on largest vma.

Well I chose the largest vma because it gives us a greater chance of
being already cached when we do the lookup for the faulted address.

The 15% improvement was with Hadoop. According to my notes it was at
~48% with the baseline kernel and increased to ~63% with this patch.

In any case I didn't measure the rates on a per-task granularity, but at
a general system level. When a system is first booted I can see that the
mmap_cache access rate becomes the determinant factor and when adding a
workload it doesn't change much. One exception to this was a kernel
build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-01 21:11     ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-01 21:11 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm

On Fri, 2013-11-01 at 16:38 -0400, KOSAKI Motohiro wrote:
> (11/1/13 4:17 PM), Davidlohr Bueso wrote:
> > While caching the last used vma already does a nice job avoiding
> > having to iterate the rbtree in find_vma, we can improve. After
> > studying the hit rate on a load of workloads and environments,
> > it was seen that it was around 45-50% - constant for a standard
> > desktop system (gnome3 + evolution + firefox + a few xterms),
> > and multiple java related workloads (including Hadoop/terasort),
> > and aim7, which indicates it's better than the 35% value documented
> > in the code.
> >
> > By also caching the largest vma, that is, the one that contains
> > most addresses, there is a steady 10-15% hit rate gain, putting
> > it above the 60% region. This improvement comes at a very low
> > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > the current logic.
> 
> I'm slightly surprised this cache makes 15% hit. Which application
> get a benefit? You listed a lot of applications, but I'm not sure
> which is highly depending on largest vma.

Well I chose the largest vma because it gives us a greater chance of
being already cached when we do the lookup for the faulted address.

The 15% improvement was with Hadoop. According to my notes it was at
~48% with the baseline kernel and increased to ~63% with this patch.

In any case I didn't measure the rates on a per-task granularity, but at
a general system level. When a system is first booted I can see that the
mmap_cache access rate becomes the determinant factor and when adding a
workload it doesn't change much. One exception to this was a kernel
build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 20:17 ` Davidlohr Bueso
@ 2013-11-01 21:23   ` Rik van Riel
  -1 siblings, 0 replies; 76+ messages in thread
From: Rik van Riel @ 2013-11-01 21:23 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Guan Xuetao, aswin, linux-kernel, linux-mm

On 11/01/2013 04:17 PM, Davidlohr Bueso wrote:
> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
> 
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low

I suspect this will especially help when also using automatic
numa balancing, which causes periodic page faults.

Acked-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-01 21:23   ` Rik van Riel
  0 siblings, 0 replies; 76+ messages in thread
From: Rik van Riel @ 2013-11-01 21:23 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Guan Xuetao, aswin, linux-kernel, linux-mm

On 11/01/2013 04:17 PM, Davidlohr Bueso wrote:
> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
> 
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low

I suspect this will especially help when also using automatic
numa balancing, which causes periodic page faults.

Acked-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 21:11     ` Davidlohr Bueso
@ 2013-11-03  9:46       ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-03  9:46 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: KOSAKI Motohiro, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Fri, 2013-11-01 at 16:38 -0400, KOSAKI Motohiro wrote:
> > (11/1/13 4:17 PM), Davidlohr Bueso wrote:
> >
> > > While caching the last used vma already does a nice job avoiding 
> > > having to iterate the rbtree in find_vma, we can improve. After 
> > > studying the hit rate on a load of workloads and environments, it 
> > > was seen that it was around 45-50% - constant for a standard desktop 
> > > system (gnome3 + evolution + firefox + a few xterms), and multiple 
> > > java related workloads (including Hadoop/terasort), and aim7, which 
> > > indicates it's better than the 35% value documented in the code.
> > >
> > > By also caching the largest vma, that is, the one that contains most 
> > > addresses, there is a steady 10-15% hit rate gain, putting it above 
> > > the 60% region. This improvement comes at a very low overhead for a 
> > > miss. Furthermore, systems with !CONFIG_MMU keep the current logic.
> > 
> > I'm slightly surprised this cache makes 15% hit. Which application get 
> > a benefit? You listed a lot of applications, but I'm not sure which is 
> > highly depending on largest vma.
> 
> Well I chose the largest vma because it gives us a greater chance of 
> being already cached when we do the lookup for the faulted address.
> 
> The 15% improvement was with Hadoop. According to my notes it was at 
> ~48% with the baseline kernel and increased to ~63% with this patch.
> 
> In any case I didn't measure the rates on a per-task granularity, but at 
> a general system level. When a system is first booted I can see that the 
> mmap_cache access rate becomes the determinant factor and when adding a 
> workload it doesn't change much. One exception to this was a kernel 
> build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

~90% during a kernel build is pretty impressive.

Still the ad-hoc nature of the caching worries me a bit - but I don't have 
any better ideas myself.

[I've Cc:-ed Linus, in case he has any better ideas.]

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-03  9:46       ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-03  9:46 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: KOSAKI Motohiro, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Fri, 2013-11-01 at 16:38 -0400, KOSAKI Motohiro wrote:
> > (11/1/13 4:17 PM), Davidlohr Bueso wrote:
> >
> > > While caching the last used vma already does a nice job avoiding 
> > > having to iterate the rbtree in find_vma, we can improve. After 
> > > studying the hit rate on a load of workloads and environments, it 
> > > was seen that it was around 45-50% - constant for a standard desktop 
> > > system (gnome3 + evolution + firefox + a few xterms), and multiple 
> > > java related workloads (including Hadoop/terasort), and aim7, which 
> > > indicates it's better than the 35% value documented in the code.
> > >
> > > By also caching the largest vma, that is, the one that contains most 
> > > addresses, there is a steady 10-15% hit rate gain, putting it above 
> > > the 60% region. This improvement comes at a very low overhead for a 
> > > miss. Furthermore, systems with !CONFIG_MMU keep the current logic.
> > 
> > I'm slightly surprised this cache makes 15% hit. Which application get 
> > a benefit? You listed a lot of applications, but I'm not sure which is 
> > highly depending on largest vma.
> 
> Well I chose the largest vma because it gives us a greater chance of 
> being already cached when we do the lookup for the faulted address.
> 
> The 15% improvement was with Hadoop. According to my notes it was at 
> ~48% with the baseline kernel and increased to ~63% with this patch.
> 
> In any case I didn't measure the rates on a per-task granularity, but at 
> a general system level. When a system is first booted I can see that the 
> mmap_cache access rate becomes the determinant factor and when adding a 
> workload it doesn't change much. One exception to this was a kernel 
> build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

~90% during a kernel build is pretty impressive.

Still the ad-hoc nature of the caching worries me a bit - but I don't have 
any better ideas myself.

[I've Cc:-ed Linus, in case he has any better ideas.]

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 20:17 ` Davidlohr Bueso
@ 2013-11-03 10:12   ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-03 10:12 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
> 
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low
> overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> the current logic.
> 
> This patch introduces a second mmap_cache pointer, which is just
> as racy as the first, but as we already know, doesn't matter in
> this context. For documentation purposes, I have also added the
> ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> with the reads.
> 
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Michel Lespinasse <walken@google.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> ---
> Please note that nommu and unicore32 arch are *untested*.
> 
> I also have a patch on top of this one that caches the most 
> used vma, which adds another 8-10% hit rate gain, However,
> since it does add a counter to the vma structure and we have
> to do more logic in find_vma to keep track, I was hesitant about
> the overhead. If folks are interested I can send that out as well.

Would be interesting to see.

Btw., roughly how many cycles/instructions do we save by increasing the 
hit rate, in the typical case (for example during a kernel build)?

That would be important to measure, so that we can get a ballpark figure 
for the cost/benefit equation.

>  Documentation/vm/locking                 |  4 +-
>  arch/unicore32/include/asm/mmu_context.h |  2 +-
>  include/linux/mm.h                       | 13 ++++++
>  include/linux/mm_types.h                 | 15 ++++++-
>  kernel/debug/debug_core.c                | 17 +++++++-
>  kernel/fork.c                            |  2 +-
>  mm/mmap.c                                | 68 ++++++++++++++++++++------------
>  7 files changed, 87 insertions(+), 34 deletions(-)
> 
> diff --git a/Documentation/vm/locking b/Documentation/vm/locking
> index f61228b..b4e8154 100644
> --- a/Documentation/vm/locking
> +++ b/Documentation/vm/locking
> @@ -42,8 +42,8 @@ The rules are:
>     for mm B.
>  
>  The caveats are:
> -1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
> -The update of mmap_cache is racy (page stealer can race with other code
> +1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
> +The updates of mmap_cache is racy (page stealer can race with other code
>  that invokes find_vma with mmap_sem held), but that is okay, since it 
>  is a hint. This can be fixed, if desired, by having find_vma grab the
>  page_table_lock.
> diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> index fb5e4c6..38cc7fc 100644
> --- a/arch/unicore32/include/asm/mmu_context.h
> +++ b/arch/unicore32/include/asm/mmu_context.h
> @@ -73,7 +73,7 @@ do { \
>  		else \
>  			mm->mmap = NULL; \
>  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> -		mm->mmap_cache = NULL; \
> +		vma_clear_caches(mm);			\
>  		mm->map_count--; \
>  		remove_vma(high_vma); \
>  	} \
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8b6e55e..2c0f8ed 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
>  	/* Ignore errors */
>  	(void) __mm_populate(addr, len, 1);
>  }
> +
> +static inline void vma_clear_caches(struct mm_struct *mm)
> +{
> +	int i;
> +
> +	for (i = 0; i < NR_VMA_CACHES; i++)
> +		mm->mmap_cache[i] = NULL;

Just curious: does GCC manage to open-code this as two stores of NULL?

> +}
>  #else
>  static inline void mm_populate(unsigned long addr, unsigned long len) {}
> +
> +static inline void vma_clear_caches(struct mm_struct *mm)
1> +{
> +	mm->mmap_cache = NULL;
> +}
>  #endif
>  
>  /* These take the mm semaphore themselves */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d9851ee..7f92835 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -322,12 +322,23 @@ struct mm_rss_stat {
>  	atomic_long_t count[NR_MM_COUNTERS];
>  };
>  
> +
> +#ifdef CONFIG_MMU
> +enum {
> +	VMA_LAST_USED, /* last find_vma result */
> +	VMA_LARGEST,   /* vma that contains most address */
> +	NR_VMA_CACHES
> +};
> +#endif
> +
>  struct kioctx_table;
>  struct mm_struct {
>  	struct vm_area_struct * mmap;		/* list of VMAs */
>  	struct rb_root mm_rb;
> -	struct vm_area_struct * mmap_cache;	/* last find_vma result */
> -#ifdef CONFIG_MMU
> +#ifndef CONFIG_MMU
> +	struct vm_area_struct *mmap_cache;      /* last find_vma result */
> +#else
> +	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];

I think the CONFIG_MMU assymetry in the data structure is rather ugly.

Why not make it a single-entry enum in the !CONFIG_MMU case? To the 
compiler a single-entry array should be the same as a pointer field.

That would eliminate most of the related #ifdefs AFAICS.

>  	unsigned long (*get_unmapped_area) (struct file *filp,
>  				unsigned long addr, unsigned long len,
>  				unsigned long pgoff, unsigned long flags);
> diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
> index 0506d44..d9d72e4 100644
> --- a/kernel/debug/debug_core.c
> +++ b/kernel/debug/debug_core.c
> @@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
>   */
>  static void kgdb_flush_swbreak_addr(unsigned long addr)
>  {
> +	struct mm_struct *mm = current->mm;
>  	if (!CACHE_FLUSH_IS_SAFE)
>  		return;
>  
> -	if (current->mm && current->mm->mmap_cache) {
> -		flush_cache_range(current->mm->mmap_cache,
> +#ifdef CONFIG_MMU
> +	if (mm) {
> +		int i;
> +
> +		for (i = 0; i < NR_VMA_CACHES; i++)
> +			if (mm->mmap_cache[i])
> +				flush_cache_range(mm->mmap_cache[i],
> +						  addr,
> +						  addr + BREAK_INSTR_SIZE);

(Nit: please use curly braces for for such multi-line statements.)

> +	}
> +#else
> +	if (mm && mm->mmap_cache) {
> +		flush_cache_range(mm->mmap_cache,
>  				  addr, addr + BREAK_INSTR_SIZE);
>  	}
> +#endif

Btw., this #ifdef would be unified with my suggested data structure 
variant as well.

>  	/* Force flush instruction cache if it was outside the mm */
>  	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
>  }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 086fe73..7b92666 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  
>  	mm->locked_vm = 0;
>  	mm->mmap = NULL;
> -	mm->mmap_cache = NULL;
>  	mm->map_count = 0;
> +	vma_clear_caches(mm);
>  	cpumask_clear(mm_cpumask(mm));
>  	mm->mm_rb = RB_ROOT;
>  	rb_link = &mm->mm_rb.rb_node;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 9d54851..29c3fc0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -676,14 +676,17 @@ static inline void
>  __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev)
>  {
> +	int i;
>  	struct vm_area_struct *next;
>  
>  	vma_rb_erase(vma, &mm->mm_rb);
>  	prev->vm_next = next = vma->vm_next;
>  	if (next)
>  		next->vm_prev = prev;
> -	if (mm->mmap_cache == vma)
> -		mm->mmap_cache = prev;
> +
> +	for (i = 0; i < NR_VMA_CACHES; i++)
> +		if (mm->mmap_cache[i] == vma)
> +			mm->mmap_cache[i] = prev;

(Nit: missing curly braces.)

Also, I don't think setting the cache value back to 'prev' is valid in the 
VMA_LARGEST case. The likelihood that it's the second largest VMA is 
remote.

The right action here would be to set it to NULL.

For VMA_LAST_USED setting it to 'prev' seems justified.

>  }
>  
>  /*
> @@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
>  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
>  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  {
> +	unsigned long currlen = 0;

(Nit: I don't think 'currlen' really explains the role of the variable. 
'max_len' would be better?)

> +	struct rb_node *rb_node;
>  	struct vm_area_struct *vma = NULL;
>  
> -	/* Check the cache first. */
> -	/* (Cache hit rate is typically around 35%.) */
> -	vma = ACCESS_ONCE(mm->mmap_cache);
> -	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
> -		struct rb_node *rb_node;
> +	/* Check the cache first */
> +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
> +	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
> +		goto ret;
>  
> -		rb_node = mm->mm_rb.rb_node;
> -		vma = NULL;
> +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
> +	if (vma) {
> +		if (vma->vm_end > addr && vma->vm_start <= addr)
> +			goto ret;
> +		currlen = vma->vm_end - vma->vm_start;
> +	}
>  
> -		while (rb_node) {
> -			struct vm_area_struct *vma_tmp;
> -
> -			vma_tmp = rb_entry(rb_node,
> -					   struct vm_area_struct, vm_rb);
> -
> -			if (vma_tmp->vm_end > addr) {
> -				vma = vma_tmp;
> -				if (vma_tmp->vm_start <= addr)
> -					break;
> -				rb_node = rb_node->rb_left;
> -			} else
> -				rb_node = rb_node->rb_right;
> -		}
> -		if (vma)
> -			mm->mmap_cache = vma;
> +	/* Bad cache! iterate rbtree */

(Nit: the cache is not 'bad', we just didn't hit it.)

> +	rb_node = mm->mm_rb.rb_node;
> +	vma = NULL;
> +
> +	while (rb_node) {
> +		struct vm_area_struct *vma_tmp;
> +
> +		vma_tmp = rb_entry(rb_node,
> +				   struct vm_area_struct, vm_rb);

(Nit: in such cases a single, slightly-longer-than-80col line is IMHO a 
better solution than such an artificial line-break.)

> +
> +		if (vma_tmp->vm_end > addr) {
> +			vma = vma_tmp;
> +			if (vma_tmp->vm_start <= addr)
> +				break;
> +			rb_node = rb_node->rb_left;
> +		} else
> +			rb_node = rb_node->rb_right;

(Nit: unbalanced curly braces.)

> +	}
> +
> +	if (vma) {
> +		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
> +		if (vma->vm_end - vma->vm_start > currlen)
> +			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;

Would it make sense to not update VMA_LAST_USED if VMA_LARGEST is set?

This would have the advantage of increasing the cache size to two, for the 
common case where there's two vmas used most of the time.

To maximize the hit rate in the general case what we basically want to 
have is an LRU cache, weighted by vma size.

Maybe by expressing it all in that fashion and looking at the hit rate at 
1, 2, 3 and 4 entries would give us equivalent (or better!) behavior than 
your open-coded variant, with a better idea about how to size it 
precisely.

Note that that approach would get rid of the VMA_LAST_USED/VMA_LARGEST 
distinction in a natural fashion.

Obviously, if the LRU logic gets too complex then it probably won't bring 
us any benefits compared to a primitive front-entry cache, so all this is 
a delicate balance ... hence my previous question about 
cycles/instructions saved by hitting the cache.

>  	}
> +ret:
>  	return vma;
>  }
>  
> @@ -2371,7 +2387,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
>  	} else
>  		mm->highest_vm_end = prev ? prev->vm_end : 0;
>  	tail_vma->vm_next = NULL;
> -	mm->mmap_cache = NULL;		/* Kill the cache. */
> +	vma_clear_caches(mm);

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-03 10:12   ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-03 10:12 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> While caching the last used vma already does a nice job avoiding
> having to iterate the rbtree in find_vma, we can improve. After
> studying the hit rate on a load of workloads and environments,
> it was seen that it was around 45-50% - constant for a standard
> desktop system (gnome3 + evolution + firefox + a few xterms),
> and multiple java related workloads (including Hadoop/terasort),
> and aim7, which indicates it's better than the 35% value documented
> in the code.
> 
> By also caching the largest vma, that is, the one that contains
> most addresses, there is a steady 10-15% hit rate gain, putting
> it above the 60% region. This improvement comes at a very low
> overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> the current logic.
> 
> This patch introduces a second mmap_cache pointer, which is just
> as racy as the first, but as we already know, doesn't matter in
> this context. For documentation purposes, I have also added the
> ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> with the reads.
> 
> Cc: Hugh Dickins <hughd@google.com>
> Cc: Michel Lespinasse <walken@google.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Mel Gorman <mgorman@suse.de>
> Cc: Rik van Riel <riel@redhat.com>
> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> ---
> Please note that nommu and unicore32 arch are *untested*.
> 
> I also have a patch on top of this one that caches the most 
> used vma, which adds another 8-10% hit rate gain, However,
> since it does add a counter to the vma structure and we have
> to do more logic in find_vma to keep track, I was hesitant about
> the overhead. If folks are interested I can send that out as well.

Would be interesting to see.

Btw., roughly how many cycles/instructions do we save by increasing the 
hit rate, in the typical case (for example during a kernel build)?

That would be important to measure, so that we can get a ballpark figure 
for the cost/benefit equation.

>  Documentation/vm/locking                 |  4 +-
>  arch/unicore32/include/asm/mmu_context.h |  2 +-
>  include/linux/mm.h                       | 13 ++++++
>  include/linux/mm_types.h                 | 15 ++++++-
>  kernel/debug/debug_core.c                | 17 +++++++-
>  kernel/fork.c                            |  2 +-
>  mm/mmap.c                                | 68 ++++++++++++++++++++------------
>  7 files changed, 87 insertions(+), 34 deletions(-)
> 
> diff --git a/Documentation/vm/locking b/Documentation/vm/locking
> index f61228b..b4e8154 100644
> --- a/Documentation/vm/locking
> +++ b/Documentation/vm/locking
> @@ -42,8 +42,8 @@ The rules are:
>     for mm B.
>  
>  The caveats are:
> -1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
> -The update of mmap_cache is racy (page stealer can race with other code
> +1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
> +The updates of mmap_cache is racy (page stealer can race with other code
>  that invokes find_vma with mmap_sem held), but that is okay, since it 
>  is a hint. This can be fixed, if desired, by having find_vma grab the
>  page_table_lock.
> diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> index fb5e4c6..38cc7fc 100644
> --- a/arch/unicore32/include/asm/mmu_context.h
> +++ b/arch/unicore32/include/asm/mmu_context.h
> @@ -73,7 +73,7 @@ do { \
>  		else \
>  			mm->mmap = NULL; \
>  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> -		mm->mmap_cache = NULL; \
> +		vma_clear_caches(mm);			\
>  		mm->map_count--; \
>  		remove_vma(high_vma); \
>  	} \
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8b6e55e..2c0f8ed 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
>  	/* Ignore errors */
>  	(void) __mm_populate(addr, len, 1);
>  }
> +
> +static inline void vma_clear_caches(struct mm_struct *mm)
> +{
> +	int i;
> +
> +	for (i = 0; i < NR_VMA_CACHES; i++)
> +		mm->mmap_cache[i] = NULL;

Just curious: does GCC manage to open-code this as two stores of NULL?

> +}
>  #else
>  static inline void mm_populate(unsigned long addr, unsigned long len) {}
> +
> +static inline void vma_clear_caches(struct mm_struct *mm)
1> +{
> +	mm->mmap_cache = NULL;
> +}
>  #endif
>  
>  /* These take the mm semaphore themselves */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d9851ee..7f92835 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -322,12 +322,23 @@ struct mm_rss_stat {
>  	atomic_long_t count[NR_MM_COUNTERS];
>  };
>  
> +
> +#ifdef CONFIG_MMU
> +enum {
> +	VMA_LAST_USED, /* last find_vma result */
> +	VMA_LARGEST,   /* vma that contains most address */
> +	NR_VMA_CACHES
> +};
> +#endif
> +
>  struct kioctx_table;
>  struct mm_struct {
>  	struct vm_area_struct * mmap;		/* list of VMAs */
>  	struct rb_root mm_rb;
> -	struct vm_area_struct * mmap_cache;	/* last find_vma result */
> -#ifdef CONFIG_MMU
> +#ifndef CONFIG_MMU
> +	struct vm_area_struct *mmap_cache;      /* last find_vma result */
> +#else
> +	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];

I think the CONFIG_MMU assymetry in the data structure is rather ugly.

Why not make it a single-entry enum in the !CONFIG_MMU case? To the 
compiler a single-entry array should be the same as a pointer field.

That would eliminate most of the related #ifdefs AFAICS.

>  	unsigned long (*get_unmapped_area) (struct file *filp,
>  				unsigned long addr, unsigned long len,
>  				unsigned long pgoff, unsigned long flags);
> diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
> index 0506d44..d9d72e4 100644
> --- a/kernel/debug/debug_core.c
> +++ b/kernel/debug/debug_core.c
> @@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
>   */
>  static void kgdb_flush_swbreak_addr(unsigned long addr)
>  {
> +	struct mm_struct *mm = current->mm;
>  	if (!CACHE_FLUSH_IS_SAFE)
>  		return;
>  
> -	if (current->mm && current->mm->mmap_cache) {
> -		flush_cache_range(current->mm->mmap_cache,
> +#ifdef CONFIG_MMU
> +	if (mm) {
> +		int i;
> +
> +		for (i = 0; i < NR_VMA_CACHES; i++)
> +			if (mm->mmap_cache[i])
> +				flush_cache_range(mm->mmap_cache[i],
> +						  addr,
> +						  addr + BREAK_INSTR_SIZE);

(Nit: please use curly braces for for such multi-line statements.)

> +	}
> +#else
> +	if (mm && mm->mmap_cache) {
> +		flush_cache_range(mm->mmap_cache,
>  				  addr, addr + BREAK_INSTR_SIZE);
>  	}
> +#endif

Btw., this #ifdef would be unified with my suggested data structure 
variant as well.

>  	/* Force flush instruction cache if it was outside the mm */
>  	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
>  }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 086fe73..7b92666 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  
>  	mm->locked_vm = 0;
>  	mm->mmap = NULL;
> -	mm->mmap_cache = NULL;
>  	mm->map_count = 0;
> +	vma_clear_caches(mm);
>  	cpumask_clear(mm_cpumask(mm));
>  	mm->mm_rb = RB_ROOT;
>  	rb_link = &mm->mm_rb.rb_node;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 9d54851..29c3fc0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -676,14 +676,17 @@ static inline void
>  __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev)
>  {
> +	int i;
>  	struct vm_area_struct *next;
>  
>  	vma_rb_erase(vma, &mm->mm_rb);
>  	prev->vm_next = next = vma->vm_next;
>  	if (next)
>  		next->vm_prev = prev;
> -	if (mm->mmap_cache == vma)
> -		mm->mmap_cache = prev;
> +
> +	for (i = 0; i < NR_VMA_CACHES; i++)
> +		if (mm->mmap_cache[i] == vma)
> +			mm->mmap_cache[i] = prev;

(Nit: missing curly braces.)

Also, I don't think setting the cache value back to 'prev' is valid in the 
VMA_LARGEST case. The likelihood that it's the second largest VMA is 
remote.

The right action here would be to set it to NULL.

For VMA_LAST_USED setting it to 'prev' seems justified.

>  }
>  
>  /*
> @@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
>  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
>  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  {
> +	unsigned long currlen = 0;

(Nit: I don't think 'currlen' really explains the role of the variable. 
'max_len' would be better?)

> +	struct rb_node *rb_node;
>  	struct vm_area_struct *vma = NULL;
>  
> -	/* Check the cache first. */
> -	/* (Cache hit rate is typically around 35%.) */
> -	vma = ACCESS_ONCE(mm->mmap_cache);
> -	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
> -		struct rb_node *rb_node;
> +	/* Check the cache first */
> +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
> +	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
> +		goto ret;
>  
> -		rb_node = mm->mm_rb.rb_node;
> -		vma = NULL;
> +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
> +	if (vma) {
> +		if (vma->vm_end > addr && vma->vm_start <= addr)
> +			goto ret;
> +		currlen = vma->vm_end - vma->vm_start;
> +	}
>  
> -		while (rb_node) {
> -			struct vm_area_struct *vma_tmp;
> -
> -			vma_tmp = rb_entry(rb_node,
> -					   struct vm_area_struct, vm_rb);
> -
> -			if (vma_tmp->vm_end > addr) {
> -				vma = vma_tmp;
> -				if (vma_tmp->vm_start <= addr)
> -					break;
> -				rb_node = rb_node->rb_left;
> -			} else
> -				rb_node = rb_node->rb_right;
> -		}
> -		if (vma)
> -			mm->mmap_cache = vma;
> +	/* Bad cache! iterate rbtree */

(Nit: the cache is not 'bad', we just didn't hit it.)

> +	rb_node = mm->mm_rb.rb_node;
> +	vma = NULL;
> +
> +	while (rb_node) {
> +		struct vm_area_struct *vma_tmp;
> +
> +		vma_tmp = rb_entry(rb_node,
> +				   struct vm_area_struct, vm_rb);

(Nit: in such cases a single, slightly-longer-than-80col line is IMHO a 
better solution than such an artificial line-break.)

> +
> +		if (vma_tmp->vm_end > addr) {
> +			vma = vma_tmp;
> +			if (vma_tmp->vm_start <= addr)
> +				break;
> +			rb_node = rb_node->rb_left;
> +		} else
> +			rb_node = rb_node->rb_right;

(Nit: unbalanced curly braces.)

> +	}
> +
> +	if (vma) {
> +		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
> +		if (vma->vm_end - vma->vm_start > currlen)
> +			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;

Would it make sense to not update VMA_LAST_USED if VMA_LARGEST is set?

This would have the advantage of increasing the cache size to two, for the 
common case where there's two vmas used most of the time.

To maximize the hit rate in the general case what we basically want to 
have is an LRU cache, weighted by vma size.

Maybe by expressing it all in that fashion and looking at the hit rate at 
1, 2, 3 and 4 entries would give us equivalent (or better!) behavior than 
your open-coded variant, with a better idea about how to size it 
precisely.

Note that that approach would get rid of the VMA_LAST_USED/VMA_LARGEST 
distinction in a natural fashion.

Obviously, if the LRU logic gets too complex then it probably won't bring 
us any benefits compared to a primitive front-entry cache, so all this is 
a delicate balance ... hence my previous question about 
cycles/instructions saved by hitting the cache.

>  	}
> +ret:
>  	return vma;
>  }
>  
> @@ -2371,7 +2387,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
>  	} else
>  		mm->highest_vm_end = prev ? prev->vm_end : 0;
>  	tail_vma->vm_next = NULL;
> -	mm->mmap_cache = NULL;		/* Kill the cache. */
> +	vma_clear_caches(mm);

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 20:17 ` Davidlohr Bueso
@ 2013-11-03 18:51   ` Linus Torvalds
  -1 siblings, 0 replies; 76+ messages in thread
From: Linus Torvalds @ 2013-11-03 18:51 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

Ugh. This patch makes me angry. It looks way too ad-hoc.

I can well imagine that our current one-entry cache is crap and could
be improved, but this looks too random. Different code for the
CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
MMU it's an array? And the whole "largest" just looks odd. Plus why do
you set LAST_USED if you also set LARGEST?

Did you try just a two- or four-entry pseudo-LRU instead, with a
per-thread index for "last hit"? Or even possibly a small fixed-size
hash table (say "idx = (add >> 10) & 3" or something)?

And what happens for threaded models? Maybe we'd be much better off
making the cache be per-thread, and the flushing of the cache would be
a sequence number that has to match (so "vma_clear_cache()" ends up
just incrementing a 64-bit sequence number in the mm)?

Basically, my complaints boil down to "too random" and "too
specialized", and I can see (and you already comment on) this patch
being grown with even *more* ad-hoc random new cases (LAST, LARGEST,
MOST_USED - what's next?). And while I don't know if we should worry
about the threaded case, I do get the feeling that this ad-hoc
approach is guaranteed to never work for that, which makes me feel
that it's not just ad-hoc, it's also fundamentally limited.

I can see us merging this patch, but I would really like to hear that
we do so because other cleaner approaches don't work well. In
particular, pseudo-LRU tends to be successful (and cheap) for caches.

                 Linus

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-03 18:51   ` Linus Torvalds
  0 siblings, 0 replies; 76+ messages in thread
From: Linus Torvalds @ 2013-11-03 18:51 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

Ugh. This patch makes me angry. It looks way too ad-hoc.

I can well imagine that our current one-entry cache is crap and could
be improved, but this looks too random. Different code for the
CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
MMU it's an array? And the whole "largest" just looks odd. Plus why do
you set LAST_USED if you also set LARGEST?

Did you try just a two- or four-entry pseudo-LRU instead, with a
per-thread index for "last hit"? Or even possibly a small fixed-size
hash table (say "idx = (add >> 10) & 3" or something)?

And what happens for threaded models? Maybe we'd be much better off
making the cache be per-thread, and the flushing of the cache would be
a sequence number that has to match (so "vma_clear_cache()" ends up
just incrementing a 64-bit sequence number in the mm)?

Basically, my complaints boil down to "too random" and "too
specialized", and I can see (and you already comment on) this patch
being grown with even *more* ad-hoc random new cases (LAST, LARGEST,
MOST_USED - what's next?). And while I don't know if we should worry
about the threaded case, I do get the feeling that this ad-hoc
approach is guaranteed to never work for that, which makes me feel
that it's not just ad-hoc, it's also fundamentally limited.

I can see us merging this patch, but I would really like to hear that
we do so because other cleaner approaches don't work well. In
particular, pseudo-LRU tends to be successful (and cheap) for caches.

                 Linus

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-01 21:11     ` Davidlohr Bueso
@ 2013-11-03 23:57       ` KOSAKI Motohiro
  -1 siblings, 0 replies; 76+ messages in thread
From: KOSAKI Motohiro @ 2013-11-03 23:57 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, LKML, linux-mm

>> I'm slightly surprised this cache makes 15% hit. Which application
>> get a benefit? You listed a lot of applications, but I'm not sure
>> which is highly depending on largest vma.
>
> Well I chose the largest vma because it gives us a greater chance of
> being already cached when we do the lookup for the faulted address.
>
> The 15% improvement was with Hadoop. According to my notes it was at
> ~48% with the baseline kernel and increased to ~63% with this patch.
>
> In any case I didn't measure the rates on a per-task granularity, but at
> a general system level. When a system is first booted I can see that the
> mmap_cache access rate becomes the determinant factor and when adding a
> workload it doesn't change much. One exception to this was a kernel
> build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

I looked at this patch a bit. The worth of this is to improve the
cache hit ratio
of heap.

1) For single thread applications, heap is frequently largest mapping
in the process.
2) For java VM, "java -Xms1000m -Xmx1000m HelloWorld" makes following
/proc/<pid>/smaps entry. That said, JVM allocate single heap even if
applications are multi threaded.

c1800000-100000000 rw-p 00000000 00:00 0
Size:            1024000 kB
Rss:                 244 kB
Pss:                 244 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:       244 kB
Referenced:          244 kB
Anonymous:           244 kB
AnonHugePages:         0 kB
Swap:                  0 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB

That's good.

However, we know there is a situation that this patch doesn't work.
glibc makes per thread heap (arena) by default. So, it is not to be
expected works well on glibc multi threaded programs. That's a
slightly big limitation.

Anyway, I haven't observed real performance difference because most
big penalty of find_vma come from taking mmap_sem, not rb-tree search.

Another and additional input are welcome. But I myself haven't convinced
this patch works everywhere.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-03 23:57       ` KOSAKI Motohiro
  0 siblings, 0 replies; 76+ messages in thread
From: KOSAKI Motohiro @ 2013-11-03 23:57 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, LKML, linux-mm

>> I'm slightly surprised this cache makes 15% hit. Which application
>> get a benefit? You listed a lot of applications, but I'm not sure
>> which is highly depending on largest vma.
>
> Well I chose the largest vma because it gives us a greater chance of
> being already cached when we do the lookup for the faulted address.
>
> The 15% improvement was with Hadoop. According to my notes it was at
> ~48% with the baseline kernel and increased to ~63% with this patch.
>
> In any case I didn't measure the rates on a per-task granularity, but at
> a general system level. When a system is first booted I can see that the
> mmap_cache access rate becomes the determinant factor and when adding a
> workload it doesn't change much. One exception to this was a kernel
> build, where we go from ~50% to ~89% hit rate on a vanilla kernel.

I looked at this patch a bit. The worth of this is to improve the
cache hit ratio
of heap.

1) For single thread applications, heap is frequently largest mapping
in the process.
2) For java VM, "java -Xms1000m -Xmx1000m HelloWorld" makes following
/proc/<pid>/smaps entry. That said, JVM allocate single heap even if
applications are multi threaded.

c1800000-100000000 rw-p 00000000 00:00 0
Size:            1024000 kB
Rss:                 244 kB
Pss:                 244 kB
Shared_Clean:          0 kB
Shared_Dirty:          0 kB
Private_Clean:         0 kB
Private_Dirty:       244 kB
Referenced:          244 kB
Anonymous:           244 kB
AnonHugePages:         0 kB
Swap:                  0 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB

That's good.

However, we know there is a situation that this patch doesn't work.
glibc makes per thread heap (arena) by default. So, it is not to be
expected works well on glibc multi threaded programs. That's a
slightly big limitation.

Anyway, I haven't observed real performance difference because most
big penalty of find_vma come from taking mmap_sem, not rb-tree search.

Another and additional input are welcome. But I myself haven't convinced
this patch works everywhere.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-03 18:51   ` Linus Torvalds
@ 2013-11-04  4:04     ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, 2013-11-03 at 10:51 -0800, Linus Torvalds wrote:
> Ugh. This patch makes me angry. It looks way too ad-hoc.
> 
> I can well imagine that our current one-entry cache is crap and could
> be improved, but this looks too random. 

Indeed, my approach is random *because* I wanted to keep things as
simple and low overhead as possible. Caching the largest VMA is probably
as least invasive and as low as overhead as you can get in find_vma().

> Different code for the
> CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
> MMU it's an array? And the whole "largest" just looks odd. Plus why do
> you set LAST_USED if you also set LARGEST?
> 
> Did you try just a two- or four-entry pseudo-LRU instead, with a
> per-thread index for "last hit"? Or even possibly a small fixed-size
> hash table (say "idx = (add >> 10) & 3" or something)?
> 
> And what happens for threaded models? Maybe we'd be much better off
> making the cache be per-thread, and the flushing of the cache would be
> a sequence number that has to match (so "vma_clear_cache()" ends up
> just incrementing a 64-bit sequence number in the mm)?

I will look into doing the vma cache per thread instead of mm (I hadn't
really looked at the problem like this) as well as Ingo's suggestion on
the weighted LRU approach. However, having seen that we can cheaply and
easily reach around ~70% hit rate in a lot of workloads, makes me wonder
how good is good enough?

> Basically, my complaints boil down to "too random" and "too
> specialized", and I can see (and you already comment on) this patch
> being grown with even *more* ad-hoc random new cases (LAST, LARGEST,
> MOST_USED - what's next?). And while I don't know if we should worry
> about the threaded case, I do get the feeling that this ad-hoc
> approach is guaranteed to never work for that, which makes me feel
> that it's not just ad-hoc, it's also fundamentally limited.
> 
> I can see us merging this patch, but I would really like to hear that
> we do so because other cleaner approaches don't work well. In
> particular, pseudo-LRU tends to be successful (and cheap) for caches.

OK, will report back with comparisons, hopefully I'll have a better
picture by then.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  4:04     ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, 2013-11-03 at 10:51 -0800, Linus Torvalds wrote:
> Ugh. This patch makes me angry. It looks way too ad-hoc.
> 
> I can well imagine that our current one-entry cache is crap and could
> be improved, but this looks too random. 

Indeed, my approach is random *because* I wanted to keep things as
simple and low overhead as possible. Caching the largest VMA is probably
as least invasive and as low as overhead as you can get in find_vma().

> Different code for the
> CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
> MMU it's an array? And the whole "largest" just looks odd. Plus why do
> you set LAST_USED if you also set LARGEST?
> 
> Did you try just a two- or four-entry pseudo-LRU instead, with a
> per-thread index for "last hit"? Or even possibly a small fixed-size
> hash table (say "idx = (add >> 10) & 3" or something)?
> 
> And what happens for threaded models? Maybe we'd be much better off
> making the cache be per-thread, and the flushing of the cache would be
> a sequence number that has to match (so "vma_clear_cache()" ends up
> just incrementing a 64-bit sequence number in the mm)?

I will look into doing the vma cache per thread instead of mm (I hadn't
really looked at the problem like this) as well as Ingo's suggestion on
the weighted LRU approach. However, having seen that we can cheaply and
easily reach around ~70% hit rate in a lot of workloads, makes me wonder
how good is good enough?

> Basically, my complaints boil down to "too random" and "too
> specialized", and I can see (and you already comment on) this patch
> being grown with even *more* ad-hoc random new cases (LAST, LARGEST,
> MOST_USED - what's next?). And while I don't know if we should worry
> about the threaded case, I do get the feeling that this ad-hoc
> approach is guaranteed to never work for that, which makes me feel
> that it's not just ad-hoc, it's also fundamentally limited.
> 
> I can see us merging this patch, but I would really like to hear that
> we do so because other cleaner approaches don't work well. In
> particular, pseudo-LRU tends to be successful (and cheap) for caches.

OK, will report back with comparisons, hopefully I'll have a better
picture by then.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-03 10:12   ` Ingo Molnar
@ 2013-11-04  4:20     ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds

On Sun, 2013-11-03 at 11:12 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > While caching the last used vma already does a nice job avoiding
> > having to iterate the rbtree in find_vma, we can improve. After
> > studying the hit rate on a load of workloads and environments,
> > it was seen that it was around 45-50% - constant for a standard
> > desktop system (gnome3 + evolution + firefox + a few xterms),
> > and multiple java related workloads (including Hadoop/terasort),
> > and aim7, which indicates it's better than the 35% value documented
> > in the code.
> > 
> > By also caching the largest vma, that is, the one that contains
> > most addresses, there is a steady 10-15% hit rate gain, putting
> > it above the 60% region. This improvement comes at a very low
> > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > the current logic.
> > 
> > This patch introduces a second mmap_cache pointer, which is just
> > as racy as the first, but as we already know, doesn't matter in
> > this context. For documentation purposes, I have also added the
> > ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> > with the reads.
> > 
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Michel Lespinasse <walken@google.com>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> > Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> > ---
> > Please note that nommu and unicore32 arch are *untested*.
> > 
> > I also have a patch on top of this one that caches the most 
> > used vma, which adds another 8-10% hit rate gain, However,
> > since it does add a counter to the vma structure and we have
> > to do more logic in find_vma to keep track, I was hesitant about
> > the overhead. If folks are interested I can send that out as well.
> 
> Would be interesting to see.
> 
> Btw., roughly how many cycles/instructions do we save by increasing the 
> hit rate, in the typical case (for example during a kernel build)?

Good point. The IPC from perf stat doesn't show any difference with or
without the patch -- note that this is probably the least interesting
one as we already get a really nice hit rate with the single mmap_cache.
I have yet to try it on the other workloads.

> 
> That would be important to measure, so that we can get a ballpark figure 
> for the cost/benefit equation.
> 
> >  Documentation/vm/locking                 |  4 +-
> >  arch/unicore32/include/asm/mmu_context.h |  2 +-
> >  include/linux/mm.h                       | 13 ++++++
> >  include/linux/mm_types.h                 | 15 ++++++-
> >  kernel/debug/debug_core.c                | 17 +++++++-
> >  kernel/fork.c                            |  2 +-
> >  mm/mmap.c                                | 68 ++++++++++++++++++++------------
> >  7 files changed, 87 insertions(+), 34 deletions(-)
> > 
> > diff --git a/Documentation/vm/locking b/Documentation/vm/locking
> > index f61228b..b4e8154 100644
> > --- a/Documentation/vm/locking
> > +++ b/Documentation/vm/locking
> > @@ -42,8 +42,8 @@ The rules are:
> >     for mm B.
> >  
> >  The caveats are:
> > -1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
> > -The update of mmap_cache is racy (page stealer can race with other code
> > +1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
> > +The updates of mmap_cache is racy (page stealer can race with other code
> >  that invokes find_vma with mmap_sem held), but that is okay, since it 
> >  is a hint. This can be fixed, if desired, by having find_vma grab the
> >  page_table_lock.
> > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > index fb5e4c6..38cc7fc 100644
> > --- a/arch/unicore32/include/asm/mmu_context.h
> > +++ b/arch/unicore32/include/asm/mmu_context.h
> > @@ -73,7 +73,7 @@ do { \
> >  		else \
> >  			mm->mmap = NULL; \
> >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > -		mm->mmap_cache = NULL; \
> > +		vma_clear_caches(mm);			\
> >  		mm->map_count--; \
> >  		remove_vma(high_vma); \
> >  	} \
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 8b6e55e..2c0f8ed 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
> >  	/* Ignore errors */
> >  	(void) __mm_populate(addr, len, 1);
> >  }
> > +
> > +static inline void vma_clear_caches(struct mm_struct *mm)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < NR_VMA_CACHES; i++)
> > +		mm->mmap_cache[i] = NULL;
> 
> Just curious: does GCC manage to open-code this as two stores of NULL?
> 
> > +}
> >  #else
> >  static inline void mm_populate(unsigned long addr, unsigned long len) {}
> > +
> > +static inline void vma_clear_caches(struct mm_struct *mm)
> 1> +{
> > +	mm->mmap_cache = NULL;
> > +}
> >  #endif
> >  
> >  /* These take the mm semaphore themselves */
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d9851ee..7f92835 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -322,12 +322,23 @@ struct mm_rss_stat {
> >  	atomic_long_t count[NR_MM_COUNTERS];
> >  };
> >  
> > +
> > +#ifdef CONFIG_MMU
> > +enum {
> > +	VMA_LAST_USED, /* last find_vma result */
> > +	VMA_LARGEST,   /* vma that contains most address */
> > +	NR_VMA_CACHES
> > +};
> > +#endif
> > +
> >  struct kioctx_table;
> >  struct mm_struct {
> >  	struct vm_area_struct * mmap;		/* list of VMAs */
> >  	struct rb_root mm_rb;
> > -	struct vm_area_struct * mmap_cache;	/* last find_vma result */
> > -#ifdef CONFIG_MMU
> > +#ifndef CONFIG_MMU
> > +	struct vm_area_struct *mmap_cache;      /* last find_vma result */
> > +#else
> > +	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];
> 
> I think the CONFIG_MMU assymetry in the data structure is rather ugly.
> 
> Why not make it a single-entry enum in the !CONFIG_MMU case? To the 
> compiler a single-entry array should be the same as a pointer field.
> 
> That would eliminate most of the related #ifdefs AFAICS.

Yes that's a lot better.

> 
> >  	unsigned long (*get_unmapped_area) (struct file *filp,
> >  				unsigned long addr, unsigned long len,
> >  				unsigned long pgoff, unsigned long flags);
> > diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
> > index 0506d44..d9d72e4 100644
> > --- a/kernel/debug/debug_core.c
> > +++ b/kernel/debug/debug_core.c
> > @@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
> >   */
> >  static void kgdb_flush_swbreak_addr(unsigned long addr)
> >  {
> > +	struct mm_struct *mm = current->mm;
> >  	if (!CACHE_FLUSH_IS_SAFE)
> >  		return;
> >  
> > -	if (current->mm && current->mm->mmap_cache) {
> > -		flush_cache_range(current->mm->mmap_cache,
> > +#ifdef CONFIG_MMU
> > +	if (mm) {
> > +		int i;
> > +
> > +		for (i = 0; i < NR_VMA_CACHES; i++)
> > +			if (mm->mmap_cache[i])
> > +				flush_cache_range(mm->mmap_cache[i],
> > +						  addr,
> > +						  addr + BREAK_INSTR_SIZE);
> 
> (Nit: please use curly braces for for such multi-line statements.)
> 
> > +	}
> > +#else
> > +	if (mm && mm->mmap_cache) {
> > +		flush_cache_range(mm->mmap_cache,
> >  				  addr, addr + BREAK_INSTR_SIZE);
> >  	}
> > +#endif
> 
> Btw., this #ifdef would be unified with my suggested data structure 
> variant as well.
> 
> >  	/* Force flush instruction cache if it was outside the mm */
> >  	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
> >  }
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 086fe73..7b92666 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> >  
> >  	mm->locked_vm = 0;
> >  	mm->mmap = NULL;
> > -	mm->mmap_cache = NULL;
> >  	mm->map_count = 0;
> > +	vma_clear_caches(mm);
> >  	cpumask_clear(mm_cpumask(mm));
> >  	mm->mm_rb = RB_ROOT;
> >  	rb_link = &mm->mm_rb.rb_node;
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 9d54851..29c3fc0 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -676,14 +676,17 @@ static inline void
> >  __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		struct vm_area_struct *prev)
> >  {
> > +	int i;
> >  	struct vm_area_struct *next;
> >  
> >  	vma_rb_erase(vma, &mm->mm_rb);
> >  	prev->vm_next = next = vma->vm_next;
> >  	if (next)
> >  		next->vm_prev = prev;
> > -	if (mm->mmap_cache == vma)
> > -		mm->mmap_cache = prev;
> > +
> > +	for (i = 0; i < NR_VMA_CACHES; i++)
> > +		if (mm->mmap_cache[i] == vma)
> > +			mm->mmap_cache[i] = prev;
> 
> (Nit: missing curly braces.)
> 
> Also, I don't think setting the cache value back to 'prev' is valid in the 
> VMA_LARGEST case. The likelihood that it's the second largest VMA is 
> remote.
> 
> The right action here would be to set it to NULL.
> 
> For VMA_LAST_USED setting it to 'prev' seems justified.
> 
> >  }
> >  
> >  /*
> > @@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
> >  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> >  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> >  {
> > +	unsigned long currlen = 0;
> 
> (Nit: I don't think 'currlen' really explains the role of the variable. 
> 'max_len' would be better?)
> 
> > +	struct rb_node *rb_node;
> >  	struct vm_area_struct *vma = NULL;
> >  
> > -	/* Check the cache first. */
> > -	/* (Cache hit rate is typically around 35%.) */
> > -	vma = ACCESS_ONCE(mm->mmap_cache);
> > -	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
> > -		struct rb_node *rb_node;
> > +	/* Check the cache first */
> > +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
> > +	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
> > +		goto ret;
> >  
> > -		rb_node = mm->mm_rb.rb_node;
> > -		vma = NULL;
> > +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
> > +	if (vma) {
> > +		if (vma->vm_end > addr && vma->vm_start <= addr)
> > +			goto ret;
> > +		currlen = vma->vm_end - vma->vm_start;
> > +	}
> >  
> > -		while (rb_node) {
> > -			struct vm_area_struct *vma_tmp;
> > -
> > -			vma_tmp = rb_entry(rb_node,
> > -					   struct vm_area_struct, vm_rb);
> > -
> > -			if (vma_tmp->vm_end > addr) {
> > -				vma = vma_tmp;
> > -				if (vma_tmp->vm_start <= addr)
> > -					break;
> > -				rb_node = rb_node->rb_left;
> > -			} else
> > -				rb_node = rb_node->rb_right;
> > -		}
> > -		if (vma)
> > -			mm->mmap_cache = vma;
> > +	/* Bad cache! iterate rbtree */
> 
> (Nit: the cache is not 'bad', we just didn't hit it.)
> 
> > +	rb_node = mm->mm_rb.rb_node;
> > +	vma = NULL;
> > +
> > +	while (rb_node) {
> > +		struct vm_area_struct *vma_tmp;
> > +
> > +		vma_tmp = rb_entry(rb_node,
> > +				   struct vm_area_struct, vm_rb);
> 
> (Nit: in such cases a single, slightly-longer-than-80col line is IMHO a 
> better solution than such an artificial line-break.)
> 
> > +
> > +		if (vma_tmp->vm_end > addr) {
> > +			vma = vma_tmp;
> > +			if (vma_tmp->vm_start <= addr)
> > +				break;
> > +			rb_node = rb_node->rb_left;
> > +		} else
> > +			rb_node = rb_node->rb_right;
> 
> (Nit: unbalanced curly braces.)
> 
> > +	}
> > +
> > +	if (vma) {
> > +		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
> > +		if (vma->vm_end - vma->vm_start > currlen)
> > +			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;
> 
> Would it make sense to not update VMA_LAST_USED if VMA_LARGEST is set?
> 
> This would have the advantage of increasing the cache size to two, for the 
> common case where there's two vmas used most of the time.
> 
> To maximize the hit rate in the general case what we basically want to 
> have is an LRU cache, weighted by vma size.
> 
> Maybe by expressing it all in that fashion and looking at the hit rate at 
> 1, 2, 3 and 4 entries would give us equivalent (or better!) behavior than 
> your open-coded variant, with a better idea about how to size it 
> precisely.
> 
> Note that that approach would get rid of the VMA_LAST_USED/VMA_LARGEST 
> distinction in a natural fashion.
> 
> Obviously, if the LRU logic gets too complex then it probably won't bring 
> us any benefits compared to a primitive front-entry cache, so all this is 
> a delicate balance ... hence my previous question about 
> cycles/instructions saved by hitting the cache.

Will try this, thanks for the suggestions.

Btw, do you suggest using a high level tool such as perf for getting
this data or sprinkling get_cycles() in find_vma() -- I'd think that the
first isn't fine grained enough, while the later will probably variate a
lot from run to run but the ratio should be rather constant.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  4:20     ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds

On Sun, 2013-11-03 at 11:12 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > While caching the last used vma already does a nice job avoiding
> > having to iterate the rbtree in find_vma, we can improve. After
> > studying the hit rate on a load of workloads and environments,
> > it was seen that it was around 45-50% - constant for a standard
> > desktop system (gnome3 + evolution + firefox + a few xterms),
> > and multiple java related workloads (including Hadoop/terasort),
> > and aim7, which indicates it's better than the 35% value documented
> > in the code.
> > 
> > By also caching the largest vma, that is, the one that contains
> > most addresses, there is a steady 10-15% hit rate gain, putting
> > it above the 60% region. This improvement comes at a very low
> > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > the current logic.
> > 
> > This patch introduces a second mmap_cache pointer, which is just
> > as racy as the first, but as we already know, doesn't matter in
> > this context. For documentation purposes, I have also added the
> > ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> > with the reads.
> > 
> > Cc: Hugh Dickins <hughd@google.com>
> > Cc: Michel Lespinasse <walken@google.com>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: Mel Gorman <mgorman@suse.de>
> > Cc: Rik van Riel <riel@redhat.com>
> > Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> > Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> > ---
> > Please note that nommu and unicore32 arch are *untested*.
> > 
> > I also have a patch on top of this one that caches the most 
> > used vma, which adds another 8-10% hit rate gain, However,
> > since it does add a counter to the vma structure and we have
> > to do more logic in find_vma to keep track, I was hesitant about
> > the overhead. If folks are interested I can send that out as well.
> 
> Would be interesting to see.
> 
> Btw., roughly how many cycles/instructions do we save by increasing the 
> hit rate, in the typical case (for example during a kernel build)?

Good point. The IPC from perf stat doesn't show any difference with or
without the patch -- note that this is probably the least interesting
one as we already get a really nice hit rate with the single mmap_cache.
I have yet to try it on the other workloads.

> 
> That would be important to measure, so that we can get a ballpark figure 
> for the cost/benefit equation.
> 
> >  Documentation/vm/locking                 |  4 +-
> >  arch/unicore32/include/asm/mmu_context.h |  2 +-
> >  include/linux/mm.h                       | 13 ++++++
> >  include/linux/mm_types.h                 | 15 ++++++-
> >  kernel/debug/debug_core.c                | 17 +++++++-
> >  kernel/fork.c                            |  2 +-
> >  mm/mmap.c                                | 68 ++++++++++++++++++++------------
> >  7 files changed, 87 insertions(+), 34 deletions(-)
> > 
> > diff --git a/Documentation/vm/locking b/Documentation/vm/locking
> > index f61228b..b4e8154 100644
> > --- a/Documentation/vm/locking
> > +++ b/Documentation/vm/locking
> > @@ -42,8 +42,8 @@ The rules are:
> >     for mm B.
> >  
> >  The caveats are:
> > -1. find_vma() makes use of, and updates, the mmap_cache pointer hint.
> > -The update of mmap_cache is racy (page stealer can race with other code
> > +1. find_vma() makes use of, and updates, the mmap_cache pointers hint.
> > +The updates of mmap_cache is racy (page stealer can race with other code
> >  that invokes find_vma with mmap_sem held), but that is okay, since it 
> >  is a hint. This can be fixed, if desired, by having find_vma grab the
> >  page_table_lock.
> > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > index fb5e4c6..38cc7fc 100644
> > --- a/arch/unicore32/include/asm/mmu_context.h
> > +++ b/arch/unicore32/include/asm/mmu_context.h
> > @@ -73,7 +73,7 @@ do { \
> >  		else \
> >  			mm->mmap = NULL; \
> >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > -		mm->mmap_cache = NULL; \
> > +		vma_clear_caches(mm);			\
> >  		mm->map_count--; \
> >  		remove_vma(high_vma); \
> >  	} \
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 8b6e55e..2c0f8ed 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1534,8 +1534,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
> >  	/* Ignore errors */
> >  	(void) __mm_populate(addr, len, 1);
> >  }
> > +
> > +static inline void vma_clear_caches(struct mm_struct *mm)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < NR_VMA_CACHES; i++)
> > +		mm->mmap_cache[i] = NULL;
> 
> Just curious: does GCC manage to open-code this as two stores of NULL?
> 
> > +}
> >  #else
> >  static inline void mm_populate(unsigned long addr, unsigned long len) {}
> > +
> > +static inline void vma_clear_caches(struct mm_struct *mm)
> 1> +{
> > +	mm->mmap_cache = NULL;
> > +}
> >  #endif
> >  
> >  /* These take the mm semaphore themselves */
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d9851ee..7f92835 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -322,12 +322,23 @@ struct mm_rss_stat {
> >  	atomic_long_t count[NR_MM_COUNTERS];
> >  };
> >  
> > +
> > +#ifdef CONFIG_MMU
> > +enum {
> > +	VMA_LAST_USED, /* last find_vma result */
> > +	VMA_LARGEST,   /* vma that contains most address */
> > +	NR_VMA_CACHES
> > +};
> > +#endif
> > +
> >  struct kioctx_table;
> >  struct mm_struct {
> >  	struct vm_area_struct * mmap;		/* list of VMAs */
> >  	struct rb_root mm_rb;
> > -	struct vm_area_struct * mmap_cache;	/* last find_vma result */
> > -#ifdef CONFIG_MMU
> > +#ifndef CONFIG_MMU
> > +	struct vm_area_struct *mmap_cache;      /* last find_vma result */
> > +#else
> > +	struct vm_area_struct *mmap_cache[NR_VMA_CACHES];
> 
> I think the CONFIG_MMU assymetry in the data structure is rather ugly.
> 
> Why not make it a single-entry enum in the !CONFIG_MMU case? To the 
> compiler a single-entry array should be the same as a pointer field.
> 
> That would eliminate most of the related #ifdefs AFAICS.

Yes that's a lot better.

> 
> >  	unsigned long (*get_unmapped_area) (struct file *filp,
> >  				unsigned long addr, unsigned long len,
> >  				unsigned long pgoff, unsigned long flags);
> > diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
> > index 0506d44..d9d72e4 100644
> > --- a/kernel/debug/debug_core.c
> > +++ b/kernel/debug/debug_core.c
> > @@ -221,13 +221,26 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
> >   */
> >  static void kgdb_flush_swbreak_addr(unsigned long addr)
> >  {
> > +	struct mm_struct *mm = current->mm;
> >  	if (!CACHE_FLUSH_IS_SAFE)
> >  		return;
> >  
> > -	if (current->mm && current->mm->mmap_cache) {
> > -		flush_cache_range(current->mm->mmap_cache,
> > +#ifdef CONFIG_MMU
> > +	if (mm) {
> > +		int i;
> > +
> > +		for (i = 0; i < NR_VMA_CACHES; i++)
> > +			if (mm->mmap_cache[i])
> > +				flush_cache_range(mm->mmap_cache[i],
> > +						  addr,
> > +						  addr + BREAK_INSTR_SIZE);
> 
> (Nit: please use curly braces for for such multi-line statements.)
> 
> > +	}
> > +#else
> > +	if (mm && mm->mmap_cache) {
> > +		flush_cache_range(mm->mmap_cache,
> >  				  addr, addr + BREAK_INSTR_SIZE);
> >  	}
> > +#endif
> 
> Btw., this #ifdef would be unified with my suggested data structure 
> variant as well.
> 
> >  	/* Force flush instruction cache if it was outside the mm */
> >  	flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
> >  }
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 086fe73..7b92666 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -363,8 +363,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> >  
> >  	mm->locked_vm = 0;
> >  	mm->mmap = NULL;
> > -	mm->mmap_cache = NULL;
> >  	mm->map_count = 0;
> > +	vma_clear_caches(mm);
> >  	cpumask_clear(mm_cpumask(mm));
> >  	mm->mm_rb = RB_ROOT;
> >  	rb_link = &mm->mm_rb.rb_node;
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 9d54851..29c3fc0 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -676,14 +676,17 @@ static inline void
> >  __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
> >  		struct vm_area_struct *prev)
> >  {
> > +	int i;
> >  	struct vm_area_struct *next;
> >  
> >  	vma_rb_erase(vma, &mm->mm_rb);
> >  	prev->vm_next = next = vma->vm_next;
> >  	if (next)
> >  		next->vm_prev = prev;
> > -	if (mm->mmap_cache == vma)
> > -		mm->mmap_cache = prev;
> > +
> > +	for (i = 0; i < NR_VMA_CACHES; i++)
> > +		if (mm->mmap_cache[i] == vma)
> > +			mm->mmap_cache[i] = prev;
> 
> (Nit: missing curly braces.)
> 
> Also, I don't think setting the cache value back to 'prev' is valid in the 
> VMA_LARGEST case. The likelihood that it's the second largest VMA is 
> remote.
> 
> The right action here would be to set it to NULL.
> 
> For VMA_LAST_USED setting it to 'prev' seems justified.
> 
> >  }
> >  
> >  /*
> > @@ -1972,34 +1975,47 @@ EXPORT_SYMBOL(get_unmapped_area);
> >  /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> >  struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> >  {
> > +	unsigned long currlen = 0;
> 
> (Nit: I don't think 'currlen' really explains the role of the variable. 
> 'max_len' would be better?)
> 
> > +	struct rb_node *rb_node;
> >  	struct vm_area_struct *vma = NULL;
> >  
> > -	/* Check the cache first. */
> > -	/* (Cache hit rate is typically around 35%.) */
> > -	vma = ACCESS_ONCE(mm->mmap_cache);
> > -	if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
> > -		struct rb_node *rb_node;
> > +	/* Check the cache first */
> > +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]);
> > +	if (vma && vma->vm_end > addr && vma->vm_start <= addr)
> > +		goto ret;
> >  
> > -		rb_node = mm->mm_rb.rb_node;
> > -		vma = NULL;
> > +	vma = ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]);
> > +	if (vma) {
> > +		if (vma->vm_end > addr && vma->vm_start <= addr)
> > +			goto ret;
> > +		currlen = vma->vm_end - vma->vm_start;
> > +	}
> >  
> > -		while (rb_node) {
> > -			struct vm_area_struct *vma_tmp;
> > -
> > -			vma_tmp = rb_entry(rb_node,
> > -					   struct vm_area_struct, vm_rb);
> > -
> > -			if (vma_tmp->vm_end > addr) {
> > -				vma = vma_tmp;
> > -				if (vma_tmp->vm_start <= addr)
> > -					break;
> > -				rb_node = rb_node->rb_left;
> > -			} else
> > -				rb_node = rb_node->rb_right;
> > -		}
> > -		if (vma)
> > -			mm->mmap_cache = vma;
> > +	/* Bad cache! iterate rbtree */
> 
> (Nit: the cache is not 'bad', we just didn't hit it.)
> 
> > +	rb_node = mm->mm_rb.rb_node;
> > +	vma = NULL;
> > +
> > +	while (rb_node) {
> > +		struct vm_area_struct *vma_tmp;
> > +
> > +		vma_tmp = rb_entry(rb_node,
> > +				   struct vm_area_struct, vm_rb);
> 
> (Nit: in such cases a single, slightly-longer-than-80col line is IMHO a 
> better solution than such an artificial line-break.)
> 
> > +
> > +		if (vma_tmp->vm_end > addr) {
> > +			vma = vma_tmp;
> > +			if (vma_tmp->vm_start <= addr)
> > +				break;
> > +			rb_node = rb_node->rb_left;
> > +		} else
> > +			rb_node = rb_node->rb_right;
> 
> (Nit: unbalanced curly braces.)
> 
> > +	}
> > +
> > +	if (vma) {
> > +		ACCESS_ONCE(mm->mmap_cache[VMA_LAST_USED]) = vma;
> > +		if (vma->vm_end - vma->vm_start > currlen)
> > +			ACCESS_ONCE(mm->mmap_cache[VMA_LARGEST]) = vma;
> 
> Would it make sense to not update VMA_LAST_USED if VMA_LARGEST is set?
> 
> This would have the advantage of increasing the cache size to two, for the 
> common case where there's two vmas used most of the time.
> 
> To maximize the hit rate in the general case what we basically want to 
> have is an LRU cache, weighted by vma size.
> 
> Maybe by expressing it all in that fashion and looking at the hit rate at 
> 1, 2, 3 and 4 entries would give us equivalent (or better!) behavior than 
> your open-coded variant, with a better idea about how to size it 
> precisely.
> 
> Note that that approach would get rid of the VMA_LAST_USED/VMA_LARGEST 
> distinction in a natural fashion.
> 
> Obviously, if the LRU logic gets too complex then it probably won't bring 
> us any benefits compared to a primitive front-entry cache, so all this is 
> a delicate balance ... hence my previous question about 
> cycles/instructions saved by hitting the cache.

Will try this, thanks for the suggestions.

Btw, do you suggest using a high level tool such as perf for getting
this data or sprinkling get_cycles() in find_vma() -- I'd think that the
first isn't fine grained enough, while the later will probably variate a
lot from run to run but the ratio should be rather constant.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-03 23:57       ` KOSAKI Motohiro
@ 2013-11-04  4:22         ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:22 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, LKML, linux-mm

On Sun, 2013-11-03 at 18:57 -0500, KOSAKI Motohiro wrote:
> >> I'm slightly surprised this cache makes 15% hit. Which application
> >> get a benefit? You listed a lot of applications, but I'm not sure
> >> which is highly depending on largest vma.
> >
> > Well I chose the largest vma because it gives us a greater chance of
> > being already cached when we do the lookup for the faulted address.
> >
> > The 15% improvement was with Hadoop. According to my notes it was at
> > ~48% with the baseline kernel and increased to ~63% with this patch.
> >
> > In any case I didn't measure the rates on a per-task granularity, but at
> > a general system level. When a system is first booted I can see that the
> > mmap_cache access rate becomes the determinant factor and when adding a
> > workload it doesn't change much. One exception to this was a kernel
> > build, where we go from ~50% to ~89% hit rate on a vanilla kernel.
> 
> I looked at this patch a bit. The worth of this is to improve the
> cache hit ratio
> of heap.
> 
> 1) For single thread applications, heap is frequently largest mapping
> in the process.

Right.

> 2) For java VM, "java -Xms1000m -Xmx1000m HelloWorld" makes following
> /proc/<pid>/smaps entry. That said, JVM allocate single heap even if
> applications are multi threaded.

Oh, this is new to me and nicely explains why I see the most benefit in
java related workloads.

> 
> c1800000-100000000 rw-p 00000000 00:00 0
> Size:            1024000 kB
> Rss:                 244 kB
> Pss:                 244 kB
> Shared_Clean:          0 kB
> Shared_Dirty:          0 kB
> Private_Clean:         0 kB
> Private_Dirty:       244 kB
> Referenced:          244 kB
> Anonymous:           244 kB
> AnonHugePages:         0 kB
> Swap:                  0 kB
> KernelPageSize:        4 kB
> MMUPageSize:           4 kB
> 
> That's good.
> 
> However, we know there is a situation that this patch doesn't work.
> glibc makes per thread heap (arena) by default. So, it is not to be
> expected works well on glibc multi threaded programs. That's a
> slightly big limitation.

I think this is what Linus was referring to.

> 
> Anyway, I haven't observed real performance difference because most
> big penalty of find_vma come from taking mmap_sem, not rb-tree search.

Yes, undoubtedly, which is why I'm using units of hit/miss rather than
workload throughput.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  4:22         ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-04  4:22 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Ingo Molnar,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, LKML, linux-mm

On Sun, 2013-11-03 at 18:57 -0500, KOSAKI Motohiro wrote:
> >> I'm slightly surprised this cache makes 15% hit. Which application
> >> get a benefit? You listed a lot of applications, but I'm not sure
> >> which is highly depending on largest vma.
> >
> > Well I chose the largest vma because it gives us a greater chance of
> > being already cached when we do the lookup for the faulted address.
> >
> > The 15% improvement was with Hadoop. According to my notes it was at
> > ~48% with the baseline kernel and increased to ~63% with this patch.
> >
> > In any case I didn't measure the rates on a per-task granularity, but at
> > a general system level. When a system is first booted I can see that the
> > mmap_cache access rate becomes the determinant factor and when adding a
> > workload it doesn't change much. One exception to this was a kernel
> > build, where we go from ~50% to ~89% hit rate on a vanilla kernel.
> 
> I looked at this patch a bit. The worth of this is to improve the
> cache hit ratio
> of heap.
> 
> 1) For single thread applications, heap is frequently largest mapping
> in the process.

Right.

> 2) For java VM, "java -Xms1000m -Xmx1000m HelloWorld" makes following
> /proc/<pid>/smaps entry. That said, JVM allocate single heap even if
> applications are multi threaded.

Oh, this is new to me and nicely explains why I see the most benefit in
java related workloads.

> 
> c1800000-100000000 rw-p 00000000 00:00 0
> Size:            1024000 kB
> Rss:                 244 kB
> Pss:                 244 kB
> Shared_Clean:          0 kB
> Shared_Dirty:          0 kB
> Private_Clean:         0 kB
> Private_Dirty:       244 kB
> Referenced:          244 kB
> Anonymous:           244 kB
> AnonHugePages:         0 kB
> Swap:                  0 kB
> KernelPageSize:        4 kB
> MMUPageSize:           4 kB
> 
> That's good.
> 
> However, we know there is a situation that this patch doesn't work.
> glibc makes per thread heap (arena) by default. So, it is not to be
> expected works well on glibc multi threaded programs. That's a
> slightly big limitation.

I think this is what Linus was referring to.

> 
> Anyway, I haven't observed real performance difference because most
> big penalty of find_vma come from taking mmap_sem, not rb-tree search.

Yes, undoubtedly, which is why I'm using units of hit/miss rather than
workload throughput.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* converting unicore32 to gate_vma as done for arm (was Re: [PATCH] mm: cache largest vma)
  2013-11-04  4:20     ` Davidlohr Bueso
@ 2013-11-04  4:48       ` Al Viro
  -1 siblings, 0 replies; 76+ messages in thread
From: Al Viro @ 2013-11-04  4:48 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds

On Sun, Nov 03, 2013 at 08:20:10PM -0800, Davidlohr Bueso wrote:
> > > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > > index fb5e4c6..38cc7fc 100644
> > > --- a/arch/unicore32/include/asm/mmu_context.h
> > > +++ b/arch/unicore32/include/asm/mmu_context.h
> > > @@ -73,7 +73,7 @@ do { \
> > >  		else \
> > >  			mm->mmap = NULL; \
> > >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > > -		mm->mmap_cache = NULL; \
> > > +		vma_clear_caches(mm);			\
> > >  		mm->map_count--; \
> > >  		remove_vma(high_vma); \
> > >  	} \

BTW, this one needs an analog of
commit f9d4861fc32b995b1616775614459b8f266c803c
Author: Will Deacon <will.deacon@arm.com>
Date:   Fri Jan 20 12:01:13 2012 +0100

    ARM: 7294/1: vectors: use gate_vma for vectors user mapping

This code is a copy of older arm logics rewritten in that commit; unicore32
never got its counterpart.  I have a [completely untested] variant sitting
in vfs.git#vm^; it's probably worth testing - if it works, we'll get rid
of one more place that needs to be aware of MM guts and unicore32 folks
will have fewer potential headache sources...

FWIW, after porting to the current tree it becomes the following; I'm not
sure whether we want VM_DONTEXPAND | VM_DONTDUMP set for this one, though...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
index 829042d..eeba258 100644
--- a/arch/unicore32/include/asm/elf.h
+++ b/arch/unicore32/include/asm/elf.h
@@ -87,8 +87,4 @@ struct mm_struct;
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
 #endif
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c6..600b1b8 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -18,6 +18,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/cpu-single.h>
+#include <asm-generic/mm_hooks.h>
 
 #define init_new_context(tsk, mm)	0
 
@@ -56,32 +57,4 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #define deactivate_mm(tsk, mm)	do { } while (0)
 #define activate_mm(prev, next)	switch_mm(prev, next, NULL)
 
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
-{
-}
-
 #endif
diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
index 594b322..e79da8b 100644
--- a/arch/unicore32/include/asm/page.h
+++ b/arch/unicore32/include/asm/page.h
@@ -28,6 +28,8 @@ extern void copy_page(void *to, const void *from);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
+#define __HAVE_ARCH_GATE_AREA 1
+
 #undef STRICT_MM_TYPECHECKS
 
 #ifdef STRICT_MM_TYPECHECKS
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index 778ebba..51d129e 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -307,21 +307,39 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 
 /*
  * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
+ * atomic helpers and the signal restart code. Insert it into the
+ * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
  */
+static struct vm_area_struct gate_vma = {
+	.vm_start	= 0xffff0000,
+	.vm_end		= 0xffff0000 + PAGE_SIZE,
+	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC |
+			  VM_DONTEXPAND | VM_DONTDUMP,
+};
+
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
+	return 0;
+}
+arch_initcall(gate_vma_init);
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
+}
 
-int vectors_user_mapping(void)
+int in_gate_area_no_mm(unsigned long addr)
 {
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_DONTEXPAND | VM_DONTDUMP,
-				       NULL);
+	return in_gate_area(NULL, addr);
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
+	return (vma == &gate_vma) ? "[vectors]" : NULL;
 }

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* converting unicore32 to gate_vma as done for arm (was Re: [PATCH] mm: cache largest vma)
@ 2013-11-04  4:48       ` Al Viro
  0 siblings, 0 replies; 76+ messages in thread
From: Al Viro @ 2013-11-04  4:48 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds

On Sun, Nov 03, 2013 at 08:20:10PM -0800, Davidlohr Bueso wrote:
> > > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > > index fb5e4c6..38cc7fc 100644
> > > --- a/arch/unicore32/include/asm/mmu_context.h
> > > +++ b/arch/unicore32/include/asm/mmu_context.h
> > > @@ -73,7 +73,7 @@ do { \
> > >  		else \
> > >  			mm->mmap = NULL; \
> > >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > > -		mm->mmap_cache = NULL; \
> > > +		vma_clear_caches(mm);			\
> > >  		mm->map_count--; \
> > >  		remove_vma(high_vma); \
> > >  	} \

BTW, this one needs an analog of
commit f9d4861fc32b995b1616775614459b8f266c803c
Author: Will Deacon <will.deacon@arm.com>
Date:   Fri Jan 20 12:01:13 2012 +0100

    ARM: 7294/1: vectors: use gate_vma for vectors user mapping

This code is a copy of older arm logics rewritten in that commit; unicore32
never got its counterpart.  I have a [completely untested] variant sitting
in vfs.git#vm^; it's probably worth testing - if it works, we'll get rid
of one more place that needs to be aware of MM guts and unicore32 folks
will have fewer potential headache sources...

FWIW, after porting to the current tree it becomes the following; I'm not
sure whether we want VM_DONTEXPAND | VM_DONTDUMP set for this one, though...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
index 829042d..eeba258 100644
--- a/arch/unicore32/include/asm/elf.h
+++ b/arch/unicore32/include/asm/elf.h
@@ -87,8 +87,4 @@ struct mm_struct;
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
 #endif
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c6..600b1b8 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -18,6 +18,7 @@
 
 #include <asm/cacheflush.h>
 #include <asm/cpu-single.h>
+#include <asm-generic/mm_hooks.h>
 
 #define init_new_context(tsk, mm)	0
 
@@ -56,32 +57,4 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #define deactivate_mm(tsk, mm)	do { } while (0)
 #define activate_mm(prev, next)	switch_mm(prev, next, NULL)
 
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
-{
-}
-
 #endif
diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
index 594b322..e79da8b 100644
--- a/arch/unicore32/include/asm/page.h
+++ b/arch/unicore32/include/asm/page.h
@@ -28,6 +28,8 @@ extern void copy_page(void *to, const void *from);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
+#define __HAVE_ARCH_GATE_AREA 1
+
 #undef STRICT_MM_TYPECHECKS
 
 #ifdef STRICT_MM_TYPECHECKS
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index 778ebba..51d129e 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -307,21 +307,39 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 
 /*
  * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
+ * atomic helpers and the signal restart code. Insert it into the
+ * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
  */
+static struct vm_area_struct gate_vma = {
+	.vm_start	= 0xffff0000,
+	.vm_end		= 0xffff0000 + PAGE_SIZE,
+	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC |
+			  VM_DONTEXPAND | VM_DONTDUMP,
+};
+
+static int __init gate_vma_init(void)
+{
+	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
+	return 0;
+}
+arch_initcall(gate_vma_init);
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
+}
 
-int vectors_user_mapping(void)
+int in_gate_area_no_mm(unsigned long addr)
 {
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_DONTEXPAND | VM_DONTDUMP,
-				       NULL);
+	return in_gate_area(NULL, addr);
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
+	return (vma == &gate_vma) ? "[vectors]" : NULL;
 }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  4:20     ` Davidlohr Bueso
@ 2013-11-04  7:00       ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:00 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Sun, 2013-11-03 at 11:12 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > While caching the last used vma already does a nice job avoiding
> > > having to iterate the rbtree in find_vma, we can improve. After
> > > studying the hit rate on a load of workloads and environments,
> > > it was seen that it was around 45-50% - constant for a standard
> > > desktop system (gnome3 + evolution + firefox + a few xterms),
> > > and multiple java related workloads (including Hadoop/terasort),
> > > and aim7, which indicates it's better than the 35% value documented
> > > in the code.
> > > 
> > > By also caching the largest vma, that is, the one that contains
> > > most addresses, there is a steady 10-15% hit rate gain, putting
> > > it above the 60% region. This improvement comes at a very low
> > > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > > the current logic.
> > > 
> > > This patch introduces a second mmap_cache pointer, which is just
> > > as racy as the first, but as we already know, doesn't matter in
> > > this context. For documentation purposes, I have also added the
> > > ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> > > with the reads.
> > > 
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Michel Lespinasse <walken@google.com>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > Cc: Mel Gorman <mgorman@suse.de>
> > > Cc: Rik van Riel <riel@redhat.com>
> > > Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> > > Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> > > ---
> > > Please note that nommu and unicore32 arch are *untested*.
> > > 
> > > I also have a patch on top of this one that caches the most 
> > > used vma, which adds another 8-10% hit rate gain, However,
> > > since it does add a counter to the vma structure and we have
> > > to do more logic in find_vma to keep track, I was hesitant about
> > > the overhead. If folks are interested I can send that out as well.
> > 
> > Would be interesting to see.
> > 
> > Btw., roughly how many cycles/instructions do we save by increasing 
> > the hit rate, in the typical case (for example during a kernel build)?
> 
> Good point. The IPC from perf stat doesn't show any difference with or 
> without the patch -- note that this is probably the least interesting 
> one as we already get a really nice hit rate with the single mmap_cache. 
> I have yet to try it on the other workloads.

I'd be surprised if this was measureable via perf stat, unless you do the 
measurement in a really, really careful way - and even then it's easy to 
make a hard to detect mistake larger in magnitude than the measured effect 
...

An easier and more reliable measurement would be to stick 2-3 get_cycles() 
calls into the affected code and save the pure timestamps into 
task.se.statistics, and extract the timestamps via /proc/sched_debug by 
adding matching seq_printf()s to kernel/sched/debug.c. (You can clear the 
statistics by echoing 0 to /proc/<PID>/sched_debug, see 
proc_sched_set_task().)

That measurement is still subject to skid and other artifacts but 
hopefully the effect is larger than cycles fuzz - and we are interested in 
a ballpark figure in any case.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  7:00       ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:00 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Sun, 2013-11-03 at 11:12 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > While caching the last used vma already does a nice job avoiding
> > > having to iterate the rbtree in find_vma, we can improve. After
> > > studying the hit rate on a load of workloads and environments,
> > > it was seen that it was around 45-50% - constant for a standard
> > > desktop system (gnome3 + evolution + firefox + a few xterms),
> > > and multiple java related workloads (including Hadoop/terasort),
> > > and aim7, which indicates it's better than the 35% value documented
> > > in the code.
> > > 
> > > By also caching the largest vma, that is, the one that contains
> > > most addresses, there is a steady 10-15% hit rate gain, putting
> > > it above the 60% region. This improvement comes at a very low
> > > overhead for a miss. Furthermore, systems with !CONFIG_MMU keep
> > > the current logic.
> > > 
> > > This patch introduces a second mmap_cache pointer, which is just
> > > as racy as the first, but as we already know, doesn't matter in
> > > this context. For documentation purposes, I have also added the
> > > ACCESS_ONCE() around mm->mmap_cache updates, keeping it consistent
> > > with the reads.
> > > 
> > > Cc: Hugh Dickins <hughd@google.com>
> > > Cc: Michel Lespinasse <walken@google.com>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > Cc: Mel Gorman <mgorman@suse.de>
> > > Cc: Rik van Riel <riel@redhat.com>
> > > Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
> > > Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
> > > ---
> > > Please note that nommu and unicore32 arch are *untested*.
> > > 
> > > I also have a patch on top of this one that caches the most 
> > > used vma, which adds another 8-10% hit rate gain, However,
> > > since it does add a counter to the vma structure and we have
> > > to do more logic in find_vma to keep track, I was hesitant about
> > > the overhead. If folks are interested I can send that out as well.
> > 
> > Would be interesting to see.
> > 
> > Btw., roughly how many cycles/instructions do we save by increasing 
> > the hit rate, in the typical case (for example during a kernel build)?
> 
> Good point. The IPC from perf stat doesn't show any difference with or 
> without the patch -- note that this is probably the least interesting 
> one as we already get a really nice hit rate with the single mmap_cache. 
> I have yet to try it on the other workloads.

I'd be surprised if this was measureable via perf stat, unless you do the 
measurement in a really, really careful way - and even then it's easy to 
make a hard to detect mistake larger in magnitude than the measured effect 
...

An easier and more reliable measurement would be to stick 2-3 get_cycles() 
calls into the affected code and save the pure timestamps into 
task.se.statistics, and extract the timestamps via /proc/sched_debug by 
adding matching seq_printf()s to kernel/sched/debug.c. (You can clear the 
statistics by echoing 0 to /proc/<PID>/sched_debug, see 
proc_sched_set_task().)

That measurement is still subject to skid and other artifacts but 
hopefully the effect is larger than cycles fuzz - and we are interested in 
a ballpark figure in any case.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-03 18:51   ` Linus Torvalds
@ 2013-11-04  7:03     ` Christoph Hellwig
  -1 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2013-11-04  7:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davidlohr Bueso, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Ingo Molnar, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm,
	Dave Chinner

On Sun, Nov 03, 2013 at 10:51:27AM -0800, Linus Torvalds wrote:
> Ugh. This patch makes me angry. It looks way too ad-hoc.
> 
> I can well imagine that our current one-entry cache is crap and could
> be improved, but this looks too random. Different code for the
> CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
> MMU it's an array? And the whole "largest" just looks odd. Plus why do
> you set LAST_USED if you also set LARGEST?
> 
> Did you try just a two- or four-entry pseudo-LRU instead, with a
> per-thread index for "last hit"? Or even possibly a small fixed-size
> hash table (say "idx = (add >> 10) & 3" or something)?

Btw, Dave Chiner has recently implemented a simple look aside cache for
the buffer cache, which also uses a rbtree.  Might beworth into making
that into a generic library and use it here:

	http://thread.gmane.org/gmane.comp.file-systems.xfs.general/56220

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  7:03     ` Christoph Hellwig
  0 siblings, 0 replies; 76+ messages in thread
From: Christoph Hellwig @ 2013-11-04  7:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davidlohr Bueso, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Ingo Molnar, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm,
	Dave Chinner

On Sun, Nov 03, 2013 at 10:51:27AM -0800, Linus Torvalds wrote:
> Ugh. This patch makes me angry. It looks way too ad-hoc.
> 
> I can well imagine that our current one-entry cache is crap and could
> be improved, but this looks too random. Different code for the
> CONFIG_MMU case? Same name, but for non-MMU it's a single entry, for
> MMU it's an array? And the whole "largest" just looks odd. Plus why do
> you set LAST_USED if you also set LARGEST?
> 
> Did you try just a two- or four-entry pseudo-LRU instead, with a
> per-thread index for "last hit"? Or even possibly a small fixed-size
> hash table (say "idx = (add >> 10) & 3" or something)?

Btw, Dave Chiner has recently implemented a simple look aside cache for
the buffer cache, which also uses a rbtree.  Might beworth into making
that into a generic library and use it here:

	http://thread.gmane.org/gmane.comp.file-systems.xfs.general/56220

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  4:20     ` Davidlohr Bueso
@ 2013-11-04  7:05       ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:05 UTC (permalink / raw)
  To: Davidlohr Bueso, Frédéric Weisbecker
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> Btw, do you suggest using a high level tool such as perf for getting 
> this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> first isn't fine grained enough, while the later will probably variate a 
> lot from run to run but the ratio should be rather constant.

LOL - I guess I should have read your mail before replying to it ;-)

Yes, I think get_cycles() works better in this case - not due to 
granularity (perf stat will report cycle granular just fine), but due to 
the size of the critical path you'll be measuring. You really want to 
extract the delta, because it's probably so much smaller than the overhead 
of the workload itself.

[ We still don't have good 'measure overhead from instruction X to 
  instruction Y' delta measurement infrastructure in perf yet, although
  Frederic is working on such a trigger/delta facility AFAIK. ]

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  7:05       ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:05 UTC (permalink / raw)
  To: Davidlohr Bueso, Frédéric Weisbecker
  Cc: Andrew Morton, Hugh Dickins, Michel Lespinasse, Mel Gorman,
	Rik van Riel, Guan Xuetao, aswin, linux-kernel, linux-mm,
	Linus Torvalds


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> Btw, do you suggest using a high level tool such as perf for getting 
> this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> first isn't fine grained enough, while the later will probably variate a 
> lot from run to run but the ratio should be rather constant.

LOL - I guess I should have read your mail before replying to it ;-)

Yes, I think get_cycles() works better in this case - not due to 
granularity (perf stat will report cycle granular just fine), but due to 
the size of the critical path you'll be measuring. You really want to 
extract the delta, because it's probably so much smaller than the overhead 
of the workload itself.

[ We still don't have good 'measure overhead from instruction X to 
  instruction Y' delta measurement infrastructure in perf yet, although
  Frederic is working on such a trigger/delta facility AFAIK. ]

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  4:04     ` Davidlohr Bueso
@ 2013-11-04  7:36       ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:36 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> I will look into doing the vma cache per thread instead of mm (I hadn't 
> really looked at the problem like this) as well as Ingo's suggestion on 
> the weighted LRU approach. However, having seen that we can cheaply and 
> easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> how good is good enough?

So I think it all really depends on the hit/miss cost difference. It makes 
little sense to add a more complex scheme if it washes out most of the 
benefits!

Also note the historic context: the _original_ mmap_cache, that I 
implemented 16 years ago, was a front-line cache to a linear list walk 
over all vmas (!).

This is the relevant 2.1.37pre1 code in include/linux/mm.h:

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
        struct vm_area_struct *vma = NULL;

        if (mm) {
                /* Check the cache first. */
                vma = mm->mmap_cache;
                if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
                        vma = mm->mmap;
                        while(vma && vma->vm_end <= addr)
                                vma = vma->vm_next;
                        mm->mmap_cache = vma;
                }
        }
        return vma;
}

See that vma->vm_next iteration? It was awful - but back then most of us 
had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
no worries - the mm was really simple back then.

Today we have the vma rbtree, which is self-balancing and a lot faster 
than your typical linear list walk search ;-)

So I'd _really_ suggest to first examine the assumptions behind the cache, 
it being named 'cache' and it having a hit rate does in itself not 
guarantee that it gives us any worthwile cost savings when put in front of 
an rbtree ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04  7:36       ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04  7:36 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> I will look into doing the vma cache per thread instead of mm (I hadn't 
> really looked at the problem like this) as well as Ingo's suggestion on 
> the weighted LRU approach. However, having seen that we can cheaply and 
> easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> how good is good enough?

So I think it all really depends on the hit/miss cost difference. It makes 
little sense to add a more complex scheme if it washes out most of the 
benefits!

Also note the historic context: the _original_ mmap_cache, that I 
implemented 16 years ago, was a front-line cache to a linear list walk 
over all vmas (!).

This is the relevant 2.1.37pre1 code in include/linux/mm.h:

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
        struct vm_area_struct *vma = NULL;

        if (mm) {
                /* Check the cache first. */
                vma = mm->mmap_cache;
                if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
                        vma = mm->mmap;
                        while(vma && vma->vm_end <= addr)
                                vma = vma->vm_next;
                        mm->mmap_cache = vma;
                }
        }
        return vma;
}

See that vma->vm_next iteration? It was awful - but back then most of us 
had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
no worries - the mm was really simple back then.

Today we have the vma rbtree, which is self-balancing and a lot faster 
than your typical linear list walk search ;-)

So I'd _really_ suggest to first examine the assumptions behind the cache, 
it being named 'cache' and it having a hit rate does in itself not 
guarantee that it gives us any worthwile cost savings when put in front of 
an rbtree ...

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  7:05       ` Ingo Molnar
@ 2013-11-04 14:20         ` Frederic Weisbecker
  -1 siblings, 0 replies; 76+ messages in thread
From: Frederic Weisbecker @ 2013-11-04 14:20 UTC (permalink / raw)
  To: Ingo Molnar, Jiri Olsa
  Cc: Davidlohr Bueso, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds

On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> 
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > Btw, do you suggest using a high level tool such as perf for getting 
> > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > first isn't fine grained enough, while the later will probably variate a 
> > lot from run to run but the ratio should be rather constant.
> 
> LOL - I guess I should have read your mail before replying to it ;-)
> 
> Yes, I think get_cycles() works better in this case - not due to 
> granularity (perf stat will report cycle granular just fine), but due to 
> the size of the critical path you'll be measuring. You really want to 
> extract the delta, because it's probably so much smaller than the overhead 
> of the workload itself.
> 
> [ We still don't have good 'measure overhead from instruction X to 
>   instruction Y' delta measurement infrastructure in perf yet, although
>   Frederic is working on such a trigger/delta facility AFAIK. ]

Yep, in fact Jiri took it over and he's still working on it. But yeah, once
that get merged, we should be able to measure instructions or cycles inside
any user or kernel function through kprobes/uprobes or function graph tracer.

> 
> Thanks,
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04 14:20         ` Frederic Weisbecker
  0 siblings, 0 replies; 76+ messages in thread
From: Frederic Weisbecker @ 2013-11-04 14:20 UTC (permalink / raw)
  To: Ingo Molnar, Jiri Olsa
  Cc: Davidlohr Bueso, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, aswin, linux-kernel,
	linux-mm, Linus Torvalds

On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> 
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > Btw, do you suggest using a high level tool such as perf for getting 
> > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > first isn't fine grained enough, while the later will probably variate a 
> > lot from run to run but the ratio should be rather constant.
> 
> LOL - I guess I should have read your mail before replying to it ;-)
> 
> Yes, I think get_cycles() works better in this case - not due to 
> granularity (perf stat will report cycle granular just fine), but due to 
> the size of the critical path you'll be measuring. You really want to 
> extract the delta, because it's probably so much smaller than the overhead 
> of the workload itself.
> 
> [ We still don't have good 'measure overhead from instruction X to 
>   instruction Y' delta measurement infrastructure in perf yet, although
>   Frederic is working on such a trigger/delta facility AFAIK. ]

Yep, in fact Jiri took it over and he's still working on it. But yeah, once
that get merged, we should be able to measure instructions or cycles inside
any user or kernel function through kprobes/uprobes or function graph tracer.

> 
> Thanks,
> 
> 	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  7:36       ` Ingo Molnar
@ 2013-11-04 14:56         ` Michel Lespinasse
  -1 siblings, 0 replies; 76+ messages in thread
From: Michel Lespinasse @ 2013-11-04 14:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Davidlohr Bueso, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, Nov 3, 2013 at 11:36 PM, Ingo Molnar <mingo@kernel.org> wrote:
> So I think it all really depends on the hit/miss cost difference. It makes
> little sense to add a more complex scheme if it washes out most of the
> benefits!
>
> Also note the historic context: the _original_ mmap_cache, that I
> implemented 16 years ago, was a front-line cache to a linear list walk
> over all vmas (!).
>
> Today we have the vma rbtree, which is self-balancing and a lot faster
> than your typical linear list walk search ;-)
>
> So I'd _really_ suggest to first examine the assumptions behind the cache,
> it being named 'cache' and it having a hit rate does in itself not
> guarantee that it gives us any worthwile cost savings when put in front of
> an rbtree ...

Agree. We have made the general case a lot faster, and caches in front
of it may not pull their weight anymore - the fact that we are
wondering how to even measure that, to me, means that we probably
shouldn't even bother. That's what I did when I implemented the
augmented rbtree to search for allocatable spaces between vmas: I
removed the cache for the last used gap, and nobody has complained
about it since. Absent some contrary data, I would actually prefer we
remove the mmap_cache as well.

And if a multiple-entry cache is necessary, I would also prefer it to
be LRU type rather than something ad-hoc (if there is a benefit to
caching the largest VMA, then LRU would capture that as well...)

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04 14:56         ` Michel Lespinasse
  0 siblings, 0 replies; 76+ messages in thread
From: Michel Lespinasse @ 2013-11-04 14:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Davidlohr Bueso, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, Nov 3, 2013 at 11:36 PM, Ingo Molnar <mingo@kernel.org> wrote:
> So I think it all really depends on the hit/miss cost difference. It makes
> little sense to add a more complex scheme if it washes out most of the
> benefits!
>
> Also note the historic context: the _original_ mmap_cache, that I
> implemented 16 years ago, was a front-line cache to a linear list walk
> over all vmas (!).
>
> Today we have the vma rbtree, which is self-balancing and a lot faster
> than your typical linear list walk search ;-)
>
> So I'd _really_ suggest to first examine the assumptions behind the cache,
> it being named 'cache' and it having a hit rate does in itself not
> guarantee that it gives us any worthwile cost savings when put in front of
> an rbtree ...

Agree. We have made the general case a lot faster, and caches in front
of it may not pull their weight anymore - the fact that we are
wondering how to even measure that, to me, means that we probably
shouldn't even bother. That's what I did when I implemented the
augmented rbtree to search for allocatable spaces between vmas: I
removed the cache for the last used gap, and nobody has complained
about it since. Absent some contrary data, I would actually prefer we
remove the mmap_cache as well.

And if a multiple-entry cache is necessary, I would also prefer it to
be LRU type rather than something ad-hoc (if there is a benefit to
caching the largest VMA, then LRU would capture that as well...)

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04 14:20         ` Frederic Weisbecker
@ 2013-11-04 17:52           ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04 17:52 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > 
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > Btw, do you suggest using a high level tool such as perf for getting 
> > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > first isn't fine grained enough, while the later will probably variate a 
> > > lot from run to run but the ratio should be rather constant.
> > 
> > LOL - I guess I should have read your mail before replying to it ;-)
> > 
> > Yes, I think get_cycles() works better in this case - not due to 
> > granularity (perf stat will report cycle granular just fine), but due 
> > to the size of the critical path you'll be measuring. You really want 
> > to extract the delta, because it's probably so much smaller than the 
> > overhead of the workload itself.
> > 
> > [ We still don't have good 'measure overhead from instruction X to 
> >   instruction Y' delta measurement infrastructure in perf yet, 
> >   although Frederic is working on such a trigger/delta facility AFAIK. 
> >   ]
> 
> Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> once that get merged, we should be able to measure instructions or 
> cycles inside any user or kernel function through kprobes/uprobes or 
> function graph tracer.

So, what would be nice is to actually make use of it: one very nice 
usecase I'd love to see is to have the capability within the 'perf top' 
TUI annotated assembly output to mark specific instructions as 'start' and 
'end' markers, and measure the overhead between them.

I.e. allow perf top / perf report to manage probes into interesting 
functions - or create a similar TUI for 'perf probe' to allow easy live 
marking/probing of various kernel functionality.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04 17:52           ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-04 17:52 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > 
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > Btw, do you suggest using a high level tool such as perf for getting 
> > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > first isn't fine grained enough, while the later will probably variate a 
> > > lot from run to run but the ratio should be rather constant.
> > 
> > LOL - I guess I should have read your mail before replying to it ;-)
> > 
> > Yes, I think get_cycles() works better in this case - not due to 
> > granularity (perf stat will report cycle granular just fine), but due 
> > to the size of the critical path you'll be measuring. You really want 
> > to extract the delta, because it's probably so much smaller than the 
> > overhead of the workload itself.
> > 
> > [ We still don't have good 'measure overhead from instruction X to 
> >   instruction Y' delta measurement infrastructure in perf yet, 
> >   although Frederic is working on such a trigger/delta facility AFAIK. 
> >   ]
> 
> Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> once that get merged, we should be able to measure instructions or 
> cycles inside any user or kernel function through kprobes/uprobes or 
> function graph tracer.

So, what would be nice is to actually make use of it: one very nice 
usecase I'd love to see is to have the capability within the 'perf top' 
TUI annotated assembly output to mark specific instructions as 'start' and 
'end' markers, and measure the overhead between them.

I.e. allow perf top / perf report to manage probes into interesting 
functions - or create a similar TUI for 'perf probe' to allow easy live 
marking/probing of various kernel functionality.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04 17:52           ` Ingo Molnar
@ 2013-11-04 18:10             ` Frederic Weisbecker
  -1 siblings, 0 replies; 76+ messages in thread
From: Frederic Weisbecker @ 2013-11-04 18:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern

On Mon, Nov 04, 2013 at 06:52:45PM +0100, Ingo Molnar wrote:
> 
> * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 
> > On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > > 
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > Btw, do you suggest using a high level tool such as perf for getting 
> > > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > > first isn't fine grained enough, while the later will probably variate a 
> > > > lot from run to run but the ratio should be rather constant.
> > > 
> > > LOL - I guess I should have read your mail before replying to it ;-)
> > > 
> > > Yes, I think get_cycles() works better in this case - not due to 
> > > granularity (perf stat will report cycle granular just fine), but due 
> > > to the size of the critical path you'll be measuring. You really want 
> > > to extract the delta, because it's probably so much smaller than the 
> > > overhead of the workload itself.
> > > 
> > > [ We still don't have good 'measure overhead from instruction X to 
> > >   instruction Y' delta measurement infrastructure in perf yet, 
> > >   although Frederic is working on such a trigger/delta facility AFAIK. 
> > >   ]
> > 
> > Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> > once that get merged, we should be able to measure instructions or 
> > cycles inside any user or kernel function through kprobes/uprobes or 
> > function graph tracer.
> 
> So, what would be nice is to actually make use of it: one very nice 
> usecase I'd love to see is to have the capability within the 'perf top' 
> TUI annotated assembly output to mark specific instructions as 'start' and 
> 'end' markers, and measure the overhead between them.

Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
on the proposed interface for toggle events.

It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
yet found a real direction to take for the tooling interface IIRC. For example the perf record
cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
And we really don't want to take a wrong direction for that as it's going to be complicated
to handle in any case.

See this thread:
https://lwn.net/Articles/568602/

thanks.

> 
> I.e. allow perf top / perf report to manage probes into interesting 
> functions - or create a similar TUI for 'perf probe' to allow easy live 
> marking/probing of various kernel functionality.
> 
> Thanks,
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-04 18:10             ` Frederic Weisbecker
  0 siblings, 0 replies; 76+ messages in thread
From: Frederic Weisbecker @ 2013-11-04 18:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern

On Mon, Nov 04, 2013 at 06:52:45PM +0100, Ingo Molnar wrote:
> 
> * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 
> > On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > > 
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > Btw, do you suggest using a high level tool such as perf for getting 
> > > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > > first isn't fine grained enough, while the later will probably variate a 
> > > > lot from run to run but the ratio should be rather constant.
> > > 
> > > LOL - I guess I should have read your mail before replying to it ;-)
> > > 
> > > Yes, I think get_cycles() works better in this case - not due to 
> > > granularity (perf stat will report cycle granular just fine), but due 
> > > to the size of the critical path you'll be measuring. You really want 
> > > to extract the delta, because it's probably so much smaller than the 
> > > overhead of the workload itself.
> > > 
> > > [ We still don't have good 'measure overhead from instruction X to 
> > >   instruction Y' delta measurement infrastructure in perf yet, 
> > >   although Frederic is working on such a trigger/delta facility AFAIK. 
> > >   ]
> > 
> > Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> > once that get merged, we should be able to measure instructions or 
> > cycles inside any user or kernel function through kprobes/uprobes or 
> > function graph tracer.
> 
> So, what would be nice is to actually make use of it: one very nice 
> usecase I'd love to see is to have the capability within the 'perf top' 
> TUI annotated assembly output to mark specific instructions as 'start' and 
> 'end' markers, and measure the overhead between them.

Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
on the proposed interface for toggle events.

It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
yet found a real direction to take for the tooling interface IIRC. For example the perf record
cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
And we really don't want to take a wrong direction for that as it's going to be complicated
to handle in any case.

See this thread:
https://lwn.net/Articles/568602/

thanks.

> 
> I.e. allow perf top / perf report to manage probes into interesting 
> functions - or create a similar TUI for 'perf probe' to allow easy live 
> marking/probing of various kernel functionality.
> 
> Thanks,
> 
> 	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: converting unicore32 to gate_vma as done for arm (was Re:  [PATCH] mm: cache largest vma)
  2013-11-04  4:48       ` Al Viro
@ 2013-11-05  2:49         ` 管雪涛
  -1 siblings, 0 replies; 76+ messages in thread
From: 管雪涛 @ 2013-11-05  2:49 UTC (permalink / raw)
  To: Al Viro
  Cc: Davidlohr Bueso, Ingo Molnar, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds

The patch is ok for unicore32. Thanks Al.

While testing this patch, a bug is found in arch/unicore32/include/asm/pgtable.h:

@@ -96,7 +96,7 @@ extern pgprot_t pgprot_kernel;
                                                                | PTE_EXEC)
 #define PAGE_READONLY          __pgprot(pgprot_val(pgprot_user | PTE_READ)
 #define PAGE_READONLY_EXEC     __pgprot(pgprot_val(pgprot_user | PTE_READ \
-                                                               | PTE_EXEC)
+                                                               | PTE_EXEC))

In fact, all similar macros are wrong. I'll post an bug-fix patch for this obvious error.

Xuetao

----- Al Viro <viro@ZenIV.linux.org.uk> 写道:
> On Sun, Nov 03, 2013 at 08:20:10PM -0800, Davidlohr Bueso wrote:
> > > > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > > > index fb5e4c6..38cc7fc 100644
> > > > --- a/arch/unicore32/include/asm/mmu_context.h
> > > > +++ b/arch/unicore32/include/asm/mmu_context.h
> > > > @@ -73,7 +73,7 @@ do { \
> > > >  		else \
> > > >  			mm->mmap = NULL; \
> > > >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > > > -		mm->mmap_cache = NULL; \
> > > > +		vma_clear_caches(mm);			\
> > > >  		mm->map_count--; \
> > > >  		remove_vma(high_vma); \
> > > >  	} \
> 
> BTW, this one needs an analog of
> commit f9d4861fc32b995b1616775614459b8f266c803c
> Author: Will Deacon <will.deacon@arm.com>
> Date:   Fri Jan 20 12:01:13 2012 +0100
> 
>     ARM: 7294/1: vectors: use gate_vma for vectors user mapping
> 
> This code is a copy of older arm logics rewritten in that commit; unicore32
> never got its counterpart.  I have a [completely untested] variant sitting
> in vfs.git#vm^; it's probably worth testing - if it works, we'll get rid
> of one more place that needs to be aware of MM guts and unicore32 folks
> will have fewer potential headache sources...
> 
> FWIW, after porting to the current tree it becomes the following; I'm not
> sure whether we want VM_DONTEXPAND | VM_DONTDUMP set for this one, though...
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
> diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
> index 829042d..eeba258 100644
> --- a/arch/unicore32/include/asm/elf.h
> +++ b/arch/unicore32/include/asm/elf.h
> @@ -87,8 +87,4 @@ struct mm_struct;
>  extern unsigned long arch_randomize_brk(struct mm_struct *mm);
>  #define arch_randomize_brk arch_randomize_brk
>  
> -extern int vectors_user_mapping(void);
> -#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
> -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
> -
>  #endif
> diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> index fb5e4c6..600b1b8 100644
> --- a/arch/unicore32/include/asm/mmu_context.h
> +++ b/arch/unicore32/include/asm/mmu_context.h
> @@ -18,6 +18,7 @@
>  
>  #include <asm/cacheflush.h>
>  #include <asm/cpu-single.h>
> +#include <asm-generic/mm_hooks.h>
>  
>  #define init_new_context(tsk, mm)	0
>  
> @@ -56,32 +57,4 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  #define deactivate_mm(tsk, mm)	do { } while (0)
>  #define activate_mm(prev, next)	switch_mm(prev, next, NULL)
>  
> -/*
> - * We are inserting a "fake" vma for the user-accessible vector page so
> - * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
> - * But we also want to remove it before the generic code gets to see it
> - * during process exit or the unmapping of it would  cause total havoc.
> - * (the macro is used as remove_vma() is static to mm/mmap.c)
> - */
> -#define arch_exit_mmap(mm) \
> -do { \
> -	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
> -	if (high_vma) { \
> -		BUG_ON(high_vma->vm_next);  /* it should be last */ \
> -		if (high_vma->vm_prev) \
> -			high_vma->vm_prev->vm_next = NULL; \
> -		else \
> -			mm->mmap = NULL; \
> -		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> -		mm->mmap_cache = NULL; \
> -		mm->map_count--; \
> -		remove_vma(high_vma); \
> -	} \
> -} while (0)
> -
> -static inline void arch_dup_mmap(struct mm_struct *oldmm,
> -				 struct mm_struct *mm)
> -{
> -}
> -
>  #endif
> diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
> index 594b322..e79da8b 100644
> --- a/arch/unicore32/include/asm/page.h
> +++ b/arch/unicore32/include/asm/page.h
> @@ -28,6 +28,8 @@ extern void copy_page(void *to, const void *from);
>  #define clear_user_page(page, vaddr, pg)	clear_page(page)
>  #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
>  
> +#define __HAVE_ARCH_GATE_AREA 1
> +
>  #undef STRICT_MM_TYPECHECKS
>  
>  #ifdef STRICT_MM_TYPECHECKS
> diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
> index 778ebba..51d129e 100644
> --- a/arch/unicore32/kernel/process.c
> +++ b/arch/unicore32/kernel/process.c
> @@ -307,21 +307,39 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
>  
>  /*
>   * The vectors page is always readable from user space for the
> - * atomic helpers and the signal restart code.  Let's declare a mapping
> - * for it so it is visible through ptrace and /proc/<pid>/mem.
> + * atomic helpers and the signal restart code. Insert it into the
> + * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
>   */
> +static struct vm_area_struct gate_vma = {
> +	.vm_start	= 0xffff0000,
> +	.vm_end		= 0xffff0000 + PAGE_SIZE,
> +	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC |
> +			  VM_DONTEXPAND | VM_DONTDUMP,
> +};
> +
> +static int __init gate_vma_init(void)
> +{
> +	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
> +	return 0;
> +}
> +arch_initcall(gate_vma_init);
> +
> +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
> +{
> +	return &gate_vma;
> +}
> +
> +int in_gate_area(struct mm_struct *mm, unsigned long addr)
> +{
> +	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
> +}
>  
> -int vectors_user_mapping(void)
> +int in_gate_area_no_mm(unsigned long addr)
>  {
> -	struct mm_struct *mm = current->mm;
> -	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
> -				       VM_READ | VM_EXEC |
> -				       VM_MAYREAD | VM_MAYEXEC |
> -				       VM_DONTEXPAND | VM_DONTDUMP,
> -				       NULL);
> +	return in_gate_area(NULL, addr);
>  }
>  
>  const char *arch_vma_name(struct vm_area_struct *vma)
>  {
> -	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
> +	return (vma == &gate_vma) ? "[vectors]" : NULL;
>  }


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: converting unicore32 to gate_vma as done for arm (was Re:  [PATCH] mm: cache largest vma)
@ 2013-11-05  2:49         ` 管雪涛
  0 siblings, 0 replies; 76+ messages in thread
From: 管雪涛 @ 2013-11-05  2:49 UTC (permalink / raw)
  To: Al Viro
  Cc: Davidlohr Bueso, Ingo Molnar, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds

The patch is ok for unicore32. Thanks Al.

While testing this patch, a bug is found in arch/unicore32/include/asm/pgtable.h:

@@ -96,7 +96,7 @@ extern pgprot_t pgprot_kernel;
                                                                | PTE_EXEC)
 #define PAGE_READONLY          __pgprot(pgprot_val(pgprot_user | PTE_READ)
 #define PAGE_READONLY_EXEC     __pgprot(pgprot_val(pgprot_user | PTE_READ \
-                                                               | PTE_EXEC)
+                                                               | PTE_EXEC))

In fact, all similar macros are wrong. I'll post an bug-fix patch for this obvious error.

Xuetao

----- Al Viro <viro@ZenIV.linux.org.uk> 写道:
> On Sun, Nov 03, 2013 at 08:20:10PM -0800, Davidlohr Bueso wrote:
> > > > diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> > > > index fb5e4c6..38cc7fc 100644
> > > > --- a/arch/unicore32/include/asm/mmu_context.h
> > > > +++ b/arch/unicore32/include/asm/mmu_context.h
> > > > @@ -73,7 +73,7 @@ do { \
> > > >  		else \
> > > >  			mm->mmap = NULL; \
> > > >  		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> > > > -		mm->mmap_cache = NULL; \
> > > > +		vma_clear_caches(mm);			\
> > > >  		mm->map_count--; \
> > > >  		remove_vma(high_vma); \
> > > >  	} \
> 
> BTW, this one needs an analog of
> commit f9d4861fc32b995b1616775614459b8f266c803c
> Author: Will Deacon <will.deacon@arm.com>
> Date:   Fri Jan 20 12:01:13 2012 +0100
> 
>     ARM: 7294/1: vectors: use gate_vma for vectors user mapping
> 
> This code is a copy of older arm logics rewritten in that commit; unicore32
> never got its counterpart.  I have a [completely untested] variant sitting
> in vfs.git#vm^; it's probably worth testing - if it works, we'll get rid
> of one more place that needs to be aware of MM guts and unicore32 folks
> will have fewer potential headache sources...
> 
> FWIW, after porting to the current tree it becomes the following; I'm not
> sure whether we want VM_DONTEXPAND | VM_DONTDUMP set for this one, though...
> 
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
> diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
> index 829042d..eeba258 100644
> --- a/arch/unicore32/include/asm/elf.h
> +++ b/arch/unicore32/include/asm/elf.h
> @@ -87,8 +87,4 @@ struct mm_struct;
>  extern unsigned long arch_randomize_brk(struct mm_struct *mm);
>  #define arch_randomize_brk arch_randomize_brk
>  
> -extern int vectors_user_mapping(void);
> -#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
> -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
> -
>  #endif
> diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
> index fb5e4c6..600b1b8 100644
> --- a/arch/unicore32/include/asm/mmu_context.h
> +++ b/arch/unicore32/include/asm/mmu_context.h
> @@ -18,6 +18,7 @@
>  
>  #include <asm/cacheflush.h>
>  #include <asm/cpu-single.h>
> +#include <asm-generic/mm_hooks.h>
>  
>  #define init_new_context(tsk, mm)	0
>  
> @@ -56,32 +57,4 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
>  #define deactivate_mm(tsk, mm)	do { } while (0)
>  #define activate_mm(prev, next)	switch_mm(prev, next, NULL)
>  
> -/*
> - * We are inserting a "fake" vma for the user-accessible vector page so
> - * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
> - * But we also want to remove it before the generic code gets to see it
> - * during process exit or the unmapping of it would  cause total havoc.
> - * (the macro is used as remove_vma() is static to mm/mmap.c)
> - */
> -#define arch_exit_mmap(mm) \
> -do { \
> -	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
> -	if (high_vma) { \
> -		BUG_ON(high_vma->vm_next);  /* it should be last */ \
> -		if (high_vma->vm_prev) \
> -			high_vma->vm_prev->vm_next = NULL; \
> -		else \
> -			mm->mmap = NULL; \
> -		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
> -		mm->mmap_cache = NULL; \
> -		mm->map_count--; \
> -		remove_vma(high_vma); \
> -	} \
> -} while (0)
> -
> -static inline void arch_dup_mmap(struct mm_struct *oldmm,
> -				 struct mm_struct *mm)
> -{
> -}
> -
>  #endif
> diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
> index 594b322..e79da8b 100644
> --- a/arch/unicore32/include/asm/page.h
> +++ b/arch/unicore32/include/asm/page.h
> @@ -28,6 +28,8 @@ extern void copy_page(void *to, const void *from);
>  #define clear_user_page(page, vaddr, pg)	clear_page(page)
>  #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
>  
> +#define __HAVE_ARCH_GATE_AREA 1
> +
>  #undef STRICT_MM_TYPECHECKS
>  
>  #ifdef STRICT_MM_TYPECHECKS
> diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
> index 778ebba..51d129e 100644
> --- a/arch/unicore32/kernel/process.c
> +++ b/arch/unicore32/kernel/process.c
> @@ -307,21 +307,39 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
>  
>  /*
>   * The vectors page is always readable from user space for the
> - * atomic helpers and the signal restart code.  Let's declare a mapping
> - * for it so it is visible through ptrace and /proc/<pid>/mem.
> + * atomic helpers and the signal restart code. Insert it into the
> + * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
>   */
> +static struct vm_area_struct gate_vma = {
> +	.vm_start	= 0xffff0000,
> +	.vm_end		= 0xffff0000 + PAGE_SIZE,
> +	.vm_flags	= VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC |
> +			  VM_DONTEXPAND | VM_DONTDUMP,
> +};
> +
> +static int __init gate_vma_init(void)
> +{
> +	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
> +	return 0;
> +}
> +arch_initcall(gate_vma_init);
> +
> +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
> +{
> +	return &gate_vma;
> +}
> +
> +int in_gate_area(struct mm_struct *mm, unsigned long addr)
> +{
> +	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
> +}
>  
> -int vectors_user_mapping(void)
> +int in_gate_area_no_mm(unsigned long addr)
>  {
> -	struct mm_struct *mm = current->mm;
> -	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
> -				       VM_READ | VM_EXEC |
> -				       VM_MAYREAD | VM_MAYEXEC |
> -				       VM_DONTEXPAND | VM_DONTDUMP,
> -				       NULL);
> +	return in_gate_area(NULL, addr);
>  }
>  
>  const char *arch_vma_name(struct vm_area_struct *vma)
>  {
> -	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
> +	return (vma == &gate_vma) ? "[vectors]" : NULL;
>  }

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04 18:10             ` Frederic Weisbecker
@ 2013-11-05  8:24               ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-05  8:24 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> On Mon, Nov 04, 2013 at 06:52:45PM +0100, Ingo Molnar wrote:
> > 
> > * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > 
> > > On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > > > 
> > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 
> > > > > Btw, do you suggest using a high level tool such as perf for getting 
> > > > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > > > first isn't fine grained enough, while the later will probably variate a 
> > > > > lot from run to run but the ratio should be rather constant.
> > > > 
> > > > LOL - I guess I should have read your mail before replying to it ;-)
> > > > 
> > > > Yes, I think get_cycles() works better in this case - not due to 
> > > > granularity (perf stat will report cycle granular just fine), but due 
> > > > to the size of the critical path you'll be measuring. You really want 
> > > > to extract the delta, because it's probably so much smaller than the 
> > > > overhead of the workload itself.
> > > > 
> > > > [ We still don't have good 'measure overhead from instruction X to 
> > > >   instruction Y' delta measurement infrastructure in perf yet, 
> > > >   although Frederic is working on such a trigger/delta facility AFAIK. 
> > > >   ]
> > > 
> > > Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> > > once that get merged, we should be able to measure instructions or 
> > > cycles inside any user or kernel function through kprobes/uprobes or 
> > > function graph tracer.
> > 
> > So, what would be nice is to actually make use of it: one very nice 
> > usecase I'd love to see is to have the capability within the 'perf top' 
> > TUI annotated assembly output to mark specific instructions as 'start' and 
> > 'end' markers, and measure the overhead between them.
> 
> Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
> on the proposed interface for toggle events.
> 
> It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
> yet found a real direction to take for the tooling interface IIRC. For example the perf record
> cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
> And we really don't want to take a wrong direction for that as it's going to be complicated
> to handle in any case.
> 
> See this thread:
> https://lwn.net/Articles/568602/

At the risk of hijacking this discussion, here's my take on triggers:

I think the primary interface should be to allow the disabling/enabling of 
a specific event from other events.

>From user-space it would be fd driven: add a perf attribute to allow a 
specific event to set the state of another event if it triggers. The 
'other event' would be an fd, similar to how group events are specified.

An 'off' trigger sets the state to 0 (disabled).
An 'on' trigger sets the state to 1 (enabled).

Using such a facility the measurement of deltas would need 3 events:

 - fd1: a cycles event that is created disabled

 - fd2: a kprobes event at the 'start' RIP, set to counting only,
        connected to fd1, setting state to '1'

 - fd3: a kprobes event at the 'stop' RIP, set to counting only,
        connected to fd1, setting state to '0'.

This way every time the (fd2) start-RIP kprobes event executes, the 
trigger code sees that it's supposed to enable the (fd1) cycles event. 
Every time the (fd3) stop-RIP kprobes event executes, the trigger code 
sees that it's set to disable the (fd1) cycles event.

Instead of 'cycles event', it could count instructions, or pagefaults, or 
cachemisses.

( If the (fd1) cycles event is a sampling event then this would allow nice 
  things like the profiling of individual functions within the context of 
  a specific system call, driven by triggers. )

In theory we could allow self-referential triggers as well: the first 
execution of the trigger would disable itself. If the trigger state is not 
on/off but a counter then this would allow 'take 100 samples then shut 
off' type of functionality as well.

But success primarily depends on how useful the tooling UI turns out to 
be: create a nice Slang or GTK UI for kprobes and triggers, and/or turn it 
into a really intuitive command line UI, and people will use it.

I think annotated assembly/source output is a really nice match for 
triggers and kprobes, so I'd suggest the Slang TUI route ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-05  8:24               ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-05  8:24 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Jiri Olsa, Davidlohr Bueso, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds, David Ahern


* Frederic Weisbecker <fweisbec@gmail.com> wrote:

> On Mon, Nov 04, 2013 at 06:52:45PM +0100, Ingo Molnar wrote:
> > 
> > * Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > 
> > > On Mon, Nov 04, 2013 at 08:05:00AM +0100, Ingo Molnar wrote:
> > > > 
> > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 
> > > > > Btw, do you suggest using a high level tool such as perf for getting 
> > > > > this data or sprinkling get_cycles() in find_vma() -- I'd think that the 
> > > > > first isn't fine grained enough, while the later will probably variate a 
> > > > > lot from run to run but the ratio should be rather constant.
> > > > 
> > > > LOL - I guess I should have read your mail before replying to it ;-)
> > > > 
> > > > Yes, I think get_cycles() works better in this case - not due to 
> > > > granularity (perf stat will report cycle granular just fine), but due 
> > > > to the size of the critical path you'll be measuring. You really want 
> > > > to extract the delta, because it's probably so much smaller than the 
> > > > overhead of the workload itself.
> > > > 
> > > > [ We still don't have good 'measure overhead from instruction X to 
> > > >   instruction Y' delta measurement infrastructure in perf yet, 
> > > >   although Frederic is working on such a trigger/delta facility AFAIK. 
> > > >   ]
> > > 
> > > Yep, in fact Jiri took it over and he's still working on it. But yeah, 
> > > once that get merged, we should be able to measure instructions or 
> > > cycles inside any user or kernel function through kprobes/uprobes or 
> > > function graph tracer.
> > 
> > So, what would be nice is to actually make use of it: one very nice 
> > usecase I'd love to see is to have the capability within the 'perf top' 
> > TUI annotated assembly output to mark specific instructions as 'start' and 
> > 'end' markers, and measure the overhead between them.
> 
> Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
> on the proposed interface for toggle events.
> 
> It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
> yet found a real direction to take for the tooling interface IIRC. For example the perf record
> cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
> And we really don't want to take a wrong direction for that as it's going to be complicated
> to handle in any case.
> 
> See this thread:
> https://lwn.net/Articles/568602/

At the risk of hijacking this discussion, here's my take on triggers:

I think the primary interface should be to allow the disabling/enabling of 
a specific event from other events.

>From user-space it would be fd driven: add a perf attribute to allow a 
specific event to set the state of another event if it triggers. The 
'other event' would be an fd, similar to how group events are specified.

An 'off' trigger sets the state to 0 (disabled).
An 'on' trigger sets the state to 1 (enabled).

Using such a facility the measurement of deltas would need 3 events:

 - fd1: a cycles event that is created disabled

 - fd2: a kprobes event at the 'start' RIP, set to counting only,
        connected to fd1, setting state to '1'

 - fd3: a kprobes event at the 'stop' RIP, set to counting only,
        connected to fd1, setting state to '0'.

This way every time the (fd2) start-RIP kprobes event executes, the 
trigger code sees that it's supposed to enable the (fd1) cycles event. 
Every time the (fd3) stop-RIP kprobes event executes, the trigger code 
sees that it's set to disable the (fd1) cycles event.

Instead of 'cycles event', it could count instructions, or pagefaults, or 
cachemisses.

( If the (fd1) cycles event is a sampling event then this would allow nice 
  things like the profiling of individual functions within the context of 
  a specific system call, driven by triggers. )

In theory we could allow self-referential triggers as well: the first 
execution of the trigger would disable itself. If the trigger state is not 
on/off but a counter then this would allow 'take 100 samples then shut 
off' type of functionality as well.

But success primarily depends on how useful the tooling UI turns out to 
be: create a nice Slang or GTK UI for kprobes and triggers, and/or turn it 
into a really intuitive command line UI, and people will use it.

I think annotated assembly/source output is a really nice match for 
triggers and kprobes, so I'd suggest the Slang TUI route ...

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-05  8:24               ` Ingo Molnar
@ 2013-11-05 14:27                 ` Jiri Olsa
  -1 siblings, 0 replies; 76+ messages in thread
From: Jiri Olsa @ 2013-11-05 14:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, linux-kernel, linux-mm, Linus Torvalds,
	David Ahern, Arnaldo Carvalho de Melo

On Tue, Nov 05, 2013 at 09:24:51AM +0100, Ingo Molnar wrote:

SNIP

> > 
> > Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
> > on the proposed interface for toggle events.
> > 
> > It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
> > yet found a real direction to take for the tooling interface IIRC. For example the perf record
> > cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
> > And we really don't want to take a wrong direction for that as it's going to be complicated
> > to handle in any case.
> > 
> > See this thread:
> > https://lwn.net/Articles/568602/
> 
> At the risk of hijacking this discussion, here's my take on triggers:
> 
> I think the primary interface should be to allow the disabling/enabling of 
> a specific event from other events.
> 
> From user-space it would be fd driven: add a perf attribute to allow a 
> specific event to set the state of another event if it triggers. The 
> 'other event' would be an fd, similar to how group events are specified.
> 
> An 'off' trigger sets the state to 0 (disabled).
> An 'on' trigger sets the state to 1 (enabled).
> 
> Using such a facility the measurement of deltas would need 3 events:
> 
>  - fd1: a cycles event that is created disabled
> 
>  - fd2: a kprobes event at the 'start' RIP, set to counting only,
>         connected to fd1, setting state to '1'
> 
>  - fd3: a kprobes event at the 'stop' RIP, set to counting only,
>         connected to fd1, setting state to '0'.
> 
> This way every time the (fd2) start-RIP kprobes event executes, the 
> trigger code sees that it's supposed to enable the (fd1) cycles event. 
> Every time the (fd3) stop-RIP kprobes event executes, the trigger code 
> sees that it's set to disable the (fd1) cycles event.

that's more or less how the current code works,
you can check this wiki for details:
https://perf.wiki.kernel.org/index.php/Jolsa_Features_Togle_Event

> 
> Instead of 'cycles event', it could count instructions, or pagefaults, or 
> cachemisses.

we made it general for any kind of event

> 
> ( If the (fd1) cycles event is a sampling event then this would allow nice 
>   things like the profiling of individual functions within the context of 
>   a specific system call, driven by triggers. )
> 
> In theory we could allow self-referential triggers as well: the first 
> execution of the trigger would disable itself. If the trigger state is not 
> on/off but a counter then this would allow 'take 100 samples then shut 
> off' type of functionality as well.

ok, there's something similar in ftrace and we already
discussed this for perf.. I'll check

> 
> But success primarily depends on how useful the tooling UI turns out to 
> be: create a nice Slang or GTK UI for kprobes and triggers, and/or turn it 
> into a really intuitive command line UI, and people will use it.
> 
> I think annotated assembly/source output is a really nice match for 
> triggers and kprobes, so I'd suggest the Slang TUI route ...

yep, current toggling command line UI is not much user friendly

but perhaps we should leave it there (because it seems it wont
get much better anyway) and focus more on Slang UI as the
target one..

CCing Arnaldo ;-)

thanks,
jirka

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-05 14:27                 ` Jiri Olsa
  0 siblings, 0 replies; 76+ messages in thread
From: Jiri Olsa @ 2013-11-05 14:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, linux-kernel, linux-mm, Linus Torvalds,
	David Ahern, Arnaldo Carvalho de Melo

On Tue, Nov 05, 2013 at 09:24:51AM +0100, Ingo Molnar wrote:

SNIP

> > 
> > Yeah that would be a nice interface. Speaking about that, it would be nice to get your input
> > on the proposed interface for toggle events.
> > 
> > It's still in an RFC state, although it's getting quite elaborated, and I believe we haven't
> > yet found a real direction to take for the tooling interface IIRC. For example the perf record
> > cmdline used to state toggle events based contexts was one of the parts we were not that confident about.
> > And we really don't want to take a wrong direction for that as it's going to be complicated
> > to handle in any case.
> > 
> > See this thread:
> > https://lwn.net/Articles/568602/
> 
> At the risk of hijacking this discussion, here's my take on triggers:
> 
> I think the primary interface should be to allow the disabling/enabling of 
> a specific event from other events.
> 
> From user-space it would be fd driven: add a perf attribute to allow a 
> specific event to set the state of another event if it triggers. The 
> 'other event' would be an fd, similar to how group events are specified.
> 
> An 'off' trigger sets the state to 0 (disabled).
> An 'on' trigger sets the state to 1 (enabled).
> 
> Using such a facility the measurement of deltas would need 3 events:
> 
>  - fd1: a cycles event that is created disabled
> 
>  - fd2: a kprobes event at the 'start' RIP, set to counting only,
>         connected to fd1, setting state to '1'
> 
>  - fd3: a kprobes event at the 'stop' RIP, set to counting only,
>         connected to fd1, setting state to '0'.
> 
> This way every time the (fd2) start-RIP kprobes event executes, the 
> trigger code sees that it's supposed to enable the (fd1) cycles event. 
> Every time the (fd3) stop-RIP kprobes event executes, the trigger code 
> sees that it's set to disable the (fd1) cycles event.

that's more or less how the current code works,
you can check this wiki for details:
https://perf.wiki.kernel.org/index.php/Jolsa_Features_Togle_Event

> 
> Instead of 'cycles event', it could count instructions, or pagefaults, or 
> cachemisses.

we made it general for any kind of event

> 
> ( If the (fd1) cycles event is a sampling event then this would allow nice 
>   things like the profiling of individual functions within the context of 
>   a specific system call, driven by triggers. )
> 
> In theory we could allow self-referential triggers as well: the first 
> execution of the trigger would disable itself. If the trigger state is not 
> on/off but a counter then this would allow 'take 100 samples then shut 
> off' type of functionality as well.

ok, there's something similar in ftrace and we already
discussed this for perf.. I'll check

> 
> But success primarily depends on how useful the tooling UI turns out to 
> be: create a nice Slang or GTK UI for kprobes and triggers, and/or turn it 
> into a really intuitive command line UI, and people will use it.
> 
> I think annotated assembly/source output is a really nice match for 
> triggers and kprobes, so I'd suggest the Slang TUI route ...

yep, current toggling command line UI is not much user friendly

but perhaps we should leave it there (because it seems it wont
get much better anyway) and focus more on Slang UI as the
target one..

CCing Arnaldo ;-)

thanks,
jirka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-05 14:27                 ` Jiri Olsa
@ 2013-11-06  6:01                   ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-06  6:01 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, linux-kernel, linux-mm, Linus Torvalds,
	David Ahern, Arnaldo Carvalho de Melo


* Jiri Olsa <jolsa@redhat.com> wrote:

> > But success primarily depends on how useful the tooling UI turns out 
> > to be: create a nice Slang or GTK UI for kprobes and triggers, and/or 
> > turn it into a really intuitive command line UI, and people will use 
> > it.
> > 
> > I think annotated assembly/source output is a really nice match for 
> > triggers and kprobes, so I'd suggest the Slang TUI route ...
> 
> yep, current toggling command line UI is not much user friendly
> 
> but perhaps we should leave it there (because it seems it wont get much 
> better anyway) and focus more on Slang UI as the target one..
> 
> CCing Arnaldo ;-)

Btw., I think we should do the TUI interface _before_ we can merge the 
kernel changes. Frankly, 'not very user friendly' means that it's not used 
(and tested) much - which begs the question: why merge the feature at all?

Making a new kernel feature usable to as many people as possible must be a 
primary concern, not an afterthought.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-06  6:01                   ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-06  6:01 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, linux-kernel, linux-mm, Linus Torvalds,
	David Ahern, Arnaldo Carvalho de Melo


* Jiri Olsa <jolsa@redhat.com> wrote:

> > But success primarily depends on how useful the tooling UI turns out 
> > to be: create a nice Slang or GTK UI for kprobes and triggers, and/or 
> > turn it into a really intuitive command line UI, and people will use 
> > it.
> > 
> > I think annotated assembly/source output is a really nice match for 
> > triggers and kprobes, so I'd suggest the Slang TUI route ...
> 
> yep, current toggling command line UI is not much user friendly
> 
> but perhaps we should leave it there (because it seems it wont get much 
> better anyway) and focus more on Slang UI as the target one..
> 
> CCing Arnaldo ;-)

Btw., I think we should do the TUI interface _before_ we can merge the 
kernel changes. Frankly, 'not very user friendly' means that it's not used 
(and tested) much - which begs the question: why merge the feature at all?

Making a new kernel feature usable to as many people as possible must be a 
primary concern, not an afterthought.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-06  6:01                   ` Ingo Molnar
@ 2013-11-06 14:03                     ` Konstantin Khlebnikov
  -1 siblings, 0 replies; 76+ messages in thread
From: Konstantin Khlebnikov @ 2013-11-06 14:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Olsa, Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, Linux Kernel Mailing List, linux-mm,
	Linus Torvalds, David Ahern, Arnaldo Carvalho de Melo

Some time ago I've thought about caching vma on PTE's struct page.
This will work for all huge vmas not only for largest ones.

Of course this requires some reordering in do_page_fault because
currently it lookups vma before pte for obvious reason.


On Wed, Nov 6, 2013 at 10:01 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Jiri Olsa <jolsa@redhat.com> wrote:
>
>> > But success primarily depends on how useful the tooling UI turns out
>> > to be: create a nice Slang or GTK UI for kprobes and triggers, and/or
>> > turn it into a really intuitive command line UI, and people will use
>> > it.
>> >
>> > I think annotated assembly/source output is a really nice match for
>> > triggers and kprobes, so I'd suggest the Slang TUI route ...
>>
>> yep, current toggling command line UI is not much user friendly
>>
>> but perhaps we should leave it there (because it seems it wont get much
>> better anyway) and focus more on Slang UI as the target one..
>>
>> CCing Arnaldo ;-)
>
> Btw., I think we should do the TUI interface _before_ we can merge the
> kernel changes. Frankly, 'not very user friendly' means that it's not used
> (and tested) much - which begs the question: why merge the feature at all?
>
> Making a new kernel feature usable to as many people as possible must be a
> primary concern, not an afterthought.
>
> Thanks,
>
>         Ingo
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-06 14:03                     ` Konstantin Khlebnikov
  0 siblings, 0 replies; 76+ messages in thread
From: Konstantin Khlebnikov @ 2013-11-06 14:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jiri Olsa, Frederic Weisbecker, Davidlohr Bueso, Andrew Morton,
	Hugh Dickins, Michel Lespinasse, Mel Gorman, Rik van Riel,
	Guan Xuetao, aswin, Linux Kernel Mailing List, linux-mm,
	Linus Torvalds, David Ahern, Arnaldo Carvalho de Melo

Some time ago I've thought about caching vma on PTE's struct page.
This will work for all huge vmas not only for largest ones.

Of course this requires some reordering in do_page_fault because
currently it lookups vma before pte for obvious reason.


On Wed, Nov 6, 2013 at 10:01 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Jiri Olsa <jolsa@redhat.com> wrote:
>
>> > But success primarily depends on how useful the tooling UI turns out
>> > to be: create a nice Slang or GTK UI for kprobes and triggers, and/or
>> > turn it into a really intuitive command line UI, and people will use
>> > it.
>> >
>> > I think annotated assembly/source output is a really nice match for
>> > triggers and kprobes, so I'd suggest the Slang TUI route ...
>>
>> yep, current toggling command line UI is not much user friendly
>>
>> but perhaps we should leave it there (because it seems it wont get much
>> better anyway) and focus more on Slang UI as the target one..
>>
>> CCing Arnaldo ;-)
>
> Btw., I think we should do the TUI interface _before_ we can merge the
> kernel changes. Frankly, 'not very user friendly' means that it's not used
> (and tested) much - which begs the question: why merge the feature at all?
>
> Making a new kernel feature usable to as many people as possible must be a
> primary concern, not an afterthought.
>
> Thanks,
>
>         Ingo
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-04  7:36       ` Ingo Molnar
@ 2013-11-11  4:12         ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11  4:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

Hi Ingo,

On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > really looked at the problem like this) as well as Ingo's suggestion on 
> > the weighted LRU approach. However, having seen that we can cheaply and 
> > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > how good is good enough?
> 
> So I think it all really depends on the hit/miss cost difference. It makes 
> little sense to add a more complex scheme if it washes out most of the 
> benefits!
> 
> Also note the historic context: the _original_ mmap_cache, that I 
> implemented 16 years ago, was a front-line cache to a linear list walk 
> over all vmas (!).
> 
> This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> 
> /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> {
>         struct vm_area_struct *vma = NULL;
> 
>         if (mm) {
>                 /* Check the cache first. */
>                 vma = mm->mmap_cache;
>                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
>                         vma = mm->mmap;
>                         while(vma && vma->vm_end <= addr)
>                                 vma = vma->vm_next;
>                         mm->mmap_cache = vma;
>                 }
>         }
>         return vma;
> }
> 
> See that vma->vm_next iteration? It was awful - but back then most of us 
> had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> no worries - the mm was really simple back then.
> 
> Today we have the vma rbtree, which is self-balancing and a lot faster 
> than your typical linear list walk search ;-)
> 
> So I'd _really_ suggest to first examine the assumptions behind the cache, 
> it being named 'cache' and it having a hit rate does in itself not 
> guarantee that it gives us any worthwile cost savings when put in front of 
> an rbtree ...

So having mmap_cache around, in whatever form, is an important
optimization for find_vma() - even to this day. It can save us at least
50% cycles that correspond to this function. I ran a variety of
mmap_cache alternatives over two workloads that are heavy on page faults
(as opposed to Java based ones I had tried previously, which really
don't trigger enough for it to be worthwhile).  So we now have a
comparison of 5 different caching schemes -- note that the 4 element
hash table is quite similar to two elements, with a hash function of
(addr % hash_size).

1) Kernel build
+------------------------+----------+------------------+---------+
|    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
+------------------------+----------+------------------+---------+
| no mmap_cache          | -        | 15.85            | 0.10066 |
| current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
| mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
| 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
| per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
+------------------------+----------+------------------+---------+

In this particular workload the proposed patch benefits the most and
current alternatives, while they do help some, aren't really worth
bothering with as the current implementation already does a nice enough
job.

2) Oracle Data mining (4K pages)
+------------------------+----------+------------------+---------+
|    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
+------------------------+----------+------------------+---------+
| no mmap_cache          | -        | 63.35            | 0.20207 |
| current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
| mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
| 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
| per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
+------------------------+----------+------------------+---------+

This workload sure makes the point of how much we can benefit of caching
the vma, otherwise find_vma() can cost more than 220% extra cycles. We
clearly win here by having a per-thread cache instead of per address
space. I also tried the same workload with 2Mb hugepages and the results
are much more closer to the kernel build, but with the per-thread vma
still winning over the rest of the alternatives.

All in all I think that we should probably have a per-thread vma cache.
Please let me know if there is some other workload you'd like me to try
out. If folks agree then I can cleanup the patch and send it out.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11  4:12         ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11  4:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

Hi Ingo,

On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > really looked at the problem like this) as well as Ingo's suggestion on 
> > the weighted LRU approach. However, having seen that we can cheaply and 
> > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > how good is good enough?
> 
> So I think it all really depends on the hit/miss cost difference. It makes 
> little sense to add a more complex scheme if it washes out most of the 
> benefits!
> 
> Also note the historic context: the _original_ mmap_cache, that I 
> implemented 16 years ago, was a front-line cache to a linear list walk 
> over all vmas (!).
> 
> This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> 
> /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> {
>         struct vm_area_struct *vma = NULL;
> 
>         if (mm) {
>                 /* Check the cache first. */
>                 vma = mm->mmap_cache;
>                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
>                         vma = mm->mmap;
>                         while(vma && vma->vm_end <= addr)
>                                 vma = vma->vm_next;
>                         mm->mmap_cache = vma;
>                 }
>         }
>         return vma;
> }
> 
> See that vma->vm_next iteration? It was awful - but back then most of us 
> had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> no worries - the mm was really simple back then.
> 
> Today we have the vma rbtree, which is self-balancing and a lot faster 
> than your typical linear list walk search ;-)
> 
> So I'd _really_ suggest to first examine the assumptions behind the cache, 
> it being named 'cache' and it having a hit rate does in itself not 
> guarantee that it gives us any worthwile cost savings when put in front of 
> an rbtree ...

So having mmap_cache around, in whatever form, is an important
optimization for find_vma() - even to this day. It can save us at least
50% cycles that correspond to this function. I ran a variety of
mmap_cache alternatives over two workloads that are heavy on page faults
(as opposed to Java based ones I had tried previously, which really
don't trigger enough for it to be worthwhile).  So we now have a
comparison of 5 different caching schemes -- note that the 4 element
hash table is quite similar to two elements, with a hash function of
(addr % hash_size).

1) Kernel build
+------------------------+----------+------------------+---------+
|    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
+------------------------+----------+------------------+---------+
| no mmap_cache          | -        | 15.85            | 0.10066 |
| current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
| mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
| 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
| per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
+------------------------+----------+------------------+---------+

In this particular workload the proposed patch benefits the most and
current alternatives, while they do help some, aren't really worth
bothering with as the current implementation already does a nice enough
job.

2) Oracle Data mining (4K pages)
+------------------------+----------+------------------+---------+
|    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
+------------------------+----------+------------------+---------+
| no mmap_cache          | -        | 63.35            | 0.20207 |
| current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
| mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
| 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
| per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
+------------------------+----------+------------------+---------+

This workload sure makes the point of how much we can benefit of caching
the vma, otherwise find_vma() can cost more than 220% extra cycles. We
clearly win here by having a per-thread cache instead of per address
space. I also tried the same workload with 2Mb hugepages and the results
are much more closer to the kernel build, but with the per-thread vma
still winning over the rest of the alternatives.

All in all I think that we should probably have a per-thread vma cache.
Please let me know if there is some other workload you'd like me to try
out. If folks agree then I can cleanup the patch and send it out.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: converting unicore32 to gate_vma as done for arm (was Re:?? [PATCH] mm: cache largest vma)
  2013-11-05  2:49         ` 管雪涛
@ 2013-11-11  7:25           ` Al Viro
  -1 siblings, 0 replies; 76+ messages in thread
From: Al Viro @ 2013-11-11  7:25 UTC (permalink / raw)
  To: ?????????
  Cc: Davidlohr Bueso, Ingo Molnar, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds

On Tue, Nov 05, 2013 at 10:49:15AM +0800, ????????? wrote:
> The patch is ok for unicore32. Thanks Al.
> 
> While testing this patch, a bug is found in arch/unicore32/include/asm/pgtable.h:
> 
> @@ -96,7 +96,7 @@ extern pgprot_t pgprot_kernel;
>                                                                 | PTE_EXEC)
>  #define PAGE_READONLY          __pgprot(pgprot_val(pgprot_user | PTE_READ)
>  #define PAGE_READONLY_EXEC     __pgprot(pgprot_val(pgprot_user | PTE_READ \
> -                                                               | PTE_EXEC)
> +                                                               | PTE_EXEC))
> 
> In fact, all similar macros are wrong. I'll post an bug-fix patch for this obvious error.

BTW, another missing thing is an analog of commit 9b61a4 (ARM: prevent
VM_GROWSDOWN mmaps extending below FIRST_USER_ADDRESS); I'm not sure why
does unicore32 have FIRST_USER_ADDRESS set to PAGE_SIZE (some no-MMU
arm variants really need that, what with the vectors page living at
address 0 on those), but since you have it set that way, you'd probably
better not allow a mapping to grow down there...

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: converting unicore32 to gate_vma as done for arm (was Re:?? [PATCH] mm: cache largest vma)
@ 2013-11-11  7:25           ` Al Viro
  0 siblings, 0 replies; 76+ messages in thread
From: Al Viro @ 2013-11-11  7:25 UTC (permalink / raw)
  To: ?????????
  Cc: Davidlohr Bueso, Ingo Molnar, Andrew Morton, Hugh Dickins,
	Michel Lespinasse, Mel Gorman, Rik van Riel, Guan Xuetao, aswin,
	linux-kernel, linux-mm, Linus Torvalds

On Tue, Nov 05, 2013 at 10:49:15AM +0800, ????????? wrote:
> The patch is ok for unicore32. Thanks Al.
> 
> While testing this patch, a bug is found in arch/unicore32/include/asm/pgtable.h:
> 
> @@ -96,7 +96,7 @@ extern pgprot_t pgprot_kernel;
>                                                                 | PTE_EXEC)
>  #define PAGE_READONLY          __pgprot(pgprot_val(pgprot_user | PTE_READ)
>  #define PAGE_READONLY_EXEC     __pgprot(pgprot_val(pgprot_user | PTE_READ \
> -                                                               | PTE_EXEC)
> +                                                               | PTE_EXEC))
> 
> In fact, all similar macros are wrong. I'll post an bug-fix patch for this obvious error.

BTW, another missing thing is an analog of commit 9b61a4 (ARM: prevent
VM_GROWSDOWN mmaps extending below FIRST_USER_ADDRESS); I'm not sure why
does unicore32 have FIRST_USER_ADDRESS set to PAGE_SIZE (some no-MMU
arm variants really need that, what with the vectors page living at
address 0 on those), but since you have it set that way, you'd probably
better not allow a mapping to grow down there...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11  4:12         ` Davidlohr Bueso
@ 2013-11-11  7:43           ` Michel Lespinasse
  -1 siblings, 0 replies; 76+ messages in thread
From: Michel Lespinasse @ 2013-11-11  7:43 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> 2) Oracle Data mining (4K pages)
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 63.35            | 0.20207 |
> | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> +------------------------+----------+------------------+---------+
>
> This workload sure makes the point of how much we can benefit of caching
> the vma, otherwise find_vma() can cost more than 220% extra cycles. We
> clearly win here by having a per-thread cache instead of per address
> space. I also tried the same workload with 2Mb hugepages and the results
> are much more closer to the kernel build, but with the per-thread vma
> still winning over the rest of the alternatives.
>
> All in all I think that we should probably have a per-thread vma cache.
> Please let me know if there is some other workload you'd like me to try
> out. If folks agree then I can cleanup the patch and send it out.

Per thread cache sounds interesting - with per-mm caches there is a
real risk that some modern threaded apps pay the cost of cache updates
without seeing much of the benefit. However, how do you cheaply handle
invalidations for the per thread cache ?

If you have a nice simple scheme for invalidations, I could see per
thread LRU cache working well.

That said, the difficulty with this kind of measurements
(instrumenting code to fish out the cost of a particular function) is
that it would be easy to lose somewhere else - for example for keeping
the cache up to date - and miss that on the instrumented measurement.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11  7:43           ` Michel Lespinasse
  0 siblings, 0 replies; 76+ messages in thread
From: Michel Lespinasse @ 2013-11-11  7:43 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> 2) Oracle Data mining (4K pages)
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 63.35            | 0.20207 |
> | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> +------------------------+----------+------------------+---------+
>
> This workload sure makes the point of how much we can benefit of caching
> the vma, otherwise find_vma() can cost more than 220% extra cycles. We
> clearly win here by having a per-thread cache instead of per address
> space. I also tried the same workload with 2Mb hugepages and the results
> are much more closer to the kernel build, but with the per-thread vma
> still winning over the rest of the alternatives.
>
> All in all I think that we should probably have a per-thread vma cache.
> Please let me know if there is some other workload you'd like me to try
> out. If folks agree then I can cleanup the patch and send it out.

Per thread cache sounds interesting - with per-mm caches there is a
real risk that some modern threaded apps pay the cost of cache updates
without seeing much of the benefit. However, how do you cheaply handle
invalidations for the per thread cache ?

If you have a nice simple scheme for invalidations, I could see per
thread LRU cache working well.

That said, the difficulty with this kind of measurements
(instrumenting code to fish out the cost of a particular function) is
that it would be easy to lose somewhere else - for example for keeping
the cache up to date - and miss that on the instrumented measurement.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11  4:12         ` Davidlohr Bueso
@ 2013-11-11 12:01           ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 12:01 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> Hi Ingo,
> 
> On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > how good is good enough?
> > 
> > So I think it all really depends on the hit/miss cost difference. It makes 
> > little sense to add a more complex scheme if it washes out most of the 
> > benefits!
> > 
> > Also note the historic context: the _original_ mmap_cache, that I 
> > implemented 16 years ago, was a front-line cache to a linear list walk 
> > over all vmas (!).
> > 
> > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > 
> > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > {
> >         struct vm_area_struct *vma = NULL;
> > 
> >         if (mm) {
> >                 /* Check the cache first. */
> >                 vma = mm->mmap_cache;
> >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> >                         vma = mm->mmap;
> >                         while(vma && vma->vm_end <= addr)
> >                                 vma = vma->vm_next;
> >                         mm->mmap_cache = vma;
> >                 }
> >         }
> >         return vma;
> > }
> > 
> > See that vma->vm_next iteration? It was awful - but back then most of us 
> > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > no worries - the mm was really simple back then.
> > 
> > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > than your typical linear list walk search ;-)
> > 
> > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > it being named 'cache' and it having a hit rate does in itself not 
> > guarantee that it gives us any worthwile cost savings when put in front of 
> > an rbtree ...
> 
> So having mmap_cache around, in whatever form, is an important
> optimization for find_vma() - even to this day. It can save us at least
> 50% cycles that correspond to this function. [...]

I'm glad it still helps! :-)

> [...] I ran a variety of mmap_cache alternatives over two workloads that 
> are heavy on page faults (as opposed to Java based ones I had tried 
> previously, which really don't trigger enough for it to be worthwhile).  
> So we now have a comparison of 5 different caching schemes -- note that 
> the 4 element hash table is quite similar to two elements, with a hash 
> function of (addr % hash_size).
> 
> 1) Kernel build
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 15.85            | 0.10066 |
> | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> +------------------------+----------+------------------+---------+
> 
> In this particular workload the proposed patch benefits the most and 
> current alternatives, while they do help some, aren't really worth 
> bothering with as the current implementation already does a nice enough 
> job.

Interesting.

> 2) Oracle Data mining (4K pages)
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 63.35            | 0.20207 |
> | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> +------------------------+----------+------------------+---------+
> 
> This workload sure makes the point of how much we can benefit of caching 
> the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> clearly win here by having a per-thread cache instead of per address 
> space. I also tried the same workload with 2Mb hugepages and the results 
> are much more closer to the kernel build, but with the per-thread vma 
> still winning over the rest of the alternatives.

That's also very interesting, and it's exactly the kind of data we need to 
judge such matters. Kernel builds and DB loads are two very different, yet 
important workloads, so if we improve both cases then the probability that 
we improve all other workloads as well increases substantially.

Do you have any data on the number of find_vma() calls performed in these 
two cases, so that we can know the per function call average cost?

It's that per call number that tells us what kind of cache abstraction we 
want to use (if any). In the first approximation every extra step of 
abstraction will add a constant cycle cost, so the per function call 
average cost directly matters to the cost/benefit ratio.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 12:01           ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 12:01 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> Hi Ingo,
> 
> On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > how good is good enough?
> > 
> > So I think it all really depends on the hit/miss cost difference. It makes 
> > little sense to add a more complex scheme if it washes out most of the 
> > benefits!
> > 
> > Also note the historic context: the _original_ mmap_cache, that I 
> > implemented 16 years ago, was a front-line cache to a linear list walk 
> > over all vmas (!).
> > 
> > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > 
> > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > {
> >         struct vm_area_struct *vma = NULL;
> > 
> >         if (mm) {
> >                 /* Check the cache first. */
> >                 vma = mm->mmap_cache;
> >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> >                         vma = mm->mmap;
> >                         while(vma && vma->vm_end <= addr)
> >                                 vma = vma->vm_next;
> >                         mm->mmap_cache = vma;
> >                 }
> >         }
> >         return vma;
> > }
> > 
> > See that vma->vm_next iteration? It was awful - but back then most of us 
> > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > no worries - the mm was really simple back then.
> > 
> > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > than your typical linear list walk search ;-)
> > 
> > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > it being named 'cache' and it having a hit rate does in itself not 
> > guarantee that it gives us any worthwile cost savings when put in front of 
> > an rbtree ...
> 
> So having mmap_cache around, in whatever form, is an important
> optimization for find_vma() - even to this day. It can save us at least
> 50% cycles that correspond to this function. [...]

I'm glad it still helps! :-)

> [...] I ran a variety of mmap_cache alternatives over two workloads that 
> are heavy on page faults (as opposed to Java based ones I had tried 
> previously, which really don't trigger enough for it to be worthwhile).  
> So we now have a comparison of 5 different caching schemes -- note that 
> the 4 element hash table is quite similar to two elements, with a hash 
> function of (addr % hash_size).
> 
> 1) Kernel build
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 15.85            | 0.10066 |
> | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> +------------------------+----------+------------------+---------+
> 
> In this particular workload the proposed patch benefits the most and 
> current alternatives, while they do help some, aren't really worth 
> bothering with as the current implementation already does a nice enough 
> job.

Interesting.

> 2) Oracle Data mining (4K pages)
> +------------------------+----------+------------------+---------+
> |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> +------------------------+----------+------------------+---------+
> | no mmap_cache          | -        | 63.35            | 0.20207 |
> | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> +------------------------+----------+------------------+---------+
> 
> This workload sure makes the point of how much we can benefit of caching 
> the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> clearly win here by having a per-thread cache instead of per address 
> space. I also tried the same workload with 2Mb hugepages and the results 
> are much more closer to the kernel build, but with the per-thread vma 
> still winning over the rest of the alternatives.

That's also very interesting, and it's exactly the kind of data we need to 
judge such matters. Kernel builds and DB loads are two very different, yet 
important workloads, so if we improve both cases then the probability that 
we improve all other workloads as well increases substantially.

Do you have any data on the number of find_vma() calls performed in these 
two cases, so that we can know the per function call average cost?

It's that per call number that tells us what kind of cache abstraction we 
want to use (if any). In the first approximation every extra step of 
abstraction will add a constant cycle cost, so the per function call 
average cost directly matters to the cost/benefit ratio.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11  7:43           ` Michel Lespinasse
@ 2013-11-11 12:04             ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 12:04 UTC (permalink / raw)
  To: Michel Lespinasse, Peter Zijlstra
  Cc: Davidlohr Bueso, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Michel Lespinasse <walken@google.com> wrote:

> On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 2) Oracle Data mining (4K pages)
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > +------------------------+----------+------------------+---------+
> >
> > This workload sure makes the point of how much we can benefit of 
> > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > cycles. We clearly win here by having a per-thread cache instead of 
> > per address space. I also tried the same workload with 2Mb hugepages 
> > and the results are much more closer to the kernel build, but with the 
> > per-thread vma still winning over the rest of the alternatives.
> >
> > All in all I think that we should probably have a per-thread vma 
> > cache. Please let me know if there is some other workload you'd like 
> > me to try out. If folks agree then I can cleanup the patch and send it 
> > out.
> 
> Per thread cache sounds interesting - with per-mm caches there is a real 
> risk that some modern threaded apps pay the cost of cache updates 
> without seeing much of the benefit. However, how do you cheaply handle 
> invalidations for the per thread cache ?

The cheapest way to handle that would be to have a generation counter for 
the mm and to couple cache validity to a specific value of that. 
'Invalidation' is then the free side effect of bumping the generation 
counter when a vma is removed/moved.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 12:04             ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 12:04 UTC (permalink / raw)
  To: Michel Lespinasse, Peter Zijlstra
  Cc: Davidlohr Bueso, Linus Torvalds, Andrew Morton, Hugh Dickins,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Michel Lespinasse <walken@google.com> wrote:

> On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 2) Oracle Data mining (4K pages)
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > +------------------------+----------+------------------+---------+
> >
> > This workload sure makes the point of how much we can benefit of 
> > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > cycles. We clearly win here by having a per-thread cache instead of 
> > per address space. I also tried the same workload with 2Mb hugepages 
> > and the results are much more closer to the kernel build, but with the 
> > per-thread vma still winning over the rest of the alternatives.
> >
> > All in all I think that we should probably have a per-thread vma 
> > cache. Please let me know if there is some other workload you'd like 
> > me to try out. If folks agree then I can cleanup the patch and send it 
> > out.
> 
> Per thread cache sounds interesting - with per-mm caches there is a real 
> risk that some modern threaded apps pay the cost of cache updates 
> without seeing much of the benefit. However, how do you cheaply handle 
> invalidations for the per thread cache ?

The cheapest way to handle that would be to have a generation counter for 
the mm and to couple cache validity to a specific value of that. 
'Invalidation' is then the free side effect of bumping the generation 
counter when a vma is removed/moved.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 12:01           ` Ingo Molnar
@ 2013-11-11 18:24             ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 18:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > Hi Ingo,
> > 
> > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > how good is good enough?
> > > 
> > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > little sense to add a more complex scheme if it washes out most of the 
> > > benefits!
> > > 
> > > Also note the historic context: the _original_ mmap_cache, that I 
> > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > over all vmas (!).
> > > 
> > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > 
> > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > {
> > >         struct vm_area_struct *vma = NULL;
> > > 
> > >         if (mm) {
> > >                 /* Check the cache first. */
> > >                 vma = mm->mmap_cache;
> > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > >                         vma = mm->mmap;
> > >                         while(vma && vma->vm_end <= addr)
> > >                                 vma = vma->vm_next;
> > >                         mm->mmap_cache = vma;
> > >                 }
> > >         }
> > >         return vma;
> > > }
> > > 
> > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > no worries - the mm was really simple back then.
> > > 
> > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > than your typical linear list walk search ;-)
> > > 
> > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > it being named 'cache' and it having a hit rate does in itself not 
> > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > an rbtree ...
> > 
> > So having mmap_cache around, in whatever form, is an important
> > optimization for find_vma() - even to this day. It can save us at least
> > 50% cycles that correspond to this function. [...]
> 
> I'm glad it still helps! :-)
> 
> > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > are heavy on page faults (as opposed to Java based ones I had tried 
> > previously, which really don't trigger enough for it to be worthwhile).  
> > So we now have a comparison of 5 different caching schemes -- note that 
> > the 4 element hash table is quite similar to two elements, with a hash 
> > function of (addr % hash_size).
> > 
> > 1) Kernel build
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > +------------------------+----------+------------------+---------+
> > 
> > In this particular workload the proposed patch benefits the most and 
> > current alternatives, while they do help some, aren't really worth 
> > bothering with as the current implementation already does a nice enough 
> > job.
> 
> Interesting.
> 
> > 2) Oracle Data mining (4K pages)
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > +------------------------+----------+------------------+---------+
> > 
> > This workload sure makes the point of how much we can benefit of caching 
> > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > clearly win here by having a per-thread cache instead of per address 
> > space. I also tried the same workload with 2Mb hugepages and the results 
> > are much more closer to the kernel build, but with the per-thread vma 
> > still winning over the rest of the alternatives.
> 
> That's also very interesting, and it's exactly the kind of data we need to 
> judge such matters. Kernel builds and DB loads are two very different, yet 
> important workloads, so if we improve both cases then the probability that 
> we improve all other workloads as well increases substantially.
> 
> Do you have any data on the number of find_vma() calls performed in these 
> two cases, so that we can know the per function call average cost?
> 

For the kernel build we get around 140 million calls to find_vma(), and
for Oracle around 27 million. So the function ends up costing
significantly more for the DB workload.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 18:24             ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 18:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > Hi Ingo,
> > 
> > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > how good is good enough?
> > > 
> > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > little sense to add a more complex scheme if it washes out most of the 
> > > benefits!
> > > 
> > > Also note the historic context: the _original_ mmap_cache, that I 
> > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > over all vmas (!).
> > > 
> > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > 
> > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > {
> > >         struct vm_area_struct *vma = NULL;
> > > 
> > >         if (mm) {
> > >                 /* Check the cache first. */
> > >                 vma = mm->mmap_cache;
> > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > >                         vma = mm->mmap;
> > >                         while(vma && vma->vm_end <= addr)
> > >                                 vma = vma->vm_next;
> > >                         mm->mmap_cache = vma;
> > >                 }
> > >         }
> > >         return vma;
> > > }
> > > 
> > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > no worries - the mm was really simple back then.
> > > 
> > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > than your typical linear list walk search ;-)
> > > 
> > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > it being named 'cache' and it having a hit rate does in itself not 
> > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > an rbtree ...
> > 
> > So having mmap_cache around, in whatever form, is an important
> > optimization for find_vma() - even to this day. It can save us at least
> > 50% cycles that correspond to this function. [...]
> 
> I'm glad it still helps! :-)
> 
> > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > are heavy on page faults (as opposed to Java based ones I had tried 
> > previously, which really don't trigger enough for it to be worthwhile).  
> > So we now have a comparison of 5 different caching schemes -- note that 
> > the 4 element hash table is quite similar to two elements, with a hash 
> > function of (addr % hash_size).
> > 
> > 1) Kernel build
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > +------------------------+----------+------------------+---------+
> > 
> > In this particular workload the proposed patch benefits the most and 
> > current alternatives, while they do help some, aren't really worth 
> > bothering with as the current implementation already does a nice enough 
> > job.
> 
> Interesting.
> 
> > 2) Oracle Data mining (4K pages)
> > +------------------------+----------+------------------+---------+
> > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > +------------------------+----------+------------------+---------+
> > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > +------------------------+----------+------------------+---------+
> > 
> > This workload sure makes the point of how much we can benefit of caching 
> > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > clearly win here by having a per-thread cache instead of per address 
> > space. I also tried the same workload with 2Mb hugepages and the results 
> > are much more closer to the kernel build, but with the per-thread vma 
> > still winning over the rest of the alternatives.
> 
> That's also very interesting, and it's exactly the kind of data we need to 
> judge such matters. Kernel builds and DB loads are two very different, yet 
> important workloads, so if we improve both cases then the probability that 
> we improve all other workloads as well increases substantially.
> 
> Do you have any data on the number of find_vma() calls performed in these 
> two cases, so that we can know the per function call average cost?
> 

For the kernel build we get around 140 million calls to find_vma(), and
for Oracle around 27 million. So the function ends up costing
significantly more for the DB workload.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 18:24             ` Davidlohr Bueso
@ 2013-11-11 20:47               ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 20:47 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > Hi Ingo,
> > > 
> > > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 
> > > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > > how good is good enough?
> > > > 
> > > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > > little sense to add a more complex scheme if it washes out most of the 
> > > > benefits!
> > > > 
> > > > Also note the historic context: the _original_ mmap_cache, that I 
> > > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > > over all vmas (!).
> > > > 
> > > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > > 
> > > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > > {
> > > >         struct vm_area_struct *vma = NULL;
> > > > 
> > > >         if (mm) {
> > > >                 /* Check the cache first. */
> > > >                 vma = mm->mmap_cache;
> > > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > > >                         vma = mm->mmap;
> > > >                         while(vma && vma->vm_end <= addr)
> > > >                                 vma = vma->vm_next;
> > > >                         mm->mmap_cache = vma;
> > > >                 }
> > > >         }
> > > >         return vma;
> > > > }
> > > > 
> > > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > > no worries - the mm was really simple back then.
> > > > 
> > > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > > than your typical linear list walk search ;-)
> > > > 
> > > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > > it being named 'cache' and it having a hit rate does in itself not 
> > > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > > an rbtree ...
> > > 
> > > So having mmap_cache around, in whatever form, is an important
> > > optimization for find_vma() - even to this day. It can save us at least
> > > 50% cycles that correspond to this function. [...]
> > 
> > I'm glad it still helps! :-)
> > 
> > > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > > are heavy on page faults (as opposed to Java based ones I had tried 
> > > previously, which really don't trigger enough for it to be worthwhile).  
> > > So we now have a comparison of 5 different caching schemes -- note that 
> > > the 4 element hash table is quite similar to two elements, with a hash 
> > > function of (addr % hash_size).
> > > 
> > > 1) Kernel build
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > > +------------------------+----------+------------------+---------+
> > > 
> > > In this particular workload the proposed patch benefits the most and 
> > > current alternatives, while they do help some, aren't really worth 
> > > bothering with as the current implementation already does a nice enough 
> > > job.
> > 
> > Interesting.
> > 
> > > 2) Oracle Data mining (4K pages)
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > +------------------------+----------+------------------+---------+
> > > 
> > > This workload sure makes the point of how much we can benefit of caching 
> > > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > > clearly win here by having a per-thread cache instead of per address 
> > > space. I also tried the same workload with 2Mb hugepages and the results 
> > > are much more closer to the kernel build, but with the per-thread vma 
> > > still winning over the rest of the alternatives.
> > 
> > That's also very interesting, and it's exactly the kind of data we need to 
> > judge such matters. Kernel builds and DB loads are two very different, yet 
> > important workloads, so if we improve both cases then the probability that 
> > we improve all other workloads as well increases substantially.
> > 
> > Do you have any data on the number of find_vma() calls performed in these 
> > two cases, so that we can know the per function call average cost?
> > 
> 
> For the kernel build we get around 140 million calls to find_vma(), and 
> for Oracle around 27 million. So the function ends up costing 
> significantly more for the DB workload.

Hm, mind tabulating that into per function call (cycles) and such, for an 
easier overview?

I do think the Oracle case might be pinpointing a separate 
bug/problem/property: unless it's using an obscene number of vmas its 
rbtree should have a manageable depth, what is the average (accessed) 
depth of the rbtree, and is it properly balanced?

Or is access to varied in the Oracle case that it's missing the cache all 
the time, because the rbtree causes many cachemisses as the separate nodes 
are accessed during an rb-walk?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 20:47               ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 20:47 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > 
> > > Hi Ingo,
> > > 
> > > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 
> > > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > > how good is good enough?
> > > > 
> > > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > > little sense to add a more complex scheme if it washes out most of the 
> > > > benefits!
> > > > 
> > > > Also note the historic context: the _original_ mmap_cache, that I 
> > > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > > over all vmas (!).
> > > > 
> > > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > > 
> > > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > > {
> > > >         struct vm_area_struct *vma = NULL;
> > > > 
> > > >         if (mm) {
> > > >                 /* Check the cache first. */
> > > >                 vma = mm->mmap_cache;
> > > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > > >                         vma = mm->mmap;
> > > >                         while(vma && vma->vm_end <= addr)
> > > >                                 vma = vma->vm_next;
> > > >                         mm->mmap_cache = vma;
> > > >                 }
> > > >         }
> > > >         return vma;
> > > > }
> > > > 
> > > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > > no worries - the mm was really simple back then.
> > > > 
> > > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > > than your typical linear list walk search ;-)
> > > > 
> > > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > > it being named 'cache' and it having a hit rate does in itself not 
> > > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > > an rbtree ...
> > > 
> > > So having mmap_cache around, in whatever form, is an important
> > > optimization for find_vma() - even to this day. It can save us at least
> > > 50% cycles that correspond to this function. [...]
> > 
> > I'm glad it still helps! :-)
> > 
> > > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > > are heavy on page faults (as opposed to Java based ones I had tried 
> > > previously, which really don't trigger enough for it to be worthwhile).  
> > > So we now have a comparison of 5 different caching schemes -- note that 
> > > the 4 element hash table is quite similar to two elements, with a hash 
> > > function of (addr % hash_size).
> > > 
> > > 1) Kernel build
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > > +------------------------+----------+------------------+---------+
> > > 
> > > In this particular workload the proposed patch benefits the most and 
> > > current alternatives, while they do help some, aren't really worth 
> > > bothering with as the current implementation already does a nice enough 
> > > job.
> > 
> > Interesting.
> > 
> > > 2) Oracle Data mining (4K pages)
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > +------------------------+----------+------------------+---------+
> > > 
> > > This workload sure makes the point of how much we can benefit of caching 
> > > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > > clearly win here by having a per-thread cache instead of per address 
> > > space. I also tried the same workload with 2Mb hugepages and the results 
> > > are much more closer to the kernel build, but with the per-thread vma 
> > > still winning over the rest of the alternatives.
> > 
> > That's also very interesting, and it's exactly the kind of data we need to 
> > judge such matters. Kernel builds and DB loads are two very different, yet 
> > important workloads, so if we improve both cases then the probability that 
> > we improve all other workloads as well increases substantially.
> > 
> > Do you have any data on the number of find_vma() calls performed in these 
> > two cases, so that we can know the per function call average cost?
> > 
> 
> For the kernel build we get around 140 million calls to find_vma(), and 
> for Oracle around 27 million. So the function ends up costing 
> significantly more for the DB workload.

Hm, mind tabulating that into per function call (cycles) and such, for an 
easier overview?

I do think the Oracle case might be pinpointing a separate 
bug/problem/property: unless it's using an obscene number of vmas its 
rbtree should have a manageable depth, what is the average (accessed) 
depth of the rbtree, and is it properly balanced?

Or is access to varied in the Oracle case that it's missing the cache all 
the time, because the rbtree causes many cachemisses as the separate nodes 
are accessed during an rb-walk?

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 12:04             ` Ingo Molnar
@ 2013-11-11 20:47               ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 20:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> * Michel Lespinasse <walken@google.com> wrote:
> 
> > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 2) Oracle Data mining (4K pages)
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > +------------------------+----------+------------------+---------+
> > >
> > > This workload sure makes the point of how much we can benefit of 
> > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > cycles. We clearly win here by having a per-thread cache instead of 
> > > per address space. I also tried the same workload with 2Mb hugepages 
> > > and the results are much more closer to the kernel build, but with the 
> > > per-thread vma still winning over the rest of the alternatives.
> > >
> > > All in all I think that we should probably have a per-thread vma 
> > > cache. Please let me know if there is some other workload you'd like 
> > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > out.
> > 
> > Per thread cache sounds interesting - with per-mm caches there is a real 
> > risk that some modern threaded apps pay the cost of cache updates 
> > without seeing much of the benefit. However, how do you cheaply handle 
> > invalidations for the per thread cache ?
> 
> The cheapest way to handle that would be to have a generation counter for 
> the mm and to couple cache validity to a specific value of that. 
> 'Invalidation' is then the free side effect of bumping the generation 
> counter when a vma is removed/moved.

I was basing the invalidations on the freeing of vm_area_cachep, so I
mark current->mmap_cache = NULL whenever we call
kmem_cache_free(vm_area_cachep, ...). But I can see this being a problem
if more than one task's mmap_cache points to the same vma, as we end up
invalidating only one. I'd really like to use a similar logic and base
everything around the existence of the vma instead of adding a counting
infrastructure. Sure we'd end up doing more reads when we do the lookup
in find_vma() but the cost of maintaining it comes free. I just ran into
a similar idea from 2 years ago:
http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html

While there are several things that aren't needed, it does do the
is_kmem_cache() to verify that the vma is still a valid slab.

Thanks,
Davidlohr



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 20:47               ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 20:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> * Michel Lespinasse <walken@google.com> wrote:
> 
> > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 2) Oracle Data mining (4K pages)
> > > +------------------------+----------+------------------+---------+
> > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > +------------------------+----------+------------------+---------+
> > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > +------------------------+----------+------------------+---------+
> > >
> > > This workload sure makes the point of how much we can benefit of 
> > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > cycles. We clearly win here by having a per-thread cache instead of 
> > > per address space. I also tried the same workload with 2Mb hugepages 
> > > and the results are much more closer to the kernel build, but with the 
> > > per-thread vma still winning over the rest of the alternatives.
> > >
> > > All in all I think that we should probably have a per-thread vma 
> > > cache. Please let me know if there is some other workload you'd like 
> > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > out.
> > 
> > Per thread cache sounds interesting - with per-mm caches there is a real 
> > risk that some modern threaded apps pay the cost of cache updates 
> > without seeing much of the benefit. However, how do you cheaply handle 
> > invalidations for the per thread cache ?
> 
> The cheapest way to handle that would be to have a generation counter for 
> the mm and to couple cache validity to a specific value of that. 
> 'Invalidation' is then the free side effect of bumping the generation 
> counter when a vma is removed/moved.

I was basing the invalidations on the freeing of vm_area_cachep, so I
mark current->mmap_cache = NULL whenever we call
kmem_cache_free(vm_area_cachep, ...). But I can see this being a problem
if more than one task's mmap_cache points to the same vma, as we end up
invalidating only one. I'd really like to use a similar logic and base
everything around the existence of the vma instead of adding a counting
infrastructure. Sure we'd end up doing more reads when we do the lookup
in find_vma() but the cost of maintaining it comes free. I just ran into
a similar idea from 2 years ago:
http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html

While there are several things that aren't needed, it does do the
is_kmem_cache() to verify that the vma is still a valid slab.

Thanks,
Davidlohr


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 20:47               ` Ingo Molnar
@ 2013-11-11 20:59                 ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 20:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 21:47 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > Hi Ingo,
> > > > 
> > > > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > > 
> > > > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > > > how good is good enough?
> > > > > 
> > > > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > > > little sense to add a more complex scheme if it washes out most of the 
> > > > > benefits!
> > > > > 
> > > > > Also note the historic context: the _original_ mmap_cache, that I 
> > > > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > > > over all vmas (!).
> > > > > 
> > > > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > > > 
> > > > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > > > {
> > > > >         struct vm_area_struct *vma = NULL;
> > > > > 
> > > > >         if (mm) {
> > > > >                 /* Check the cache first. */
> > > > >                 vma = mm->mmap_cache;
> > > > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > > > >                         vma = mm->mmap;
> > > > >                         while(vma && vma->vm_end <= addr)
> > > > >                                 vma = vma->vm_next;
> > > > >                         mm->mmap_cache = vma;
> > > > >                 }
> > > > >         }
> > > > >         return vma;
> > > > > }
> > > > > 
> > > > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > > > no worries - the mm was really simple back then.
> > > > > 
> > > > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > > > than your typical linear list walk search ;-)
> > > > > 
> > > > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > > > it being named 'cache' and it having a hit rate does in itself not 
> > > > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > > > an rbtree ...
> > > > 
> > > > So having mmap_cache around, in whatever form, is an important
> > > > optimization for find_vma() - even to this day. It can save us at least
> > > > 50% cycles that correspond to this function. [...]
> > > 
> > > I'm glad it still helps! :-)
> > > 
> > > > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > > > are heavy on page faults (as opposed to Java based ones I had tried 
> > > > previously, which really don't trigger enough for it to be worthwhile).  
> > > > So we now have a comparison of 5 different caching schemes -- note that 
> > > > the 4 element hash table is quite similar to two elements, with a hash 
> > > > function of (addr % hash_size).
> > > > 
> > > > 1) Kernel build
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > > > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > > > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > > > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > > > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > > > +------------------------+----------+------------------+---------+
> > > > 
> > > > In this particular workload the proposed patch benefits the most and 
> > > > current alternatives, while they do help some, aren't really worth 
> > > > bothering with as the current implementation already does a nice enough 
> > > > job.
> > > 
> > > Interesting.
> > > 
> > > > 2) Oracle Data mining (4K pages)
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > +------------------------+----------+------------------+---------+
> > > > 
> > > > This workload sure makes the point of how much we can benefit of caching 
> > > > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > > > clearly win here by having a per-thread cache instead of per address 
> > > > space. I also tried the same workload with 2Mb hugepages and the results 
> > > > are much more closer to the kernel build, but with the per-thread vma 
> > > > still winning over the rest of the alternatives.
> > > 
> > > That's also very interesting, and it's exactly the kind of data we need to 
> > > judge such matters. Kernel builds and DB loads are two very different, yet 
> > > important workloads, so if we improve both cases then the probability that 
> > > we improve all other workloads as well increases substantially.
> > > 
> > > Do you have any data on the number of find_vma() calls performed in these 
> > > two cases, so that we can know the per function call average cost?
> > > 
> > 
> > For the kernel build we get around 140 million calls to find_vma(), and 
> > for Oracle around 27 million. So the function ends up costing 
> > significantly more for the DB workload.
> 
> Hm, mind tabulating that into per function call (cycles) and such, for an 
> easier overview?
> 
> I do think the Oracle case might be pinpointing a separate 
> bug/problem/property: unless it's using an obscene number of vmas its 
> rbtree should have a manageable depth, what is the average (accessed) 
> depth of the rbtree, and is it properly balanced?

That is something I didn't measure. However, by judging the huge
increase of cycles when we remove the mmap_cache, it must be an enormous
tree and/or the way the tree is sorted by address really isn't helping
the workload.

> 
> Or is access to varied in the Oracle case that it's missing the cache all 
> the time, because the rbtree causes many cachemisses as the separate nodes 
> are accessed during an rb-walk?

Similar to get_cycles(), is there anyway to quickly measure the amount
of executed instructions? Getting the IPC for the mmap_cache (this of
course is constant) and the treewalk could give us a nice overview of
the function's cost. I was thinking of stealing some perf-stat
functionality for this but didn't get around to it. Hopefully there's an
easier way...

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 20:59                 ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-11 20:59 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 21:47 +0100, Ingo Molnar wrote:
> * Davidlohr Bueso <davidlohr@hp.com> wrote:
> 
> > On Mon, 2013-11-11 at 13:01 +0100, Ingo Molnar wrote:
> > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > 
> > > > Hi Ingo,
> > > > 
> > > > On Mon, 2013-11-04 at 08:36 +0100, Ingo Molnar wrote:
> > > > > * Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > > 
> > > > > > I will look into doing the vma cache per thread instead of mm (I hadn't 
> > > > > > really looked at the problem like this) as well as Ingo's suggestion on 
> > > > > > the weighted LRU approach. However, having seen that we can cheaply and 
> > > > > > easily reach around ~70% hit rate in a lot of workloads, makes me wonder 
> > > > > > how good is good enough?
> > > > > 
> > > > > So I think it all really depends on the hit/miss cost difference. It makes 
> > > > > little sense to add a more complex scheme if it washes out most of the 
> > > > > benefits!
> > > > > 
> > > > > Also note the historic context: the _original_ mmap_cache, that I 
> > > > > implemented 16 years ago, was a front-line cache to a linear list walk 
> > > > > over all vmas (!).
> > > > > 
> > > > > This is the relevant 2.1.37pre1 code in include/linux/mm.h:
> > > > > 
> > > > > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > > > > static inline struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
> > > > > {
> > > > >         struct vm_area_struct *vma = NULL;
> > > > > 
> > > > >         if (mm) {
> > > > >                 /* Check the cache first. */
> > > > >                 vma = mm->mmap_cache;
> > > > >                 if(!vma || (vma->vm_end <= addr) || (vma->vm_start > addr)) {
> > > > >                         vma = mm->mmap;
> > > > >                         while(vma && vma->vm_end <= addr)
> > > > >                                 vma = vma->vm_next;
> > > > >                         mm->mmap_cache = vma;
> > > > >                 }
> > > > >         }
> > > > >         return vma;
> > > > > }
> > > > > 
> > > > > See that vma->vm_next iteration? It was awful - but back then most of us 
> > > > > had at most a couple of megs of RAM with just a few vmas. No RAM, no SMP, 
> > > > > no worries - the mm was really simple back then.
> > > > > 
> > > > > Today we have the vma rbtree, which is self-balancing and a lot faster 
> > > > > than your typical linear list walk search ;-)
> > > > > 
> > > > > So I'd _really_ suggest to first examine the assumptions behind the cache, 
> > > > > it being named 'cache' and it having a hit rate does in itself not 
> > > > > guarantee that it gives us any worthwile cost savings when put in front of 
> > > > > an rbtree ...
> > > > 
> > > > So having mmap_cache around, in whatever form, is an important
> > > > optimization for find_vma() - even to this day. It can save us at least
> > > > 50% cycles that correspond to this function. [...]
> > > 
> > > I'm glad it still helps! :-)
> > > 
> > > > [...] I ran a variety of mmap_cache alternatives over two workloads that 
> > > > are heavy on page faults (as opposed to Java based ones I had tried 
> > > > previously, which really don't trigger enough for it to be worthwhile).  
> > > > So we now have a comparison of 5 different caching schemes -- note that 
> > > > the 4 element hash table is quite similar to two elements, with a hash 
> > > > function of (addr % hash_size).
> > > > 
> > > > 1) Kernel build
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 15.85            | 0.10066 |
> > > > | current mmap_cache     | 72.32%   | 11.03            | 0.01155 |
> > > > | mmap_cache+largest VMA | 84.55%   |  9.91            | 0.01414 |
> > > > | 4 element hash table   | 78.38%   | 10.52            | 0.01155 |
> > > > | per-thread mmap_cache  | 78.84%   | 10.69            | 0.01325 |
> > > > +------------------------+----------+------------------+---------+
> > > > 
> > > > In this particular workload the proposed patch benefits the most and 
> > > > current alternatives, while they do help some, aren't really worth 
> > > > bothering with as the current implementation already does a nice enough 
> > > > job.
> > > 
> > > Interesting.
> > > 
> > > > 2) Oracle Data mining (4K pages)
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > +------------------------+----------+------------------+---------+
> > > > 
> > > > This workload sure makes the point of how much we can benefit of caching 
> > > > the vma, otherwise find_vma() can cost more than 220% extra cycles. We 
> > > > clearly win here by having a per-thread cache instead of per address 
> > > > space. I also tried the same workload with 2Mb hugepages and the results 
> > > > are much more closer to the kernel build, but with the per-thread vma 
> > > > still winning over the rest of the alternatives.
> > > 
> > > That's also very interesting, and it's exactly the kind of data we need to 
> > > judge such matters. Kernel builds and DB loads are two very different, yet 
> > > important workloads, so if we improve both cases then the probability that 
> > > we improve all other workloads as well increases substantially.
> > > 
> > > Do you have any data on the number of find_vma() calls performed in these 
> > > two cases, so that we can know the per function call average cost?
> > > 
> > 
> > For the kernel build we get around 140 million calls to find_vma(), and 
> > for Oracle around 27 million. So the function ends up costing 
> > significantly more for the DB workload.
> 
> Hm, mind tabulating that into per function call (cycles) and such, for an 
> easier overview?
> 
> I do think the Oracle case might be pinpointing a separate 
> bug/problem/property: unless it's using an obscene number of vmas its 
> rbtree should have a manageable depth, what is the average (accessed) 
> depth of the rbtree, and is it properly balanced?

That is something I didn't measure. However, by judging the huge
increase of cycles when we remove the mmap_cache, it must be an enormous
tree and/or the way the tree is sorted by address really isn't helping
the workload.

> 
> Or is access to varied in the Oracle case that it's missing the cache all 
> the time, because the rbtree causes many cachemisses as the separate nodes 
> are accessed during an rb-walk?

Similar to get_cycles(), is there anyway to quickly measure the amount
of executed instructions? Getting the IPC for the mmap_cache (this of
course is constant) and the treewalk could give us a nice overview of
the function's cost. I was thinking of stealing some perf-stat
functionality for this but didn't get around to it. Hopefully there's an
easier way...

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 20:59                 ` Davidlohr Bueso
@ 2013-11-11 21:09                   ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 21:09 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm,
	Frédéric Weisbecker


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> > Or is access to varied in the Oracle case that it's missing the cache 
> > all the time, because the rbtree causes many cachemisses as the 
> > separate nodes are accessed during an rb-walk?
> 
> Similar to get_cycles(), is there anyway to quickly measure the amount 
> of executed instructions? Getting the IPC for the mmap_cache (this of 
> course is constant) and the treewalk could give us a nice overview of 
> the function's cost. I was thinking of stealing some perf-stat 
> functionality for this but didn't get around to it. Hopefully there's an 
> easier way...

There's no such easy method I'm afraid (Frederic's probe based trigger 
facility will give us that and more - but it's not ready yet) - but you 
could try profiling the workload for significant cache-misses:

  perf record -e cache-misses ...

I _think_ if it's really catastrophic cache-misses then the rbtree walk 
should light up on the perf radar like crazy.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-11 21:09                   ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-11 21:09 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Andrew Morton, Hugh Dickins, Michel Lespinasse,
	Mel Gorman, Rik van Riel, Guan Xuetao, Chandramouleeswaran,
	Aswin, Linux Kernel Mailing List, linux-mm,
	Frédéric Weisbecker


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> > Or is access to varied in the Oracle case that it's missing the cache 
> > all the time, because the rbtree causes many cachemisses as the 
> > separate nodes are accessed during an rb-walk?
> 
> Similar to get_cycles(), is there anyway to quickly measure the amount 
> of executed instructions? Getting the IPC for the mmap_cache (this of 
> course is constant) and the treewalk could give us a nice overview of 
> the function's cost. I was thinking of stealing some perf-stat 
> functionality for this but didn't get around to it. Hopefully there's an 
> easier way...

There's no such easy method I'm afraid (Frederic's probe based trigger 
facility will give us that and more - but it's not ready yet) - but you 
could try profiling the workload for significant cache-misses:

  perf record -e cache-misses ...

I _think_ if it's really catastrophic cache-misses then the rbtree walk 
should light up on the perf radar like crazy.

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 20:47               ` Davidlohr Bueso
@ 2013-11-13 17:08                 ` Davidlohr Bueso
  -1 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-13 17:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 12:47 -0800, Davidlohr Bueso wrote:
> On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> > * Michel Lespinasse <walken@google.com> wrote:
> > 
> > > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 2) Oracle Data mining (4K pages)
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > +------------------------+----------+------------------+---------+
> > > >
> > > > This workload sure makes the point of how much we can benefit of 
> > > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > > cycles. We clearly win here by having a per-thread cache instead of 
> > > > per address space. I also tried the same workload with 2Mb hugepages 
> > > > and the results are much more closer to the kernel build, but with the 
> > > > per-thread vma still winning over the rest of the alternatives.
> > > >
> > > > All in all I think that we should probably have a per-thread vma 
> > > > cache. Please let me know if there is some other workload you'd like 
> > > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > > out.
> > > 
> > > Per thread cache sounds interesting - with per-mm caches there is a real 
> > > risk that some modern threaded apps pay the cost of cache updates 
> > > without seeing much of the benefit. However, how do you cheaply handle 
> > > invalidations for the per thread cache ?
> > 
> > The cheapest way to handle that would be to have a generation counter for 
> > the mm and to couple cache validity to a specific value of that. 
> > 'Invalidation' is then the free side effect of bumping the generation 
> > counter when a vma is removed/moved.

Wouldn't this approach make us invalidate all vmas even when we just
want to do it for one? I mean we have no way of associating a single vma
with an mm->mmap_seqnum, or am I missing something?

> 
> I was basing the invalidations on the freeing of vm_area_cachep, so I
> mark current->mmap_cache = NULL whenever we call
> kmem_cache_free(vm_area_cachep, ...). But I can see this being a problem
> if more than one task's mmap_cache points to the same vma, as we end up
> invalidating only one. I'd really like to use a similar logic and base
> everything around the existence of the vma instead of adding a counting
> infrastructure. Sure we'd end up doing more reads when we do the lookup
> in find_vma() but the cost of maintaining it comes free. I just ran into
> a similar idea from 2 years ago:
> http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html
> 
> While there are several things that aren't needed, it does do the
> is_kmem_cache() to verify that the vma is still a valid slab.

Doing invalidations this way is definitely not the way to go. While our
hit rate does match my previous attempt, the cost of checking the slab
ends up costing an extra 25% more of cycles than what we currently have.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-13 17:08                 ` Davidlohr Bueso
  0 siblings, 0 replies; 76+ messages in thread
From: Davidlohr Bueso @ 2013-11-13 17:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, 2013-11-11 at 12:47 -0800, Davidlohr Bueso wrote:
> On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> > * Michel Lespinasse <walken@google.com> wrote:
> > 
> > > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > 2) Oracle Data mining (4K pages)
> > > > +------------------------+----------+------------------+---------+
> > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > +------------------------+----------+------------------+---------+
> > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > +------------------------+----------+------------------+---------+
> > > >
> > > > This workload sure makes the point of how much we can benefit of 
> > > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > > cycles. We clearly win here by having a per-thread cache instead of 
> > > > per address space. I also tried the same workload with 2Mb hugepages 
> > > > and the results are much more closer to the kernel build, but with the 
> > > > per-thread vma still winning over the rest of the alternatives.
> > > >
> > > > All in all I think that we should probably have a per-thread vma 
> > > > cache. Please let me know if there is some other workload you'd like 
> > > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > > out.
> > > 
> > > Per thread cache sounds interesting - with per-mm caches there is a real 
> > > risk that some modern threaded apps pay the cost of cache updates 
> > > without seeing much of the benefit. However, how do you cheaply handle 
> > > invalidations for the per thread cache ?
> > 
> > The cheapest way to handle that would be to have a generation counter for 
> > the mm and to couple cache validity to a specific value of that. 
> > 'Invalidation' is then the free side effect of bumping the generation 
> > counter when a vma is removed/moved.

Wouldn't this approach make us invalidate all vmas even when we just
want to do it for one? I mean we have no way of associating a single vma
with an mm->mmap_seqnum, or am I missing something?

> 
> I was basing the invalidations on the freeing of vm_area_cachep, so I
> mark current->mmap_cache = NULL whenever we call
> kmem_cache_free(vm_area_cachep, ...). But I can see this being a problem
> if more than one task's mmap_cache points to the same vma, as we end up
> invalidating only one. I'd really like to use a similar logic and base
> everything around the existence of the vma instead of adding a counting
> infrastructure. Sure we'd end up doing more reads when we do the lookup
> in find_vma() but the cost of maintaining it comes free. I just ran into
> a similar idea from 2 years ago:
> http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html
> 
> While there are several things that aren't needed, it does do the
> is_kmem_cache() to verify that the vma is still a valid slab.

Doing invalidations this way is definitely not the way to go. While our
hit rate does match my previous attempt, the cost of checking the slab
ends up costing an extra 25% more of cycles than what we currently have.

Thanks,
Davidlohr

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-13 17:08                 ` Davidlohr Bueso
@ 2013-11-13 17:59                   ` Ingo Molnar
  -1 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-13 17:59 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Mon, 2013-11-11 at 12:47 -0800, Davidlohr Bueso wrote:
> > On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> > > * Michel Lespinasse <walken@google.com> wrote:
> > > 
> > > > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > > 2) Oracle Data mining (4K pages)
> > > > > +------------------------+----------+------------------+---------+
> > > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > > +------------------------+----------+------------------+---------+
> > > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > > +------------------------+----------+------------------+---------+
> > > > >
> > > > > This workload sure makes the point of how much we can benefit of 
> > > > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > > > cycles. We clearly win here by having a per-thread cache instead of 
> > > > > per address space. I also tried the same workload with 2Mb hugepages 
> > > > > and the results are much more closer to the kernel build, but with the 
> > > > > per-thread vma still winning over the rest of the alternatives.
> > > > >
> > > > > All in all I think that we should probably have a per-thread vma 
> > > > > cache. Please let me know if there is some other workload you'd like 
> > > > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > > > out.
> > > > 
> > > > Per thread cache sounds interesting - with per-mm caches there is a real 
> > > > risk that some modern threaded apps pay the cost of cache updates 
> > > > without seeing much of the benefit. However, how do you cheaply handle 
> > > > invalidations for the per thread cache ?
> > > 
> > > The cheapest way to handle that would be to have a generation counter for 
> > > the mm and to couple cache validity to a specific value of that. 
> > > 'Invalidation' is then the free side effect of bumping the generation 
> > > counter when a vma is removed/moved.
> 
> Wouldn't this approach make us invalidate all vmas even when we 
> just want to do it for one? [...]

Yes. If it's implemented as some sort of small, vma-size-weighted 
LRU, then all these 'different' caches go away and there's just 
this single LRU cache with a handful of entries cached.

This cache is then invalidated on munmap() et al. Which should be 
fine, mmap()/munmap() is a slowpath relative to find_vma().

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-13 17:59                   ` Ingo Molnar
  0 siblings, 0 replies; 76+ messages in thread
From: Ingo Molnar @ 2013-11-13 17:59 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Michel Lespinasse, Peter Zijlstra, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm


* Davidlohr Bueso <davidlohr@hp.com> wrote:

> On Mon, 2013-11-11 at 12:47 -0800, Davidlohr Bueso wrote:
> > On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> > > * Michel Lespinasse <walken@google.com> wrote:
> > > 
> > > > On Sun, Nov 10, 2013 at 8:12 PM, Davidlohr Bueso <davidlohr@hp.com> wrote:
> > > > > 2) Oracle Data mining (4K pages)
> > > > > +------------------------+----------+------------------+---------+
> > > > > |    mmap_cache type     | hit-rate | cycles (billion) | stddev  |
> > > > > +------------------------+----------+------------------+---------+
> > > > > | no mmap_cache          | -        | 63.35            | 0.20207 |
> > > > > | current mmap_cache     | 65.66%   | 19.55            | 0.35019 |
> > > > > | mmap_cache+largest VMA | 71.53%   | 15.84            | 0.26764 |
> > > > > | 4 element hash table   | 70.75%   | 15.90            | 0.25586 |
> > > > > | per-thread mmap_cache  | 86.42%   | 11.57            | 0.29462 |
> > > > > +------------------------+----------+------------------+---------+
> > > > >
> > > > > This workload sure makes the point of how much we can benefit of 
> > > > > caching the vma, otherwise find_vma() can cost more than 220% extra 
> > > > > cycles. We clearly win here by having a per-thread cache instead of 
> > > > > per address space. I also tried the same workload with 2Mb hugepages 
> > > > > and the results are much more closer to the kernel build, but with the 
> > > > > per-thread vma still winning over the rest of the alternatives.
> > > > >
> > > > > All in all I think that we should probably have a per-thread vma 
> > > > > cache. Please let me know if there is some other workload you'd like 
> > > > > me to try out. If folks agree then I can cleanup the patch and send it 
> > > > > out.
> > > > 
> > > > Per thread cache sounds interesting - with per-mm caches there is a real 
> > > > risk that some modern threaded apps pay the cost of cache updates 
> > > > without seeing much of the benefit. However, how do you cheaply handle 
> > > > invalidations for the per thread cache ?
> > > 
> > > The cheapest way to handle that would be to have a generation counter for 
> > > the mm and to couple cache validity to a specific value of that. 
> > > 'Invalidation' is then the free side effect of bumping the generation 
> > > counter when a vma is removed/moved.
> 
> Wouldn't this approach make us invalidate all vmas even when we 
> just want to do it for one? [...]

Yes. If it's implemented as some sort of small, vma-size-weighted 
LRU, then all these 'different' caches go away and there's just 
this single LRU cache with a handful of entries cached.

This cache is then invalidated on munmap() et al. Which should be 
fine, mmap()/munmap() is a slowpath relative to find_vma().

Thanks,

	Ingo

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
  2013-11-11 20:47               ` Davidlohr Bueso
@ 2013-11-13 18:16                 ` Peter Zijlstra
  -1 siblings, 0 replies; 76+ messages in thread
From: Peter Zijlstra @ 2013-11-13 18:16 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Michel Lespinasse, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, Nov 11, 2013 at 12:47:28PM -0800, Davidlohr Bueso wrote:
> On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> in find_vma() but the cost of maintaining it comes free. I just ran into
> a similar idea from 2 years ago:
> http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html

Here's one from 2007:

http://programming.kicks-ass.net/kernel-patches/futex-vma-cache/vma_cache.patch

and I'm very sure Nick Piggin had one even older :-)



^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH] mm: cache largest vma
@ 2013-11-13 18:16                 ` Peter Zijlstra
  0 siblings, 0 replies; 76+ messages in thread
From: Peter Zijlstra @ 2013-11-13 18:16 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Ingo Molnar, Michel Lespinasse, Linus Torvalds, Andrew Morton,
	Hugh Dickins, Mel Gorman, Rik van Riel, Guan Xuetao,
	Chandramouleeswaran, Aswin, Linux Kernel Mailing List, linux-mm

On Mon, Nov 11, 2013 at 12:47:28PM -0800, Davidlohr Bueso wrote:
> On Mon, 2013-11-11 at 13:04 +0100, Ingo Molnar wrote:
> in find_vma() but the cost of maintaining it comes free. I just ran into
> a similar idea from 2 years ago:
> http://lkml.indiana.edu/hypermail/linux/kernel/1112.1/01352.html

Here's one from 2007:

http://programming.kicks-ass.net/kernel-patches/futex-vma-cache/vma_cache.patch

and I'm very sure Nick Piggin had one even older :-)


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2013-11-13 18:17 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-11-01 20:17 [PATCH] mm: cache largest vma Davidlohr Bueso
2013-11-01 20:17 ` Davidlohr Bueso
2013-11-01 20:38 ` KOSAKI Motohiro
2013-11-01 20:38   ` KOSAKI Motohiro
2013-11-01 21:11   ` Davidlohr Bueso
2013-11-01 21:11     ` Davidlohr Bueso
2013-11-03  9:46     ` Ingo Molnar
2013-11-03  9:46       ` Ingo Molnar
2013-11-03 23:57     ` KOSAKI Motohiro
2013-11-03 23:57       ` KOSAKI Motohiro
2013-11-04  4:22       ` Davidlohr Bueso
2013-11-04  4:22         ` Davidlohr Bueso
2013-11-01 21:23 ` Rik van Riel
2013-11-01 21:23   ` Rik van Riel
2013-11-03 10:12 ` Ingo Molnar
2013-11-03 10:12   ` Ingo Molnar
2013-11-04  4:20   ` Davidlohr Bueso
2013-11-04  4:20     ` Davidlohr Bueso
2013-11-04  4:48     ` converting unicore32 to gate_vma as done for arm (was Re: [PATCH] mm: cache largest vma) Al Viro
2013-11-04  4:48       ` Al Viro
2013-11-05  2:49       ` 管雪涛
2013-11-05  2:49         ` 管雪涛
2013-11-11  7:25         ` converting unicore32 to gate_vma as done for arm (was " Al Viro
2013-11-11  7:25           ` Al Viro
2013-11-04  7:00     ` [PATCH] mm: cache largest vma Ingo Molnar
2013-11-04  7:00       ` Ingo Molnar
2013-11-04  7:05     ` Ingo Molnar
2013-11-04  7:05       ` Ingo Molnar
2013-11-04 14:20       ` Frederic Weisbecker
2013-11-04 14:20         ` Frederic Weisbecker
2013-11-04 17:52         ` Ingo Molnar
2013-11-04 17:52           ` Ingo Molnar
2013-11-04 18:10           ` Frederic Weisbecker
2013-11-04 18:10             ` Frederic Weisbecker
2013-11-05  8:24             ` Ingo Molnar
2013-11-05  8:24               ` Ingo Molnar
2013-11-05 14:27               ` Jiri Olsa
2013-11-05 14:27                 ` Jiri Olsa
2013-11-06  6:01                 ` Ingo Molnar
2013-11-06  6:01                   ` Ingo Molnar
2013-11-06 14:03                   ` Konstantin Khlebnikov
2013-11-06 14:03                     ` Konstantin Khlebnikov
2013-11-03 18:51 ` Linus Torvalds
2013-11-03 18:51   ` Linus Torvalds
2013-11-04  4:04   ` Davidlohr Bueso
2013-11-04  4:04     ` Davidlohr Bueso
2013-11-04  7:36     ` Ingo Molnar
2013-11-04  7:36       ` Ingo Molnar
2013-11-04 14:56       ` Michel Lespinasse
2013-11-04 14:56         ` Michel Lespinasse
2013-11-11  4:12       ` Davidlohr Bueso
2013-11-11  4:12         ` Davidlohr Bueso
2013-11-11  7:43         ` Michel Lespinasse
2013-11-11  7:43           ` Michel Lespinasse
2013-11-11 12:04           ` Ingo Molnar
2013-11-11 12:04             ` Ingo Molnar
2013-11-11 20:47             ` Davidlohr Bueso
2013-11-11 20:47               ` Davidlohr Bueso
2013-11-13 17:08               ` Davidlohr Bueso
2013-11-13 17:08                 ` Davidlohr Bueso
2013-11-13 17:59                 ` Ingo Molnar
2013-11-13 17:59                   ` Ingo Molnar
2013-11-13 18:16               ` Peter Zijlstra
2013-11-13 18:16                 ` Peter Zijlstra
2013-11-11 12:01         ` Ingo Molnar
2013-11-11 12:01           ` Ingo Molnar
2013-11-11 18:24           ` Davidlohr Bueso
2013-11-11 18:24             ` Davidlohr Bueso
2013-11-11 20:47             ` Ingo Molnar
2013-11-11 20:47               ` Ingo Molnar
2013-11-11 20:59               ` Davidlohr Bueso
2013-11-11 20:59                 ` Davidlohr Bueso
2013-11-11 21:09                 ` Ingo Molnar
2013-11-11 21:09                   ` Ingo Molnar
2013-11-04  7:03   ` Christoph Hellwig
2013-11-04  7:03     ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.