* [PATCH] thp: reduce usage of huge zero page's atomic counter
@ 2016-08-29  6:31 ` Aaron Lu
  0 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-29  6:31 UTC (permalink / raw)
  To: Linux Memory Management List
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel


The global zero page is used to satisfy an anonymous read fault. If
THP (Transparent HugePage) is enabled then the global huge zero page is
used instead. The global huge zero page uses an atomic counter for
reference counting and is allocated/freed dynamically according to its
counter value.

CPU time spent on that counter increases greatly when a lot of processes
are doing anonymous read faults, since they all contend on the same
atomic variable. This patch proposes a way to reduce accesses to the
global counter so that the CPU load can be reduced accordingly.

To do this, a new flag of the mm_struct is introduced: MMF_HUGE_ZERO_PAGE.
With this flag, a process only needs to touch the global counter in
two cases:
1. The first time it uses the global huge zero page;
2. When mm_users of its mm_struct reaches zero.
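
In essence, after the first use the fast path only tests a per-mm bit and
never touches the atomic counter again. Below is a condensed, comment-annotated
form of the mm/huge_memory.c hunk further down (same names as in the patch;
nothing new beyond the comments):

struct page *mm_get_huge_zero_page(struct mm_struct *mm)
{
	/* Fast path: this mm already holds its single reference. */
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_page);

	/* Slow path, taken once per mm: grab one global reference. */
	if (!get_huge_zero_page())
		return NULL;

	/* Another thread of this mm may have raced us; keep only one reference. */
	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_page);
}

The matching mm_put_huge_zero_page() drops that single reference from
__mmput(), i.e. when mm_users of the mm_struct reaches zero.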

Note that right now, the huge zero page is eligible to be freed as soon
as its last use goes away.  With this patch, the page will not become
eligible to be freed until the last process that ever used it exits.

And since mm_users is used, kernel threads are not eligible to use the
huge zero page either. No kthread uses the huge zero page today, so there
is no difference after applying this patch. But if that is not desired,
I can change it to drop the reference when mm_count reaches zero instead.

Case used for testing on a Haswell EP:
usemem -n 72 --readonly -j 0x200000 100G
which spawns 72 processes, each of which mmaps 100G of anonymous space
and then reads that space sequentially with a step of 2MB.
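
For reference, each process does roughly the following (a simplified sketch
of the access pattern described above, not the actual usemem source; the real
tool from vm-scalability has many more options):

#include <stdio.h>
#include <sys/mman.h>

#define STEP	(2UL << 20)		/* -j 0x200000: one read every 2MB */
#define SIZE	(100UL << 30)		/* 100G of anonymous space per process */

int main(void)
{
	unsigned long off, sum = 0;
	/* Read-only anonymous mapping: every read fault is satisfied by the
	 * (huge) zero page, so no data pages are actually allocated. */
	char *p = mmap(NULL, SIZE, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	for (off = 0; off < SIZE; off += STEP)
		sum += p[off];		/* sequential read-only access */

	printf("%lu\n", sum);		/* keep the reads from being optimized out */
	return 0;
}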

perf report for base commit:
    54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
perf report for this commit:
     0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 fs/dax.c                |  2 +-
 include/linux/huge_mm.h |  6 +++---
 include/linux/sched.h   |  1 +
 kernel/fork.c           |  1 +
 mm/huge_memory.c        | 36 +++++++++++++++++++++++++-----------
 mm/swap.c               |  4 +---
 mm/swap_state.c         |  4 +---
 7 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..226c0d5eedac 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1034,7 +1034,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
-		struct page *zero_page = get_huge_zero_page();
+		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
 		if (unlikely(!zero_page)) {
 			dax_pmd_dbg(&bh, address, "no zero page");
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6f14de45b5ce..10bcf58cc594 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -152,8 +152,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -213,7 +213,7 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
 	BUILD_BUG();
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7e1e783cf01..02246a70b63c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_REAPED		21	/* mm has been already reaped */
 #define MMF_OOM_NOT_REAPABLE	22	/* mm couldn't be reaped */
+#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..372e02616b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -711,6 +711,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2db2112aa31e..d88bb1ec6fad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -601,7 +621,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +643,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +798,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1038,7 +1056,6 @@ alloc:
 		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 		if (!page) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-			put_huge_zero_page();
 		} else {
 			VM_BUG_ON_PAGE(!PageHead(page), page);
 			page_remove_rmap(page, true);
@@ -1502,7 +1519,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1525,8 +1541,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
diff --git a/mm/swap.c b/mm/swap.c
index 75c63bb2a1da..4dcf852e1e6d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 			locked_pgdat = NULL;
 		}
 
-		if (is_huge_zero_page(page)) {
-			put_huge_zero_page();
+		if (is_huge_zero_page(page))
 			continue;
-		}
 
 		page = compound_head(page);
 		if (!put_page_testzero(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c8310a37be3a..5ffd3ee26592 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -252,9 +252,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
 	free_swap_cache(page);
-	if (is_huge_zero_page(page))
-		put_huge_zero_page();
-	else
+	if (!is_huge_zero_page(page))
 		put_page(page);
 }
 
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29  6:31 ` Aaron Lu
@ 2016-08-29  8:49   ` Anshuman Khandual
  -1 siblings, 0 replies; 34+ messages in thread
From: Anshuman Khandual @ 2016-08-29  8:49 UTC (permalink / raw)
  To: Aaron Lu, Linux Memory Management List
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel

On 08/29/2016 12:01 PM, Aaron Lu wrote:
> The global zero page is used to satisfy an anonymous read fault. If
> THP(Transparent HugePage) is enabled then the global huge zero page is used.
> The global huge zero page uses an atomic counter for reference counting
> and is allocated/freed dynamically according to its counter value.
> 
> CPU time spent on that counter will greatly increase if there are
> a lot of processes doing anonymous read faults. This patch proposes a
> way to reduce the access to the global counter so that the CPU load
> can be reduced accordingly.
> 
> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
> With this flag, the process only need to touch the global counter in
> two cases:
> 1 The first time it uses the global huge zero page;
> 2 The time when mm_user of its mm_struct reaches zero.
> 
> Note that right now, the huge zero page is eligible to be freed as soon
> as its last use goes away.  With this patch, the page will not be
> eligible to be freed until the exit of the last process from which it
> was ever used.
> 
> And with the use of mm_user, the kthread is not eligible to use huge
> zero page either. Since no kthread is using huge zero page today, there
> is no difference after applying this patch. But if that is not desired,
> I can change it to when mm_count reaches zero.
> 
> Case used for test on Haswell EP:
> usemem -n 72 --readonly -j 0x200000 100G

Is this benchmark publicly available? It does not seem to be this one:
https://github.com/gnubert/usemem.git, does it?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29  8:49   ` Anshuman Khandual
@ 2016-08-29  8:53     ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-29  8:53 UTC (permalink / raw)
  To: Anshuman Khandual, Linux Memory Management List
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel

On 08/29/2016 04:49 PM, Anshuman Khandual wrote:
> On 08/29/2016 12:01 PM, Aaron Lu wrote:
>> The global zero page is used to satisfy an anonymous read fault. If
>> THP(Transparent HugePage) is enabled then the global huge zero page is used.
>> The global huge zero page uses an atomic counter for reference counting
>> and is allocated/freed dynamically according to its counter value.
>>
>> CPU time spent on that counter will greatly increase if there are
>> a lot of processes doing anonymous read faults. This patch proposes a
>> way to reduce the access to the global counter so that the CPU load
>> can be reduced accordingly.
>>
>> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
>> With this flag, the process only need to touch the global counter in
>> two cases:
>> 1 The first time it uses the global huge zero page;
>> 2 The time when mm_user of its mm_struct reaches zero.
>>
>> Note that right now, the huge zero page is eligible to be freed as soon
>> as its last use goes away.  With this patch, the page will not be
>> eligible to be freed until the exit of the last process from which it
>> was ever used.
>>
>> And with the use of mm_user, the kthread is not eligible to use huge
>> zero page either. Since no kthread is using huge zero page today, there
>> is no difference after applying this patch. But if that is not desired,
>> I can change it to when mm_count reaches zero.
>>
>> Case used for test on Haswell EP:
>> usemem -n 72 --readonly -j 0x200000 100G
> 
> Is this benchmark publicly available ? Does not seem to be this one
> https://github.com/gnubert/usemem.git, Does it ?

Sorry, I forgot to attach the link.
It's this one:
https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git

And the above-mentioned usemem is:
https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git/tree/usemem.c

Regards,
Aaron

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29  8:53     ` Aaron Lu
@ 2016-08-29 13:47       ` Anshuman Khandual
  -1 siblings, 0 replies; 34+ messages in thread
From: Anshuman Khandual @ 2016-08-29 13:47 UTC (permalink / raw)
  To: Aaron Lu, Anshuman Khandual, Linux Memory Management List
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel, Aneesh Kumar K.V

On 08/29/2016 02:23 PM, Aaron Lu wrote:
> On 08/29/2016 04:49 PM, Anshuman Khandual wrote:
>> > On 08/29/2016 12:01 PM, Aaron Lu wrote:
>>> >> The global zero page is used to satisfy an anonymous read fault. If
>>> >> THP(Transparent HugePage) is enabled then the global huge zero page is used.
>>> >> The global huge zero page uses an atomic counter for reference counting
>>> >> and is allocated/freed dynamically according to its counter value.
>>> >>
>>> >> CPU time spent on that counter will greatly increase if there are
>>> >> a lot of processes doing anonymous read faults. This patch proposes a
>>> >> way to reduce the access to the global counter so that the CPU load
>>> >> can be reduced accordingly.
>>> >>
>>> >> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
>>> >> With this flag, the process only need to touch the global counter in
>>> >> two cases:
>>> >> 1 The first time it uses the global huge zero page;
>>> >> 2 The time when mm_user of its mm_struct reaches zero.
>>> >>
>>> >> Note that right now, the huge zero page is eligible to be freed as soon
>>> >> as its last use goes away.  With this patch, the page will not be
>>> >> eligible to be freed until the exit of the last process from which it
>>> >> was ever used.
>>> >>
>>> >> And with the use of mm_user, the kthread is not eligible to use huge
>>> >> zero page either. Since no kthread is using huge zero page today, there
>>> >> is no difference after applying this patch. But if that is not desired,
>>> >> I can change it to when mm_count reaches zero.
>>> >>
>>> >> Case used for test on Haswell EP:
>>> >> usemem -n 72 --readonly -j 0x200000 100G
>> > 
>> > Is this benchmark publicly available ? Does not seem to be this one
>> > https://github.com/gnubert/usemem.git, Does it ?
> Sorry, forgot to attach its link.
> It's this one:
> https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git
> 
> And the above mentioned usemem is:
> https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git/tree/usemem.c

Hey Aaron,

Thanks for pointing that out. I ran a similar test on a POWER8 box using
16MB steps (the huge page size there) instead of 2MB, but the perf profile
looked different. The perf command line was as follows on a 32 CPU system.

perf record ./usemem -n 256 --readonly -j 0x1000000 100G

But the relative weight of the above-mentioned function came out much
lower than the roughly 54.03% you reported from your experiment.

0.07%  usemem  [kernel.vmlinux]  [k] get_huge_zero_page

It seems way off the mark. Can you please confirm your exact perf record
command line and how many CPUs the system has?

- Anshuman

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29 13:47       ` Anshuman Khandual
@ 2016-08-29 14:10         ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-29 14:10 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel, Aneesh Kumar K.V

On Mon, Aug 29, 2016 at 07:17:58PM +0530, Anshuman Khandual wrote:
> On 08/29/2016 02:23 PM, Aaron Lu wrote:
> > On 08/29/2016 04:49 PM, Anshuman Khandual wrote:
> >> > On 08/29/2016 12:01 PM, Aaron Lu wrote:
> >>> >> The global zero page is used to satisfy an anonymous read fault. If
> >>> >> THP(Transparent HugePage) is enabled then the global huge zero page is used.
> >>> >> The global huge zero page uses an atomic counter for reference counting
> >>> >> and is allocated/freed dynamically according to its counter value.
> >>> >>
> >>> >> CPU time spent on that counter will greatly increase if there are
> >>> >> a lot of processes doing anonymous read faults. This patch proposes a
> >>> >> way to reduce the access to the global counter so that the CPU load
> >>> >> can be reduced accordingly.
> >>> >>
> >>> >> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
> >>> >> With this flag, the process only need to touch the global counter in
> >>> >> two cases:
> >>> >> 1 The first time it uses the global huge zero page;
> >>> >> 2 The time when mm_user of its mm_struct reaches zero.
> >>> >>
> >>> >> Note that right now, the huge zero page is eligible to be freed as soon
> >>> >> as its last use goes away.  With this patch, the page will not be
> >>> >> eligible to be freed until the exit of the last process from which it
> >>> >> was ever used.
> >>> >>
> >>> >> And with the use of mm_user, the kthread is not eligible to use huge
> >>> >> zero page either. Since no kthread is using huge zero page today, there
> >>> >> is no difference after applying this patch. But if that is not desired,
> >>> >> I can change it to when mm_count reaches zero.
> >>> >>
> >>> >> Case used for test on Haswell EP:
> >>> >> usemem -n 72 --readonly -j 0x200000 100G
> >> > 
> >> > Is this benchmark publicly available ? Does not seem to be this one
> >> > https://github.com/gnubert/usemem.git, Does it ?
> > Sorry, forgot to attach its link.
> > It's this one:
> > https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git
> > 
> > And the above mentioned usemem is:
> > https://git.kernel.org/cgit/linux/kernel/git/wfg/vm-scalability.git/tree/usemem.c
> 
> Hey Aaron,
> 
> Thanks for pointing out. I did ran similar test on a POWER8 box using 16M
> steps (huge page size is 16MB on it) instead of 2MB. But the perf profile
> looked different. The perf command line was like this on a 32 CPU system.
> 
> perf record ./usemem -n 256 --readonly -j 0x1000000 100G
> 
> But the relative weight of the above mentioned function came out to be
> pretty less compared to what you have reported from your experiment
> which is around 54.03%.
> 
> 0.07%  usemem  [kernel.vmlinux]  [k] get_huge_zero_page
> 
> Seems way out of the mark. Can you please confirm your exact perf record
> command line and how many CPUs you have on the system.

Haswell EP has 72 CPUs.

Since the huge page size is 16MB on your system, maybe you can try:
perf record ./usemem -n 32 --readonly -j 0x1000000 800G
(that keeps the number of huge page faults per process the same as in my
test: 800G / 16MB = 51200, just like 100G / 2MB)

Regards,
Aaron

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29  6:31 ` Aaron Lu
@ 2016-08-29 22:50   ` Andrew Morton
  -1 siblings, 0 replies; 34+ messages in thread
From: Andrew Morton @ 2016-08-29 22:50 UTC (permalink / raw)
  To: Aaron Lu
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On Mon, 29 Aug 2016 14:31:20 +0800 Aaron Lu <aaron.lu@intel.com> wrote:

> 
> The global zero page is used to satisfy an anonymous read fault. If
> THP(Transparent HugePage) is enabled then the global huge zero page is used.
> The global huge zero page uses an atomic counter for reference counting
> and is allocated/freed dynamically according to its counter value.
> 
> CPU time spent on that counter will greatly increase if there are
> a lot of processes doing anonymous read faults. This patch proposes a
> way to reduce the access to the global counter so that the CPU load
> can be reduced accordingly.
> 
> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
> With this flag, the process only need to touch the global counter in
> two cases:
> 1 The first time it uses the global huge zero page;
> 2 The time when mm_user of its mm_struct reaches zero.
> 
> Note that right now, the huge zero page is eligible to be freed as soon
> as its last use goes away.  With this patch, the page will not be
> eligible to be freed until the exit of the last process from which it
> was ever used.
> 
> And with the use of mm_user, the kthread is not eligible to use huge
> zero page either. Since no kthread is using huge zero page today, there
> is no difference after applying this patch. But if that is not desired,
> I can change it to when mm_count reaches zero.

I suppose we could simply never free the zero huge page - if some
process has used it in the past, others will probably use it in the
future.  One wonders how useful this optimization is...

But the patch is simple enough.

> Case used for test on Haswell EP:
> usemem -n 72 --readonly -j 0x200000 100G
> Which spawns 72 processes and each will mmap 100G anonymous space and
> then do read only access to that space sequentially with a step of 2MB.
> 
> perf report for base commit:
>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
> perf report for this commit:
>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Does this mean that overall usemem runtime halved?

Do we have any numbers for something more real-world?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29 22:50   ` Andrew Morton
@ 2016-08-30  3:09     ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-30  3:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 06:50 AM, Andrew Morton wrote:
> On Mon, 29 Aug 2016 14:31:20 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> 
>>
>> The global zero page is used to satisfy an anonymous read fault. If
>> THP(Transparent HugePage) is enabled then the global huge zero page is used.
>> The global huge zero page uses an atomic counter for reference counting
>> and is allocated/freed dynamically according to its counter value.
>>
>> CPU time spent on that counter will greatly increase if there are
>> a lot of processes doing anonymous read faults. This patch proposes a
>> way to reduce the access to the global counter so that the CPU load
>> can be reduced accordingly.
>>
>> To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
>> With this flag, the process only need to touch the global counter in
>> two cases:
>> 1 The first time it uses the global huge zero page;
>> 2 The time when mm_user of its mm_struct reaches zero.
>>
>> Note that right now, the huge zero page is eligible to be freed as soon
>> as its last use goes away.  With this patch, the page will not be
>> eligible to be freed until the exit of the last process from which it
>> was ever used.
>>
>> And with the use of mm_user, the kthread is not eligible to use huge
>> zero page either. Since no kthread is using huge zero page today, there
>> is no difference after applying this patch. But if that is not desired,
>> I can change it to when mm_count reaches zero.
> 
> I suppose we could simply never free the zero huge page - if some
> process has used it in the past, others will probably use it in the
> future.  One wonders how useful this optimization is...
>
> But the patch is simple enough.
> 
>> Case used for test on Haswell EP:
>> usemem -n 72 --readonly -j 0x200000 100G
>> Which spawns 72 processes and each will mmap 100G anonymous space and
>> then do read only access to that space sequentially with a step of 2MB.
>>
>> perf report for base commit:
>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
>> perf report for this commit:
>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
> 
> Does this mean that overall usemem runtime halved?

Sorry for the confusion; the above lines are extracted from perf report
output. They show the percentage of CPU cycles spent in a specific function.

The above two perf lines are meant to show that get_huge_zero_page does
not consume that many CPU cycles after the patch is applied.

> 
> Do we have any numbers for something which is more real-wordly?

Unfortunately, no real-world numbers.

We think the global atomic counter could be a performance issue, so I'm
trying to solve the problem.

Thanks,
Aaron

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  3:09     ` Aaron Lu
@ 2016-08-30  3:39       ` Andrew Morton
  -1 siblings, 0 replies; 34+ messages in thread
From: Andrew Morton @ 2016-08-30  3:39 UTC (permalink / raw)
  To: Aaron Lu
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:

> >> Case used for test on Haswell EP:
> >> usemem -n 72 --readonly -j 0x200000 100G
> >> Which spawns 72 processes and each will mmap 100G anonymous space and
> >> then do read only access to that space sequentially with a step of 2MB.
> >>
> >> perf report for base commit:
> >>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
> >> perf report for this commit:
> >>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
> > 
> > Does this mean that overall usemem runtime halved?
> 
> Sorry for the confusion, the above line is extracted from perf report.
> It shows the percent of CPU cycles executed in a specific function.
> 
> The above two perf lines are used to show get_huge_zero_page doesn't
> consume that much CPU cycles after applying the patch.
> 
> > 
> > Do we have any numbers for something which is more real-wordly?
> 
> Unfortunately, no real world numbers.
> 
> We think the global atomic counter could be an issue for performance
> so I'm trying to solve the problem.

So, umm, we don't actually know if the patch is useful to anyone?

Some more measurements would help things along, please.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  3:39       ` Andrew Morton
@ 2016-08-30  4:44         ` Anshuman Khandual
  -1 siblings, 0 replies; 34+ messages in thread
From: Anshuman Khandual @ 2016-08-30  4:44 UTC (permalink / raw)
  To: Andrew Morton, Aaron Lu
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 09:09 AM, Andrew Morton wrote:
> On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> 
>>>> Case used for test on Haswell EP:
>>>> usemem -n 72 --readonly -j 0x200000 100G
>>>> Which spawns 72 processes and each will mmap 100G anonymous space and
>>>> then do read only access to that space sequentially with a step of 2MB.
>>>>
>>>> perf report for base commit:
>>>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
>>>> perf report for this commit:
>>>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
>>>
>>> Does this mean that overall usemem runtime halved?
>>
>> Sorry for the confusion, the above line is extracted from perf report.
>> It shows the percent of CPU cycles executed in a specific function.
>>
>> The above two perf lines are used to show get_huge_zero_page doesn't
>> consume that much CPU cycles after applying the patch.
>>
>>>
>>> Do we have any numbers for something which is more real-wordly?
>>
>> Unfortunately, no real world numbers.
>>
>> We think the global atomic counter could be an issue for performance
>> so I'm trying to solve the problem.
> 
> So, umm, we don't actually know if the patch is useful to anyone?

On a POWER system it improves the CPU consumption of the above-mentioned
function a little bit. I don't think it's going to improve the actual
throughput of the workload substantially.

0.07%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page

to

0.01%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  4:44         ` Anshuman Khandual
@ 2016-08-30  4:56           ` Andrew Morton
  -1 siblings, 0 replies; 34+ messages in thread
From: Andrew Morton @ 2016-08-30  4:56 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Aaron Lu, Linux Memory Management List,
	'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On Tue, 30 Aug 2016 10:14:25 +0530 Anshuman Khandual <khandual@linux.vnet.ibm.com> wrote:

> On 08/30/2016 09:09 AM, Andrew Morton wrote:
> > On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> > 
> >>>> Case used for test on Haswell EP:
> >>>> usemem -n 72 --readonly -j 0x200000 100G
> >>>> Which spawns 72 processes and each will mmap 100G anonymous space and
> >>>> then do read only access to that space sequentially with a step of 2MB.
> >>>>
> >>>> perf report for base commit:
> >>>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
> >>>> perf report for this commit:
> >>>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
> >>>
> >>> Does this mean that overall usemem runtime halved?
> >>
> >> Sorry for the confusion, the above line is extracted from perf report.
> >> It shows the percent of CPU cycles executed in a specific function.
> >>
> >> The above two perf lines are used to show get_huge_zero_page doesn't
> >> consume that much CPU cycles after applying the patch.
> >>
> >>>
> >>> Do we have any numbers for something which is more real-wordly?
> >>
> >> Unfortunately, no real world numbers.
> >>
> >> We think the global atomic counter could be an issue for performance
> >> so I'm trying to solve the problem.
> > 
> > So, umm, we don't actually know if the patch is useful to anyone?
> 
> On a POWER system it improves the CPU consumption of the above-mentioned
> function a little bit. I don't think it's going to improve the actual
> throughput of the workload substantially.
> 
> 0.07%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page
> 
> to
> 
> 0.01%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page

I can't say I'm surprised really.  A huge page is, ahem, huge.  The
computational cost of actually writing stuff into that page will swamp
the cost of the locking to acquire it.

Is the patch really worth the additional complexity?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29 22:50   ` Andrew Morton
@ 2016-08-30  5:14     ` Anshuman Khandual
  -1 siblings, 0 replies; 34+ messages in thread
From: Anshuman Khandual @ 2016-08-30  5:14 UTC (permalink / raw)
  To: Andrew Morton, Aaron Lu
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 04:20 AM, Andrew Morton wrote:
> On Mon, 29 Aug 2016 14:31:20 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> 
>> > 
>> > The global zero page is used to satisfy an anonymous read fault. If
>> > THP(Transparent HugePage) is enabled then the global huge zero page is used.
>> > The global huge zero page uses an atomic counter for reference counting
>> > and is allocated/freed dynamically according to its counter value.
>> > 
>> > CPU time spent on that counter will greatly increase if there are
>> > a lot of processes doing anonymous read faults. This patch proposes a
>> > way to reduce the access to the global counter so that the CPU load
>> > can be reduced accordingly.
>> > 
>> > To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
>> > With this flag, the process only need to touch the global counter in
>> > two cases:
>> > 1 The first time it uses the global huge zero page;
>> > 2 The time when mm_user of its mm_struct reaches zero.
>> > 
>> > Note that right now, the huge zero page is eligible to be freed as soon
>> > as its last use goes away.  With this patch, the page will not be
>> > eligible to be freed until the exit of the last process from which it
>> > was ever used.
>> > 
>> > And with the use of mm_user, the kthread is not eligible to use huge
>> > zero page either. Since no kthread is using huge zero page today, there
>> > is no difference after applying this patch. But if that is not desired,
>> > I can change it to when mm_count reaches zero.

> I suppose we could simply never free the zero huge page - if some
> process has used it in the past, others will probably use it in the
> future.  One wonders how useful this optimization is...

Yeah, what prevents us from doing away with this lock altogether and
keeping one zero-filled huge page (after a process has used it once)
forever, to be mapped across all the read faults? Is a 16MB / 2MB huge
page too much of a memory loss on a THP-enabled system? We can also
save on allocation time.
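
To make that concrete, here is a rough sketch of the "never free"
variant (illustration only, not the patch being discussed): allocate the
page once, publish it with cmpxchg and then keep it forever, so the
fault path needs no reference counting at all.

/* Hypothetical sketch -- allocate once, never free. */
static struct page *get_huge_zero_page_forever(void)
{
	struct page *zero_page;

	/* Fast path: already allocated, nothing to count. */
	if (likely(READ_ONCE(huge_zero_page)))
		return READ_ONCE(huge_zero_page);

	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
				HPAGE_PMD_ORDER);
	if (!zero_page)
		return NULL;

	/* Publish once; the loser of the race frees its copy. */
	if (cmpxchg(&huge_zero_page, NULL, zero_page))
		__free_pages(zero_page, HPAGE_PMD_ORDER);

	return READ_ONCE(huge_zero_page);
}

The shrinker and all the put paths would then go away, at the cost of
pinning one huge page for the lifetime of the system once any process
has touched it.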

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  5:14     ` Anshuman Khandual
@ 2016-08-30  5:19       ` Andrew Morton
  -1 siblings, 0 replies; 34+ messages in thread
From: Andrew Morton @ 2016-08-30  5:19 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Aaron Lu, Linux Memory Management List,
	'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On Tue, 30 Aug 2016 10:44:21 +0530 Anshuman Khandual <khandual@linux.vnet.ibm.com> wrote:

> On 08/30/2016 04:20 AM, Andrew Morton wrote:
> > On Mon, 29 Aug 2016 14:31:20 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> > 
> >> > 
> >> > The global zero page is used to satisfy an anonymous read fault. If
> >> > THP(Transparent HugePage) is enabled then the global huge zero page is used.
> >> > The global huge zero page uses an atomic counter for reference counting
> >> > and is allocated/freed dynamically according to its counter value.
> >> > 
> >> > CPU time spent on that counter will greatly increase if there are
> >> > a lot of processes doing anonymous read faults. This patch proposes a
> >> > way to reduce the access to the global counter so that the CPU load
> >> > can be reduced accordingly.
> >> > 
> >> > To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
> >> > With this flag, the process only need to touch the global counter in
> >> > two cases:
> >> > 1 The first time it uses the global huge zero page;
> >> > 2 The time when mm_user of its mm_struct reaches zero.
> >> > 
> >> > Note that right now, the huge zero page is eligible to be freed as soon
> >> > as its last use goes away.  With this patch, the page will not be
> >> > eligible to be freed until the exit of the last process from which it
> >> > was ever used.
> >> > 
> >> > And with the use of mm_user, the kthread is not eligible to use huge
> >> > zero page either. Since no kthread is using huge zero page today, there
> >> > is no difference after applying this patch. But if that is not desired,
> >> > I can change it to when mm_count reaches zero.
> 
> > I suppose we could simply never free the zero huge page - if some
> > process has used it in the past, others will probably use it in the
> > future.  One wonders how useful this optimization is...
> 
> Yeah, what prevents us from doing away with this lock altogether and
> keeping one zero-filled huge page (after a process has used it once)
> forever, to be mapped across all the read faults? Is a 16MB / 2MB huge
> page too much of a memory loss on a THP-enabled system? We can also
> save on allocation time.

Sounds OK to me.  But only if it makes a useful performance benefit to
something that someone cares about!

otoh, that patch is simple enough...

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  3:39       ` Andrew Morton
@ 2016-08-30  5:51         ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-30  5:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 11:39 AM, Andrew Morton wrote:
> On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
> 
>>>> Case used for test on Haswell EP:
>>>> usemem -n 72 --readonly -j 0x200000 100G
>>>> Which spawns 72 processes and each will mmap 100G anonymous space and
>>>> then do read only access to that space sequentially with a step of 2MB.
>>>>
>>>> perf report for base commit:
>>>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
>>>> perf report for this commit:
>>>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
>>>
>>> Does this mean that overall usemem runtime halved?
>>
>> Sorry for the confusion, the above line is extracted from perf report.
>> It shows the percent of CPU cycles executed in a specific function.
>>
>> The above two perf lines are used to show get_huge_zero_page doesn't
>> consume that much CPU cycles after applying the patch.
>>
>>>
>>> Do we have any numbers for something which is more real-wordly?
>>
>> Unfortunately, no real world numbers.
>>
>> We think the global atomic counter could be an issue for performance
>> so I'm trying to solve the problem.
> 
> So, umm, we don't actually know if the patch is useful to anyone?

It should help when multiple processes are doing read-only anonymous
page faults with THP enabled.

> 
> Some more measurements would help things along, please.
 
In addition to the drop in perf cycles spent in the get_huge_zero_page
function, the throughput of the above workload also increased a lot.

usemem -n 72 --readonly -j 0x200000 100G

base commit
$ cat 7289420fc8e98999c8b7c1c2c888549ccc9aa96f/0/vm-scalability.json 
{
  "vm-scalability.throughput": [
    1784430792
  ],
}

this patch
$ cat a57acb91d1a29efc4cf34ffee09e1cebe93dcd24/0/vm-scalability.json 
{
  "vm-scalability.throughput": [
    4726928591
  ],
}

Throughput-wise, it's a 164% gain.
Runtime-wise, it went from 707592 usecs down to 303970 usecs, a drop of
more than 50%.

Granted, a real-world use case may not be as extreme as this one, so the
gain would be much smaller.
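
For reference, a minimal userspace sketch of the access pattern this
workload exercises (a hypothetical stand-in, not usemem itself): a large
read-only anonymous mapping touched with a 2MB stride, so every touch is
a read fault that can be satisfied by the (huge) zero page.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len  = 1UL << 30;   /* 1G here; usemem above maps 100G per process */
	size_t step = 2UL << 20;   /* 2MB stride, same as -j 0x200000 */
	volatile unsigned long sum = 0;
	char *p;

	p = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Read-only touches: each fault maps the zero page (the huge zero
	 * page when THP is enabled and the range is PMD-aligned). */
	for (size_t off = 0; off < len; off += step)
		sum += p[off];
	printf("sum=%lu\n", sum);
	munmap(p, len);
	return 0;
}

Many such processes faulting concurrently is what makes the zero page
refcounting show up in the profiles above.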

Thanks,
Aaron

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  4:44         ` Anshuman Khandual
@ 2016-08-30  5:54           ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-30  5:54 UTC (permalink / raw)
  To: Anshuman Khandual, Andrew Morton
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 12:44 PM, Anshuman Khandual wrote:
> On 08/30/2016 09:09 AM, Andrew Morton wrote:
>> On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
>>
>>>>> Case used for test on Haswell EP:
>>>>> usemem -n 72 --readonly -j 0x200000 100G
>>>>> Which spawns 72 processes and each will mmap 100G anonymous space and
>>>>> then do read only access to that space sequentially with a step of 2MB.
>>>>>
>>>>> perf report for base commit:
>>>>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
>>>>> perf report for this commit:
>>>>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
>>>>
>>>> Does this mean that overall usemem runtime halved?
>>>
>>> Sorry for the confusion, the above line is extracted from perf report.
>>> It shows the percent of CPU cycles executed in a specific function.
>>>
>>> The above two perf lines are used to show get_huge_zero_page doesn't
>>> consume that much CPU cycles after applying the patch.
>>>
>>>>
>>>> Do we have any numbers for something which is more real-wordly?
>>>
>>> Unfortunately, no real world numbers.
>>>
>>> We think the global atomic counter could be an issue for performance
>>> so I'm trying to solve the problem.
>>
>> So, umm, we don't actually know if the patch is useful to anyone?
> 
> On a POWER system it improves the CPU consumption of the above-mentioned
> function a little bit. I don't think it's going to improve the actual
> throughput of the workload substantially.
> 
> 0.07%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page

I guess this is the base commit? But there shouldn't be the new
mm_get_huge_zero_page symbol before this patch. A typo perhaps?

Regards,
Aaron

> to
> 
> 0.01%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-30  5:54           ` Aaron Lu
@ 2016-08-30  6:47             ` Anshuman Khandual
  -1 siblings, 0 replies; 34+ messages in thread
From: Anshuman Khandual @ 2016-08-30  6:47 UTC (permalink / raw)
  To: Aaron Lu, Anshuman Khandual, Andrew Morton
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 11:24 AM, Aaron Lu wrote:
> On 08/30/2016 12:44 PM, Anshuman Khandual wrote:
>> > On 08/30/2016 09:09 AM, Andrew Morton wrote:
>>> >> On Tue, 30 Aug 2016 11:09:15 +0800 Aaron Lu <aaron.lu@intel.com> wrote:
>>> >>
>>>>>> >>>>> Case used for test on Haswell EP:
>>>>>> >>>>> usemem -n 72 --readonly -j 0x200000 100G
>>>>>> >>>>> Which spawns 72 processes and each will mmap 100G anonymous space and
>>>>>> >>>>> then do read only access to that space sequentially with a step of 2MB.
>>>>>> >>>>>
>>>>>> >>>>> perf report for base commit:
>>>>>> >>>>>     54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
>>>>>> >>>>> perf report for this commit:
>>>>>> >>>>>      0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page
>>>>> >>>>
>>>>> >>>> Does this mean that overall usemem runtime halved?
>>>> >>>
>>>> >>> Sorry for the confusion, the above line is extracted from perf report.
>>>> >>> It shows the percent of CPU cycles executed in a specific function.
>>>> >>>
>>>> >>> The above two perf lines are used to show get_huge_zero_page doesn't
>>>> >>> consume that much CPU cycles after applying the patch.
>>>> >>>
>>>>> >>>>
>>>>> >>>> Do we have any numbers for something which is more real-wordly?
>>>> >>>
>>>> >>> Unfortunately, no real world numbers.
>>>> >>>
>>>> >>> We think the global atomic counter could be an issue for performance
>>>> >>> so I'm trying to solve the problem.
>>> >>
>>> >> So, umm, we don't actually know if the patch is useful to anyone?
>> > 
>> > On a POWER system it improves the CPU consumption of the above-mentioned
>> > function a little bit. I don't think it's going to improve the actual
>> > throughput of the workload substantially.
>> > 
>> > 0.07%  usemem  [kernel.vmlinux]  [k] mm_get_huge_zero_page
> I guess this is the base commit? But there shouldn't be the new
> mm_get_huge_zero_page symbol before this patch. A typo perhaps?

Yeah, sorry about that.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] thp: reduce usage of huge zero page's atomic counter
  2016-08-29  6:31 ` Aaron Lu
@ 2016-08-30 15:59   ` Sergey Senozhatsky
  -1 siblings, 0 replies; 34+ messages in thread
From: Sergey Senozhatsky @ 2016-08-30 15:59 UTC (permalink / raw)
  To: Aaron Lu
  Cc: Linux Memory Management List, 'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Andrew Morton,
	Vlastimil Babka, Jerome Marchand, Andrea Arcangeli, Mel Gorman,
	Ebru Akagunduz, linux-kernel

On (08/29/16 14:31), Aaron Lu wrote:
> 
> The global zero page is used to satisfy an anonymous read fault. If
> THP(Transparent HugePage) is enabled then the global huge zero page is used.
> The global huge zero page uses an atomic counter for reference counting
> and is allocated/freed dynamically according to its counter value.
> 

Hello,

for !CONFIG_TRANSPARENT_HUGEPAGE configs mm_put_huge_zero_page() is BUILD_BUG(),
which gives the following build error (mmots v4.8-rc4-mmots-2016-08-29-16-56)


  CC      kernel/fork.o
In file included from ./include/asm-generic/bug.h:4:0,
                 from ./arch/x86/include/asm/bug.h:35,
                 from ./include/linux/bug.h:4,
                 from ./include/linux/mmdebug.h:4,
                 from ./include/linux/gfp.h:4,
                 from ./include/linux/slab.h:14,
                 from kernel/fork.c:14:
In function ‘mm_put_huge_zero_page’,
    inlined from ‘__mmput’ at kernel/fork.c:777:2,
    inlined from ‘mmput_async_fn’ at kernel/fork.c:806:2:
./include/linux/compiler.h:495:38: error: call to ‘__compiletime_assert_218’ declared with attribute error: BUILD_BUG failed
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
                                      ^
./include/linux/compiler.h:478:4: note: in definition of macro ‘__compiletime_assert’
    prefix ## suffix();    \
    ^~~~~~
./include/linux/compiler.h:495:2: note: in expansion of macro ‘_compiletime_assert’
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^~~~~~~~~~~~~~~~~~~
./include/linux/bug.h:51:37: note: in expansion of macro ‘compiletime_assert’
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                     ^~~~~~~~~~~~~~~~~~
./include/linux/bug.h:85:21: note: in expansion of macro ‘BUILD_BUG_ON_MSG’
 #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
                     ^~~~~~~~~~~~~~~~
./include/linux/huge_mm.h:218:2: note: in expansion of macro ‘BUILD_BUG’
  BUILD_BUG();
  ^~~~~~~~~
In function ‘mm_put_huge_zero_page’,
    inlined from ‘__mmput’ at kernel/fork.c:777:2,
    inlined from ‘mmput’ at kernel/fork.c:798:3:
./include/linux/compiler.h:495:38: error: call to ‘__compiletime_assert_218’ declared with attribute error: BUILD_BUG failed
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
                                      ^
./include/linux/compiler.h:478:4: note: in definition of macro ‘__compiletime_assert’
    prefix ## suffix();    \
    ^~~~~~
./include/linux/compiler.h:495:2: note: in expansion of macro ‘_compiletime_assert’
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^~~~~~~~~~~~~~~~~~~
./include/linux/bug.h:51:37: note: in expansion of macro ‘compiletime_assert’
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                     ^~~~~~~~~~~~~~~~~~
./include/linux/bug.h:85:21: note: in expansion of macro ‘BUILD_BUG_ON_MSG’
 #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
                     ^~~~~~~~~~~~~~~~
./include/linux/huge_mm.h:218:2: note: in expansion of macro ‘BUILD_BUG’
  BUILD_BUG();
  ^~~~~~~~~
make[1]: *** [scripts/Makefile.build:291: kernel/fork.o] Error 1
make: *** [Makefile:968: kernel] Error 2


	-ss

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2] thp: reduce usage of huge zero page's atomic counter
  2016-08-30 15:59   ` Sergey Senozhatsky
@ 2016-08-31  2:08     ` Aaron Lu
  -1 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-31  2:08 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Linux Memory Management List,
	'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 11:59 PM, Sergey Senozhatsky wrote:
> Hello,
> 
> for !CONFIG_TRANSPARENT_HUGEPAGE configs mm_put_huge_zero_page() is BUILD_BUG(),
> which gives the following build error (mmots v4.8-rc4-mmots-2016-08-29-16-56)

My bad, I misunderstood BUILD_BUG: since mm_put_huge_zero_page will not
be eliminated at compile time, it's not appropriate to use BUILD_BUG
here.

Thanks for the note, I have changed the BUILD_BUG to "return;".

In the meantime, I have also added performance and runtime change data
to the changelog.


From: Aaron Lu <aaron.lu@intel.com>
Date: Fri, 17 Jun 2016 17:13:08 +0800
Subject: [PATCH v2] thp: reduce usage of huge zero page's atomic counter

The global zero page is used to satisfy an anonymous read fault. If
THP(Transparent HugePage) is enabled then the global huge zero page is used.
The global huge zero page uses an atomic counter for reference counting
and is allocated/freed dynamically according to its counter value.

CPU time spent on that counter will greatly increase if there are
a lot of processes doing anonymous read faults. This patch proposes a
way to reduce the access to the global counter so that the CPU load
can be reduced accordingly.

To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
With this flag, the process only need to touch the global counter in
two cases:
1 The first time it uses the global huge zero page;
2 The time when mm_user of its mm_struct reaches zero.

Note that right now, the huge zero page is eligible to be freed as soon
as its last use goes away.  With this patch, the page will not be
eligible to be freed until the exit of the last process from which it
was ever used.

And with the use of mm_user, the kthread is not eligible to use huge
zero page either. Since no kthread is using huge zero page today, there
is no difference after applying this patch. But if that is not desired,
I can change it to when mm_count reaches zero.

Case used for test on Haswell EP:
usemem -n 72 --readonly -j 0x200000 100G
Which spawns 72 processes and each will mmap 100G anonymous space and
then do read only access to that space sequentially with a step of 2MB.

CPU cycles from perf report for base commit:
    54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
CPU cycles from perf report for this commit:
     0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Performance(throughput) of the workload for base commit: 1784430792
Performance(throughput) of the workload for this commit: 4726928591
164% increase.

Runtime of the workload for base commit: 707592 us
Runtime of the workload for this commit: 303970 us
50% drop.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 fs/dax.c                |  2 +-
 include/linux/huge_mm.h |  8 ++++----
 include/linux/sched.h   |  1 +
 kernel/fork.c           |  1 +
 mm/huge_memory.c        | 36 +++++++++++++++++++++++++-----------
 mm/swap.c               |  4 +---
 mm/swap_state.c         |  4 +---
 7 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..226c0d5eedac 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1034,7 +1034,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
-		struct page *zero_page = get_huge_zero_page();
+		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
 		if (unlikely(!zero_page)) {
 			dax_pmd_dbg(&bh, address, "no zero page");
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6f14de45b5ce..9e6ab7eeaf17 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -152,8 +152,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -213,9 +213,9 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
-	BUILD_BUG();
+	return;
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7e1e783cf01..02246a70b63c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_REAPED		21	/* mm has been already reaped */
 #define MMF_OOM_NOT_REAPABLE	22	/* mm couldn't be reaped */
+#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..372e02616b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -711,6 +711,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2db2112aa31e..d88bb1ec6fad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -601,7 +621,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +643,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +798,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1038,7 +1056,6 @@ alloc:
 		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 		if (!page) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-			put_huge_zero_page();
 		} else {
 			VM_BUG_ON_PAGE(!PageHead(page), page);
 			page_remove_rmap(page, true);
@@ -1502,7 +1519,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1525,8 +1541,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
diff --git a/mm/swap.c b/mm/swap.c
index 75c63bb2a1da..4dcf852e1e6d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 			locked_pgdat = NULL;
 		}
 
-		if (is_huge_zero_page(page)) {
-			put_huge_zero_page();
+		if (is_huge_zero_page(page))
 			continue;
-		}
 
 		page = compound_head(page);
 		if (!put_page_testzero(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c8310a37be3a..5ffd3ee26592 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -252,9 +252,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
 	free_swap_cache(page);
-	if (is_huge_zero_page(page))
-		put_huge_zero_page();
-	else
+	if (!is_huge_zero_page(page))
 		put_page(page);
 }
 
-- 
2.5.5

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH v2] thp: reduce usage of huge zero page's atomic counter
@ 2016-08-31  2:08     ` Aaron Lu
  0 siblings, 0 replies; 34+ messages in thread
From: Aaron Lu @ 2016-08-31  2:08 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Linux Memory Management List,
	'Kirill A. Shutemov',
	Dave Hansen, Tim Chen, Huang Ying, Vlastimil Babka,
	Jerome Marchand, Andrea Arcangeli, Mel Gorman, Ebru Akagunduz,
	linux-kernel

On 08/30/2016 11:59 PM, Sergey Senozhatsky wrote:
> Hello,
> 
> for !CONFIG_TRANSPARENT_HUGEPAGE configs mm_put_huge_zero_page() is BUILD_BUG(),
> which gives the following build error (mmots v4.8-rc4-mmots-2016-08-29-16-56)

My bad, I misunderstood BUILD_BUG: since mm_put_huge_zero_page will not
be eliminated at compile time, it's not appropriate to use BUILD_BUG
here.

Thanks for the note, I have changed the BUILD_BUG to "return;".

In the meantime, I have also added performance and runtime change data
to the changelog.


From: Aaron Lu <aaron.lu@intel.com>
Date: Fri, 17 Jun 2016 17:13:08 +0800
Subject: [PATCH v2] thp: reduce usage of huge zero page's atomic counter

The global zero page is used to satisfy an anonymous read fault. If
THP(Transparent HugePage) is enabled then the global huge zero page is used.
The global huge zero page uses an atomic counter for reference counting
and is allocated/freed dynamically according to its counter value.

CPU time spent on that counter will greatly increase if there are
a lot of processes doing anonymous read faults. This patch proposes a
way to reduce the access to the global counter so that the CPU load
can be reduced accordingly.

To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE.
With this flag, the process only need to touch the global counter in
two cases:
1 The first time it uses the global huge zero page;
2 The time when mm_user of its mm_struct reaches zero.

Note that right now, the huge zero page is eligible to be freed as soon
as its last use goes away.  With this patch, the page will not be
eligible to be freed until the exit of the last process from which it
was ever used.

And with the use of mm_user, the kthread is not eligible to use huge
zero page either. Since no kthread is using huge zero page today, there
is no difference after applying this patch. But if that is not desired,
I can change it to when mm_count reaches zero.

Case used for test on Haswell EP:
usemem -n 72 --readonly -j 0x200000 100G
Which spawns 72 processes and each will mmap 100G anonymous space and
then do read only access to that space sequentially with a step of 2MB.

CPU cycles from perf report for base commit:
    54.03%  usemem   [kernel.kallsyms]   [k] get_huge_zero_page
CPU cycles from perf report for this commit:
     0.11%  usemem   [kernel.kallsyms]   [k] mm_get_huge_zero_page

Performance(throughput) of the workload for base commit: 1784430792
Performance(throughput) of the workload for this commit: 4726928591
164% increase.

Runtime of the workload for base commit: 707592 us
Runtime of the workload for this commit: 303970 us
~57% drop.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
---
 fs/dax.c                |  2 +-
 include/linux/huge_mm.h |  8 ++++----
 include/linux/sched.h   |  1 +
 kernel/fork.c           |  1 +
 mm/huge_memory.c        | 36 +++++++++++++++++++++++++-----------
 mm/swap.c               |  4 +---
 mm/swap_state.c         |  4 +---
 7 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..226c0d5eedac 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1034,7 +1034,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
-		struct page *zero_page = get_huge_zero_page();
+		struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm);
 
 		if (unlikely(!zero_page)) {
 			dax_pmd_dbg(&bh, address, "no zero page");
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 6f14de45b5ce..9e6ab7eeaf17 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -152,8 +152,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 	return is_huge_zero_page(pmd_page(pmd));
 }
 
-struct page *get_huge_zero_page(void);
-void put_huge_zero_page(void);
+struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+void mm_put_huge_zero_page(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -213,9 +213,9 @@ static inline bool is_huge_zero_page(struct page *page)
 	return false;
 }
 
-static inline void put_huge_zero_page(void)
+static inline void mm_put_huge_zero_page(struct mm_struct *mm)
 {
-	BUILD_BUG();
+	return;
 }
 
 static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d7e1e783cf01..02246a70b63c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_REAPED		21	/* mm has been already reaped */
 #define MMF_OOM_NOT_REAPABLE	22	/* mm couldn't be reaped */
+#define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 52e725d4a866..372e02616b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -711,6 +711,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
+	mm_put_huge_zero_page(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2db2112aa31e..d88bb1ec6fad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker;
 static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 
-struct page *get_huge_zero_page(void)
+static struct page *get_huge_zero_page(void)
 {
 	struct page *zero_page;
 retry:
@@ -86,7 +86,7 @@ retry:
 	return READ_ONCE(huge_zero_page);
 }
 
-void put_huge_zero_page(void)
+static void put_huge_zero_page(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -95,6 +95,26 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
+struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		return READ_ONCE(huge_zero_page);
+
+	if (!get_huge_zero_page())
+		return NULL;
+
+	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+
+	return READ_ONCE(huge_zero_page);
+}
+
+void mm_put_huge_zero_page(struct mm_struct *mm)
+{
+	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+		put_huge_zero_page();
+}
+
 static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 					struct shrink_control *sc)
 {
@@ -601,7 +621,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 		pgtable = pte_alloc_one(vma->vm_mm, haddr);
 		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(vma->vm_mm);
 		if (unlikely(!zero_page)) {
 			pte_free(vma->vm_mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
@@ -623,10 +643,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
 			}
 		} else
 			spin_unlock(fe->ptl);
-		if (!set) {
+		if (!set)
 			pte_free(vma->vm_mm, pgtable);
-			put_huge_zero_page();
-		}
 		return ret;
 	}
 	gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -780,7 +798,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		zero_page = get_huge_zero_page();
+		zero_page = mm_get_huge_zero_page(dst_mm);
 		set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
 				zero_page);
 		ret = 0;
@@ -1038,7 +1056,6 @@ alloc:
 		update_mmu_cache_pmd(vma, fe->address, fe->pmd);
 		if (!page) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-			put_huge_zero_page();
 		} else {
 			VM_BUG_ON_PAGE(!PageHead(page), page);
 			page_remove_rmap(page, true);
@@ -1502,7 +1519,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	}
 	smp_wmb(); /* make pte visible before pmd */
 	pmd_populate(mm, pmd, pgtable);
-	put_huge_zero_page();
 }
 
 static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1525,8 +1541,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
 	if (!vma_is_anonymous(vma)) {
 		_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-		if (is_huge_zero_pmd(_pmd))
-			put_huge_zero_page();
 		if (vma_is_dax(vma))
 			return;
 		page = pmd_page(_pmd);
diff --git a/mm/swap.c b/mm/swap.c
index 75c63bb2a1da..4dcf852e1e6d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold)
 			locked_pgdat = NULL;
 		}
 
-		if (is_huge_zero_page(page)) {
-			put_huge_zero_page();
+		if (is_huge_zero_page(page))
 			continue;
-		}
 
 		page = compound_head(page);
 		if (!put_page_testzero(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c8310a37be3a..5ffd3ee26592 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -252,9 +252,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
 	free_swap_cache(page);
-	if (is_huge_zero_page(page))
-		put_huge_zero_page();
-	else
+	if (!is_huge_zero_page(page))
 		put_page(page);
 }
 
-- 
2.5.5


^ permalink raw reply related	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2016-08-31  2:09 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-29  6:31 [PATCH] thp: reduce usage of huge zero page's atomic counter Aaron Lu
2016-08-29  6:31 ` Aaron Lu
2016-08-29  8:49 ` Anshuman Khandual
2016-08-29  8:49   ` Anshuman Khandual
2016-08-29  8:53   ` Aaron Lu
2016-08-29  8:53     ` Aaron Lu
2016-08-29 13:47     ` Anshuman Khandual
2016-08-29 13:47       ` Anshuman Khandual
2016-08-29 14:10       ` Aaron Lu
2016-08-29 14:10         ` Aaron Lu
2016-08-29 22:50 ` Andrew Morton
2016-08-29 22:50   ` Andrew Morton
2016-08-30  3:09   ` Aaron Lu
2016-08-30  3:09     ` Aaron Lu
2016-08-30  3:39     ` Andrew Morton
2016-08-30  3:39       ` Andrew Morton
2016-08-30  4:44       ` Anshuman Khandual
2016-08-30  4:44         ` Anshuman Khandual
2016-08-30  4:56         ` Andrew Morton
2016-08-30  4:56           ` Andrew Morton
2016-08-30  5:54         ` Aaron Lu
2016-08-30  5:54           ` Aaron Lu
2016-08-30  6:47           ` Anshuman Khandual
2016-08-30  6:47             ` Anshuman Khandual
2016-08-30  5:51       ` Aaron Lu
2016-08-30  5:51         ` Aaron Lu
2016-08-30  5:14   ` Anshuman Khandual
2016-08-30  5:14     ` Anshuman Khandual
2016-08-30  5:19     ` Andrew Morton
2016-08-30  5:19       ` Andrew Morton
2016-08-30 15:59 ` Sergey Senozhatsky
2016-08-30 15:59   ` Sergey Senozhatsky
2016-08-31  2:08   ` [PATCH v2] " Aaron Lu
2016-08-31  2:08     ` Aaron Lu
