Linux-mm Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] mm: clear 1G pages with streaming stores on x86
@ 2020-03-07  1:03 Cannon Matthews
  2020-03-07 15:36 ` kbuild test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Cannon Matthews @ 2020-03-07  1:03 UTC (permalink / raw)
  To: Mike Kravetz, Andrew Morton
  Cc: Matthew Wilcox, Michal Hocko, David Rientjes, Greg Thelen,
	Salman Qazi, linux-mm, linux-kernel, Cannon Matthews

Reimplement clear_gigantic_page() to clear gigantic pages using the
non-temporal streaming store instructions that bypass the cache
(movnti), since an entire 1GiB region will not fit in the cache anyway.

Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
and optimizing the control flow over the constituent small pages, this
can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
taking only 34 seconds on average, or 67ms/GiB.

The assembly code for the clear_page_nt routine is more or less
taken directly from the output of gcc with -O3 for this function with
some tweaks to support arbitrary sizes and moving memory barriers:

void clear_page_nt_64i (void *page)
{
  for (int i = 0; i < GiB /sizeof(long long int); ++i)
    {
      _mm_stream_si64 (((long long int*)page) + i, 0);
    }
  sfence();
}

Tested:
	Time to `mlock()` a 512GiB region on broadwell CPU
				AVG time (s)	% imp.	ms/page
	clear_page_erms		133.584		-	261
	clear_page_nt		34.154		74.43%	67

An earlier version of this code was sent as an RFC patch ~July 2018
https://patchwork.kernel.org/patch/10543193/ but never merged.

Signed-off-by: Cannon Matthews <cannonmatthews@google.com>
---
 MAINTAINERS                        |  1 +
 arch/x86/Kconfig                   |  4 ++++
 arch/x86/include/asm/page_64.h     |  1 +
 arch/x86/lib/Makefile              |  2 +-
 arch/x86/lib/clear_gigantic_page.c | 28 ++++++++++++++++++++++++++++
 arch/x86/lib/clear_page_64.S       | 19 +++++++++++++++++++
 include/linux/mm.h                 |  2 ++
 mm/memory.c                        |  2 ++
 8 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/lib/clear_gigantic_page.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 68eebf3650ac..efe84f085404 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7702,6 +7702,7 @@ S:	Maintained
 F:	fs/hugetlbfs/
 F:	mm/hugetlb.c
 F:	include/linux/hugetlb.h
+F:	arch/x86/lib/clear_gigantic_page.c
 F:	Documentation/admin-guide/mm/hugetlbpage.rst
 F:	Documentation/vm/hugetlbfs_reserv.rst
 F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index beea77046f9b..f49e7b6f6851 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -70,6 +70,7 @@ config X86
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
+	select ARCH_HAS_CLEAR_GIGANTIC_PAGE	if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
@@ -290,6 +291,9 @@ config ARCH_MAY_HAVE_PC_FDC
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
+config ARCH_HAS_CLEAR_GIGANTIC_PAGE
+	bool
+
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 939b1cff4a7b..6ea60883b6d6 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -55,6 +55,7 @@ static inline void clear_page(void *page)
 }
 
 void copy_page(void *to, void *from);
+void clear_page_nt(void *page, u64 page_size);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 5246db42de45..a620c6636210 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -56,7 +56,7 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
-        lib-y += clear_page_64.o copy_page_64.o
+        lib-y += clear_page_64.o copy_page_64.o clear_gigantic_page.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/clear_gigantic_page.c b/arch/x86/lib/clear_gigantic_page.c
new file mode 100644
index 000000000000..6fcb494ec9bc
--- /dev/null
+++ b/arch/x86/lib/clear_gigantic_page.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/page.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+
+void clear_gigantic_page(struct page *page, unsigned long addr,
+			 unsigned int pages)
+{
+	int i;
+	void *dest = page_to_virt(page);
+
+	/*
+	 * cond_resched() every 2M. Hypothetical page sizes not divisible by
+	 * this are not supported.
+	 */
+	BUG_ON(pages % HPAGE_PMD_NR != 0);
+	for (i = 0; i < pages; i += HPAGE_PMD_NR) {
+		clear_page_nt(dest + (i * PAGE_SIZE), HPAGE_PMD_NR * PAGE_SIZE);
+		cond_resched();
+	}
+	/* clear_page_nt requires an `sfence` barrier. */
+	wmb();
+}
+#endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..1224094fd863 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -50,3 +50,22 @@ SYM_FUNC_START(clear_page_erms)
 	ret
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Zero memory using non temporal stores, bypassing the cache.
+ * Requires an `sfence` (wmb()) afterwards.
+ * %rdi - destination.
+ * %rsi - page size. Must be 64 bit aligned.
+*/
+SYM_FUNC_START(clear_page_nt)
+	leaq	(%rdi,%rsi), %rdx
+	xorl	%eax, %eax
+	.p2align 4,,10
+	.p2align 3
+.L2:
+	movnti	%rax, (%rdi)
+	addq	$8, %rdi
+	cmpq	%rdx, %rdi
+	jne	.L2
+	ret
+SYM_FUNC_END(clear_page_nt)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..a57f9007374b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2856,6 +2856,8 @@ enum mf_action_page_type {
 };
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+extern void clear_gigantic_page(struct page *page, unsigned long addr,
+				unsigned int pages);
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr_hint,
 			    unsigned int pages_per_huge_page);
diff --git a/mm/memory.c b/mm/memory.c
index e8bfdf0d9d1d..2a13bf102890 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4706,6 +4706,7 @@ static inline void process_huge_page(
 	}
 }
 
+#ifndef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
 static void clear_gigantic_page(struct page *page,
 				unsigned long addr,
 				unsigned int pages_per_huge_page)
@@ -4720,6 +4721,7 @@ static void clear_gigantic_page(struct page *page,
 		clear_user_highpage(p, addr + i * PAGE_SIZE);
 	}
 }
+#endif  /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */
 
 static void clear_subpage(unsigned long addr, int idx, void *arg)
 {
-- 
2.25.1.481.gfbce0eb801-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-07  1:03 [PATCH] mm: clear 1G pages with streaming stores on x86 Cannon Matthews
@ 2020-03-07 15:36 ` kbuild test robot
  2020-03-07 22:06 ` Andrew Morton
  2020-03-09  0:08 ` Kirill A. Shutemov
  2 siblings, 0 replies; 24+ messages in thread
From: kbuild test robot @ 2020-03-07 15:36 UTC (permalink / raw)
  To: Cannon Matthews
  Cc: kbuild-all, clang-built-linux, Linux Memory Management List

[-- Attachment #1: Type: text/plain, Size: 1510 bytes --]

Hi Cannon,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/x86/core]
[also build test ERROR on linux/master linus/master v5.6-rc4 next-20200306]
[cannot apply to mmotm/master]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Cannon-Matthews/mm-clear-1G-pages-with-streaming-stores-on-x86/20200307-090542
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 248ed51048c40d36728e70914e38bffd7821da57
config: x86_64-defconfig (attached as .config)
compiler: clang version 11.0.0 (git://gitmirror/llvm_project 65b21282c710afe9c275778820c6e3c1cf46734b)
reproduce:
        # FIXME the reproduce steps for clang is not ready yet

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: arch/x86/lib/clear_gigantic_page.o: in function `clear_gigantic_page':
>> clear_gigantic_page.c:(.text+0x1a): undefined reference to `__compiletime_assert_20'
>> ld: clear_gigantic_page.c:(.text+0x31): undefined reference to `__compiletime_assert_22'
   ld: clear_gigantic_page.c:(.text+0x45): undefined reference to `__compiletime_assert_21'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 28834 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-07  1:03 [PATCH] mm: clear 1G pages with streaming stores on x86 Cannon Matthews
  2020-03-07 15:36 ` kbuild test robot
@ 2020-03-07 22:06 ` Andrew Morton
  2020-03-09  0:08 ` Kirill A. Shutemov
  2 siblings, 0 replies; 24+ messages in thread
From: Andrew Morton @ 2020-03-07 22:06 UTC (permalink / raw)
  To: Cannon Matthews
  Cc: Mike Kravetz, Matthew Wilcox, Michal Hocko, David Rientjes,
	Greg Thelen, Salman Qazi, linux-mm, linux-kernel, Ingo Molnar,
	Thomas Gleixner, H. Peter Anvin, Peter Zijlstra

On Fri,  6 Mar 2020 17:03:53 -0800 Cannon Matthews <cannonmatthews@google.com> wrote:

> Reimplement clear_gigantic_page() to clear gigabytes pages using the
> non-temporal streaming store instructions that bypass the cache
> (movnti), since an entire 1GiB region will not fit in the cache anyway.
> 
> Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> and optimizing the control flow over the constituent small pages, this
> can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> taking only 34 seconds on average, or 67ms/GiB.
> 
> The assembly code for the __clear_page_nt routine is more or less
> taken directly from the output of gcc with -O3 for this function with
> some tweaks to support arbitrary sizes and moving memory barriers:
> 
> void clear_page_nt_64i (void *page)
> {
>   for (int i = 0; i < GiB /sizeof(long long int); ++i)
>     {
>       _mm_stream_si64 (((long long int*)page) + i, 0);
>     }
>   sfence();
> }
> 
> Tested:
> 	Time to `mlock()` a 512GiB region on broadwell CPU
> 				AVG time (s)	% imp.	ms/page
> 	clear_page_erms		133.584		-	261
> 	clear_page_nt		34.154		74.43%	67
> 
> An earlier version of this code was sent as an RFC patch ~July 2018
> https://patchwork.kernel.org/patch/10543193/ but never merged.
> 
> ...
>
>  MAINTAINERS                        |  1 +
>  arch/x86/Kconfig                   |  4 ++++
>  arch/x86/include/asm/page_64.h     |  1 +
>  arch/x86/lib/Makefile              |  2 +-
>  arch/x86/lib/clear_gigantic_page.c | 28 ++++++++++++++++++++++++++++
>  arch/x86/lib/clear_page_64.S       | 19 +++++++++++++++++++
>  include/linux/mm.h                 |  2 ++
>  mm/memory.c                        |  2 ++

Please cc the x86 maintainers on such things.

>
> ...
>
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -70,6 +70,7 @@ config X86
>  	select ARCH_HAS_KCOV			if X86_64
>  	select ARCH_HAS_MEM_ENCRYPT
>  	select ARCH_HAS_MEMBARRIER_SYNC_CORE
> +	select ARCH_HAS_CLEAR_GIGANTIC_PAGE	if X86_64
>  	select ARCH_HAS_PMEM_API		if X86_64
>  	select ARCH_HAS_PTE_DEVMAP		if X86_64
>  	select ARCH_HAS_PTE_SPECIAL
> @@ -290,6 +291,9 @@ config ARCH_MAY_HAVE_PC_FDC
>  config GENERIC_CALIBRATE_DELAY
>  	def_bool y
>  
> +config ARCH_HAS_CLEAR_GIGANTIC_PAGE
> +	bool

Opinions might differ, but I believe the Linus-approved way of doing
this sort of thing is

#ifndef clear_gigantic_page
extern void clear_gigantic_page(...);
#define clear_gigantic_page clear_gigantic_page
#endif

alternatively, __weak works nicely.

>
> ...
>
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2856,6 +2856,8 @@ enum mf_action_page_type {
>  };
>  
>  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
> +extern void clear_gigantic_page(struct page *page, unsigned long addr,
> +				unsigned int pages);

Shouldn't this be inside #ifdef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE?

Also, the third argument here is called "pages" but here:

> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4706,6 +4706,7 @@ static inline void process_huge_page(
>  	}
>  }
>  
> +#ifndef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
>  static void clear_gigantic_page(struct page *page,
>  				unsigned long addr,
>  				unsigned int pages_per_huge_page)

it is called "pages_per_huge_page".

> @@ -4720,6 +4721,7 @@ static void clear_gigantic_page(struct page *page,
>  		clear_user_highpage(p, addr + i * PAGE_SIZE);
>  	}
>  }
> +#endif  /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */
>  
> ...


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-07  1:03 [PATCH] mm: clear 1G pages with streaming stores on x86 Cannon Matthews
  2020-03-07 15:36 ` kbuild test robot
  2020-03-07 22:06 ` Andrew Morton
@ 2020-03-09  0:08 ` Kirill A. Shutemov
  2020-03-09  9:06   ` Michal Hocko
  2020-03-09 15:33   ` Andi Kleen
  2 siblings, 2 replies; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-09  0:08 UTC (permalink / raw)
  To: Cannon Matthews
  Cc: Mike Kravetz, Andrew Morton, Matthew Wilcox, Michal Hocko,
	David Rientjes, Greg Thelen, Salman Qazi, linux-mm, linux-kernel,
	ak, x86

On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
> Reimplement clear_gigantic_page() to clear gigabytes pages using the
> non-temporal streaming store instructions that bypass the cache
> (movnti), since an entire 1GiB region will not fit in the cache anyway.
> 
> Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> and optimizing the control flow over the constituent small pages, this
> can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> taking only 34 seconds on average, or 67ms/GiB.
> 
> The assembly code for the __clear_page_nt routine is more or less
> taken directly from the output of gcc with -O3 for this function with
> some tweaks to support arbitrary sizes and moving memory barriers:
> 
> void clear_page_nt_64i (void *page)
> {
>   for (int i = 0; i < GiB /sizeof(long long int); ++i)
>     {
>       _mm_stream_si64 (((long long int*)page) + i, 0);
>     }
>   sfence();
> }
> 
> Tested:
> 	Time to `mlock()` a 512GiB region on broadwell CPU
> 				AVG time (s)	% imp.	ms/page
> 	clear_page_erms		133.584		-	261
> 	clear_page_nt		34.154		74.43%	67

Some macrobenchmark would be great too.

> An earlier version of this code was sent as an RFC patch ~July 2018
> https://patchwork.kernel.org/patch/10543193/ but never merged.

Andi and I tried to use MOVNTI for large/gigantic page clearing back in
2012[1]. Maybe it can be useful.

That patchset is somewhat more complex trying to keep the memory around
the fault address hot in cache. In theory it should help to reduce latency
on the first access to the memory.

I was not able to get convincing numbers back then for the hardware of the
time. Maybe it's better now.

[1] https://lore.kernel.org/r/1345470757-12005-1-git-send-email-kirill.shutemov@linux.intel.com

> 
> Signed-off-by: Cannon Matthews <cannonmatthews@google.com>
> ---
>  MAINTAINERS                        |  1 +
>  arch/x86/Kconfig                   |  4 ++++
>  arch/x86/include/asm/page_64.h     |  1 +
>  arch/x86/lib/Makefile              |  2 +-
>  arch/x86/lib/clear_gigantic_page.c | 28 ++++++++++++++++++++++++++++
>  arch/x86/lib/clear_page_64.S       | 19 +++++++++++++++++++
>  include/linux/mm.h                 |  2 ++
>  mm/memory.c                        |  2 ++
>  8 files changed, 58 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/lib/clear_gigantic_page.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 68eebf3650ac..efe84f085404 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7702,6 +7702,7 @@ S:	Maintained
>  F:	fs/hugetlbfs/
>  F:	mm/hugetlb.c
>  F:	include/linux/hugetlb.h
> +F:	arch/x86/lib/clear_gigantic_page.c
>  F:	Documentation/admin-guide/mm/hugetlbpage.rst
>  F:	Documentation/vm/hugetlbfs_reserv.rst
>  F:	Documentation/ABI/testing/sysfs-kernel-mm-hugepages
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index beea77046f9b..f49e7b6f6851 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -70,6 +70,7 @@ config X86
>  	select ARCH_HAS_KCOV			if X86_64
>  	select ARCH_HAS_MEM_ENCRYPT
>  	select ARCH_HAS_MEMBARRIER_SYNC_CORE
> +	select ARCH_HAS_CLEAR_GIGANTIC_PAGE	if X86_64
>  	select ARCH_HAS_PMEM_API		if X86_64
>  	select ARCH_HAS_PTE_DEVMAP		if X86_64
>  	select ARCH_HAS_PTE_SPECIAL
> @@ -290,6 +291,9 @@ config ARCH_MAY_HAVE_PC_FDC
>  config GENERIC_CALIBRATE_DELAY
>  	def_bool y
>  
> +config ARCH_HAS_CLEAR_GIGANTIC_PAGE
> +	bool
> +
>  config ARCH_HAS_CPU_RELAX
>  	def_bool y
>  
> diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
> index 939b1cff4a7b..6ea60883b6d6 100644
> --- a/arch/x86/include/asm/page_64.h
> +++ b/arch/x86/include/asm/page_64.h
> @@ -55,6 +55,7 @@ static inline void clear_page(void *page)
>  }
>  
>  void copy_page(void *to, void *from);
> +void clear_page_nt(void *page, u64 page_size);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index 5246db42de45..a620c6636210 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -56,7 +56,7 @@ endif
>  else
>          obj-y += iomap_copy_64.o
>          lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
> -        lib-y += clear_page_64.o copy_page_64.o
> +        lib-y += clear_page_64.o copy_page_64.o clear_gigantic_page.o
>          lib-y += memmove_64.o memset_64.o
>          lib-y += copy_user_64.o
>  	lib-y += cmpxchg16b_emu.o
> diff --git a/arch/x86/lib/clear_gigantic_page.c b/arch/x86/lib/clear_gigantic_page.c
> new file mode 100644
> index 000000000000..6fcb494ec9bc
> --- /dev/null
> +++ b/arch/x86/lib/clear_gigantic_page.c
> @@ -0,0 +1,28 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <asm/page.h>
> +
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/sched.h>
> +
> +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
> +
> +void clear_gigantic_page(struct page *page, unsigned long addr,
> +			 unsigned int pages)
> +{
> +	int i;
> +	void *dest = page_to_virt(page);
> +
> +	/*
> +	 * cond_resched() every 2M. Hypothetical page sizes not divisible by
> +	 * this are not supported.
> +	 */
> +	BUG_ON(pages % HPAGE_PMD_NR != 0);
> +	for (i = 0; i < pages; i += HPAGE_PMD_NR) {
> +		clear_page_nt(dest + (i * PAGE_SIZE), HPAGE_PMD_NR * PAGE_SIZE);
> +		cond_resched();
> +	}
> +	/* clear_page_nt requires an `sfence` barrier. */
> +	wmb();
> +}
> +#endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) */
> diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
> index c4c7dd115953..1224094fd863 100644
> --- a/arch/x86/lib/clear_page_64.S
> +++ b/arch/x86/lib/clear_page_64.S
> @@ -50,3 +50,22 @@ SYM_FUNC_START(clear_page_erms)
>  	ret
>  SYM_FUNC_END(clear_page_erms)
>  EXPORT_SYMBOL_GPL(clear_page_erms)
> +
> +/*
> + * Zero memory using non temporal stores, bypassing the cache.
> + * Requires an `sfence` (wmb()) afterwards.
> + * %rdi - destination.
> + * %rsi - page size. Must be 64 bit aligned.
> +*/
> +SYM_FUNC_START(clear_page_nt)
> +	leaq	(%rdi,%rsi), %rdx
> +	xorl	%eax, %eax
> +	.p2align 4,,10
> +	.p2align 3
> +.L2:
> +	movnti	%rax, (%rdi)
> +	addq	$8, %rdi
> +	cmpq	%rdx, %rdi
> +	jne	.L2
> +	ret
> +SYM_FUNC_END(clear_page_nt)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c54fb96cb1e6..a57f9007374b 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2856,6 +2856,8 @@ enum mf_action_page_type {
>  };
>  
>  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
> +extern void clear_gigantic_page(struct page *page, unsigned long addr,
> +				unsigned int pages);
>  extern void clear_huge_page(struct page *page,
>  			    unsigned long addr_hint,
>  			    unsigned int pages_per_huge_page);
> diff --git a/mm/memory.c b/mm/memory.c
> index e8bfdf0d9d1d..2a13bf102890 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4706,6 +4706,7 @@ static inline void process_huge_page(
>  	}
>  }
>  
> +#ifndef CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE
>  static void clear_gigantic_page(struct page *page,
>  				unsigned long addr,
>  				unsigned int pages_per_huge_page)
> @@ -4720,6 +4721,7 @@ static void clear_gigantic_page(struct page *page,
>  		clear_user_highpage(p, addr + i * PAGE_SIZE);
>  	}
>  }
> +#endif  /* CONFIG_ARCH_HAS_CLEAR_GIGANTIC_PAGE */
>  
>  static void clear_subpage(unsigned long addr, int idx, void *arg)
>  {
> -- 
> 2.25.1.481.gfbce0eb801-goog
> 
> 

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09  0:08 ` Kirill A. Shutemov
@ 2020-03-09  9:06   ` Michal Hocko
  2020-03-09  9:35     ` Kirill A. Shutemov
                       ` (2 more replies)
  2020-03-09 15:33   ` Andi Kleen
  1 sibling, 3 replies; 24+ messages in thread
From: Michal Hocko @ 2020-03-09  9:06 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cannon Matthews, Mike Kravetz, Andrew Morton, Matthew Wilcox,
	David Rientjes, Greg Thelen, Salman Qazi, linux-mm, linux-kernel,
	ak, x86

On Mon 09-03-20 03:08:20, Kirill A. Shutemov wrote:
> On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
> > Reimplement clear_gigantic_page() to clear gigabytes pages using the
> > non-temporal streaming store instructions that bypass the cache
> > (movnti), since an entire 1GiB region will not fit in the cache anyway.
> > 
> > Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> > average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> > and optimizing the control flow over the constituent small pages, this
> > can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> > taking only 34 seconds on average, or 67ms/GiB.
> > 
> > The assembly code for the __clear_page_nt routine is more or less
> > taken directly from the output of gcc with -O3 for this function with
> > some tweaks to support arbitrary sizes and moving memory barriers:
> > 
> > void clear_page_nt_64i (void *page)
> > {
> >   for (int i = 0; i < GiB /sizeof(long long int); ++i)
> >     {
> >       _mm_stream_si64 (((long long int*)page) + i, 0);
> >     }
> >   sfence();
> > }
> > 
> > Tested:
> > 	Time to `mlock()` a 512GiB region on broadwell CPU
> > 				AVG time (s)	% imp.	ms/page
> > 	clear_page_erms		133.584		-	261
> > 	clear_page_nt		34.154		74.43%	67
> 
> Some macrobenchmark would be great too.
> 
> > An earlier version of this code was sent as an RFC patch ~July 2018
> > https://patchwork.kernel.org/patch/10543193/ but never merged.
> 
> Andi and I tried to use MOVNTI for large/gigantic page clearing back in
> 2012[1]. Maybe it can be useful.
> 
> That patchset is somewhat more complex trying to keep the memory around
> the fault address hot in cache. In theory it should help to reduce latency
> on the first access to the memory.
> 
> I was not able to get convincing numbers back then for the hardware of the
> time. Maybe it's better now.
> 
> https://lore.kernel.org/r/1345470757-12005-1-git-send-email-kirill.shutemov@linux.intel.com

Thanks for the reminder. I've had only a very vague recollection. Your
series had a much wider scope indeed. Since then we have gained
process_huge_page which tries to optimize normal huge pages.

Gigantic huge pages are a bit different. They are much less dynamic from
the usage POV in my experience. Micro-optimizations for the first access
tends to not matter at all as it is usually pre-allocation scenario. On
the other hand, speeding up the initialization sounds like a good thing
in general. It will be a single time benefit but if the additional code
is not hard to maintain then I would be inclined to take it even with
"artificial" numbers stated above. There really shouldn't be other downsides
except for the code maintenance, right?
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09  9:06   ` Michal Hocko
@ 2020-03-09  9:35     ` Kirill A. Shutemov
  2020-03-09 11:36     ` Kirill A. Shutemov
  2020-03-09 15:38     ` Andi Kleen
  2 siblings, 0 replies; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-09  9:35 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Cannon Matthews, Mike Kravetz, Andrew Morton, Matthew Wilcox,
	David Rientjes, Greg Thelen, Salman Qazi, linux-mm, linux-kernel,
	ak, x86

On Mon, Mar 09, 2020 at 10:06:30AM +0100, Michal Hocko wrote:
> On Mon 09-03-20 03:08:20, Kirill A. Shutemov wrote:
> > On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
> > > Reimplement clear_gigantic_page() to clear gigabytes pages using the
> > > non-temporal streaming store instructions that bypass the cache
> > > (movnti), since an entire 1GiB region will not fit in the cache anyway.
> > > 
> > > Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> > > average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> > > and optimizing the control flow over the constituent small pages, this
> > > can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> > > taking only 34 seconds on average, or 67ms/GiB.
> > > 
> > > The assembly code for the __clear_page_nt routine is more or less
> > > taken directly from the output of gcc with -O3 for this function with
> > > some tweaks to support arbitrary sizes and moving memory barriers:
> > > 
> > > void clear_page_nt_64i (void *page)
> > > {
> > >   for (int i = 0; i < GiB /sizeof(long long int); ++i)
> > >     {
> > >       _mm_stream_si64 (((long long int*)page) + i, 0);
> > >     }
> > >   sfence();
> > > }
> > > 
> > > Tested:
> > > 	Time to `mlock()` a 512GiB region on broadwell CPU
> > > 				AVG time (s)	% imp.	ms/page
> > > 	clear_page_erms		133.584		-	261
> > > 	clear_page_nt		34.154		74.43%	67
> > 
> > Some macrobenchmark would be great too.
> > 
> > > An earlier version of this code was sent as an RFC patch ~July 2018
> > > https://patchwork.kernel.org/patch/10543193/ but never merged.
> > 
> > Andi and I tried to use MOVNTI for large/gigantic page clearing back in
> > 2012[1]. Maybe it can be useful.
> > 
> > That patchset is somewhat more complex trying to keep the memory around
> > the fault address hot in cache. In theory it should help to reduce latency
> > on the first access to the memory.
> > 
> > I was not able to get convincing numbers back then for the hardware of the
> > time. Maybe it's better now.
> > 
> > https://lore.kernel.org/r/1345470757-12005-1-git-send-email-kirill.shutemov@linux.intel.com
> 
> Thanks for the reminder. I've had only a very vague recollection. Your
> series had a much wider scope indeed. Since then we have gained
> process_huge_page which tries to optimize normal huge pages.
> 
> Gigantic huge pages are a bit different. They are much less dynamic from
> the usage POV in my experience. Micro-optimizations for the first access
> tends to not matter at all as it is usually pre-allocation scenario. On
> the other hand, speeding up the initialization sounds like a good thing
> in general. It will be a single time benefit but if the additional code
> is not hard to maintain then I would be inclined to take it even with
> "artificial" numbers state above. There really shouldn't be other downsides
> except for the code maintenance, right?

I cannot think of any, no.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09  9:06   ` Michal Hocko
  2020-03-09  9:35     ` Kirill A. Shutemov
@ 2020-03-09 11:36     ` Kirill A. Shutemov
  2020-03-09 12:26       ` Michal Hocko
  2020-03-09 15:38     ` Andi Kleen
  2 siblings, 1 reply; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-09 11:36 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Cannon Matthews, Mike Kravetz, Andrew Morton, Matthew Wilcox,
	David Rientjes, Greg Thelen, Salman Qazi, linux-mm, linux-kernel,
	ak, x86

On Mon, Mar 09, 2020 at 10:06:30AM +0100, Michal Hocko wrote:
> On Mon 09-03-20 03:08:20, Kirill A. Shutemov wrote:
> > On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
> > > Reimplement clear_gigantic_page() to clear gigabytes pages using the
> > > non-temporal streaming store instructions that bypass the cache
> > > (movnti), since an entire 1GiB region will not fit in the cache anyway.
> > > 
> > > Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> > > average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> > > and optimizing the control flow over the constituent small pages, this
> > > can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> > > taking only 34 seconds on average, or 67ms/GiB.
> > > 
> > > The assembly code for the __clear_page_nt routine is more or less
> > > taken directly from the output of gcc with -O3 for this function with
> > > some tweaks to support arbitrary sizes and moving memory barriers:
> > > 
> > > void clear_page_nt_64i (void *page)
> > > {
> > >   for (int i = 0; i < GiB /sizeof(long long int); ++i)
> > >     {
> > >       _mm_stream_si64 (((long long int*)page) + i, 0);
> > >     }
> > >   sfence();
> > > }
> > > 
> > > Tested:
> > > 	Time to `mlock()` a 512GiB region on broadwell CPU
> > > 				AVG time (s)	% imp.	ms/page
> > > 	clear_page_erms		133.584		-	261
> > > 	clear_page_nt		34.154		74.43%	67
> > 
> > Some macrobenchmark would be great too.
> > 
> > > An earlier version of this code was sent as an RFC patch ~July 2018
> > > https://patchwork.kernel.org/patch/10543193/ but never merged.
> > 
> > Andi and I tried to use MOVNTI for large/gigantic page clearing back in
> > 2012[1]. Maybe it can be useful.
> > 
> > That patchset is somewhat more complex trying to keep the memory around
> > the fault address hot in cache. In theory it should help to reduce latency
> > on the first access to the memory.
> > 
> > I was not able to get convincing numbers back then for the hardware of the
> > time. Maybe it's better now.
> > 
> > https://lore.kernel.org/r/1345470757-12005-1-git-send-email-kirill.shutemov@linux.intel.com
> 
> Thanks for the reminder. I've had only a very vague recollection. Your
> series had a much wider scope indeed. Since then we have gained
> process_huge_page which tries to optimize normal huge pages.
> 
> Gigantic huge pages are a bit different. They are much less dynamic from
> the usage POV in my experience. Micro-optimizations for the first access
> tends to not matter at all as it is usually pre-allocation scenario.

The page got cleared not on reservation, but on allocation, including page
fault time. Keeping the page around the fault address can still be
beneficial.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09 11:36     ` Kirill A. Shutemov
@ 2020-03-09 12:26       ` Michal Hocko
  2020-03-09 18:01         ` Mike Kravetz
  0 siblings, 1 reply; 24+ messages in thread
From: Michal Hocko @ 2020-03-09 12:26 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cannon Matthews, Mike Kravetz, Andrew Morton, Matthew Wilcox,
	David Rientjes, Greg Thelen, Salman Qazi, linux-mm, linux-kernel,
	ak, x86

On Mon 09-03-20 14:36:58, Kirill A. Shutemov wrote:
> On Mon, Mar 09, 2020 at 10:06:30AM +0100, Michal Hocko wrote:
> > On Mon 09-03-20 03:08:20, Kirill A. Shutemov wrote:
> > > On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
> > > > Reimplement clear_gigantic_page() to clear gigabytes pages using the
> > > > non-temporal streaming store instructions that bypass the cache
> > > > (movnti), since an entire 1GiB region will not fit in the cache anyway.
> > > > 
> > > > Doing an mlock() on a 512GiB 1G-hugetlb region previously would take on
> > > > average 134 seconds, about 260ms/GiB which is quite slow. Using `movnti`
> > > > and optimizing the control flow over the constituent small pages, this
> > > > can be improved roughly by a factor of 3-4x, with the 512GiB mlock()
> > > > taking only 34 seconds on average, or 67ms/GiB.
> > > > 
> > > > The assembly code for the __clear_page_nt routine is more or less
> > > > taken directly from the output of gcc with -O3 for this function with
> > > > some tweaks to support arbitrary sizes and moving memory barriers:
> > > > 
> > > > void clear_page_nt_64i (void *page)
> > > > {
> > > >   for (int i = 0; i < GiB /sizeof(long long int); ++i)
> > > >     {
> > > >       _mm_stream_si64 (((long long int*)page) + i, 0);
> > > >     }
> > > >   sfence();
> > > > }
> > > > 
> > > > Tested:
> > > > 	Time to `mlock()` a 512GiB region on broadwell CPU
> > > > 				AVG time (s)	% imp.	ms/page
> > > > 	clear_page_erms		133.584		-	261
> > > > 	clear_page_nt		34.154		74.43%	67
> > > 
> > > Some macrobenchmark would be great too.
> > > 
> > > > An earlier version of this code was sent as an RFC patch ~July 2018
> > > > https://patchwork.kernel.org/patch/10543193/ but never merged.
> > > 
> > > Andi and I tried to use MOVNTI for large/gigantic page clearing back in
> > > 2012[1]. Maybe it can be useful.
> > > 
> > > That patchset is somewhat more complex trying to keep the memory around
> > > the fault address hot in cache. In theory it should help to reduce latency
> > > on the first access to the memory.
> > > 
> > > I was not able to get convincing numbers back then for the hardware of the
> > > time. Maybe it's better now.
> > > 
> > > https://lore.kernel.org/r/1345470757-12005-1-git-send-email-kirill.shutemov@linux.intel.com
> > 
> > Thanks for the reminder. I've had only a very vague recollection. Your
> > series had a much wider scope indeed. Since then we have gained
> > process_huge_page which tries to optimize normal huge pages.
> > 
> > Gigantic huge pages are a bit different. They are much less dynamic from
> > the usage POV in my experience. Micro-optimizations for the first access
> > tends to not matter at all as it is usually pre-allocation scenario.
> 
> The page got cleared not on reservation, but on allocation, including page
> fault time. Keeping the page around the fault address can still be
> beneficial.

You are right of course. What I meant to say is that the GB-page-backed
workloads I have seen tend to pre-allocate during startup, so they do
not rely on lazy initialization during #PF. This is slightly easier to
handle for a resource that is essentially impossible to get on-demand, so
an early failure is easier to handle.

If there are workloads which can benefit from page fault
micro-optimizations then all good, but this can be done on top and
demonstrated by numbers. It is much easier to demonstrate the speed
up on pre-initialization workloads. That's all I wanted to say here.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09  0:08 ` Kirill A. Shutemov
  2020-03-09  9:06   ` Michal Hocko
@ 2020-03-09 15:33   ` Andi Kleen
  1 sibling, 0 replies; 24+ messages in thread
From: Andi Kleen @ 2020-03-09 15:33 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cannon Matthews, Mike Kravetz, Andrew Morton, Matthew Wilcox,
	Michal Hocko, David Rientjes, Greg Thelen, Salman Qazi, linux-mm,
	linux-kernel, x86

> > Tested:
> > 	Time to `mlock()` a 512GiB region on broadwell CPU
> > 				AVG time (s)	% imp.	ms/page
> > 	clear_page_erms		133.584		-	261
> > 	clear_page_nt		34.154		74.43%	67
> 
> Some macrobenchmark would be great too.

Yes especially something that actually uses the memory. I suspect you'll need
something like the below

> That patchset is somewhat more complex trying to keep the memory around
> the fault address hot in cache. In theory it should help to reduce latency
> on the first access to the memory.

-Andi


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09  9:06   ` Michal Hocko
  2020-03-09  9:35     ` Kirill A. Shutemov
  2020-03-09 11:36     ` Kirill A. Shutemov
@ 2020-03-09 15:38     ` Andi Kleen
  2020-03-09 18:37       ` Matthew Wilcox
  2020-03-11 15:07       ` David Laight
  2 siblings, 2 replies; 24+ messages in thread
From: Andi Kleen @ 2020-03-09 15:38 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Kirill A. Shutemov, Cannon Matthews, Mike Kravetz, Andrew Morton,
	Matthew Wilcox, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86


> Gigantic huge pages are a bit different. They are much less dynamic from
> the usage POV in my experience. Micro-optimizations for the first access
> tends to not matter at all as it is usually pre-allocation scenario. On
> the other hand, speeding up the initialization sounds like a good thing
> in general. It will be a single time benefit but if the additional code
> is not hard to maintain then I would be inclined to take it even with
> "artificial" numbers state above. There really shouldn't be other downsides
> except for the code maintenance, right?

There's a cautionary tale of the old crappy RAID5 XOR assembler functions which
were optimized a long time ago for the Pentium 1, and stayed around
even though the compiler could actually do a better job.

String instructions are constantly improving in performance (Broadwell is
very old at this point). Most likely over time (and maybe even today
on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
(or maybe even AVX-*) to be competitive.

The best clear functions may also be different for different CPU
generations.

Using the string instructions has the advantage that all of this is abstracted
from the kernel.

-Andi


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09 12:26       ` Michal Hocko
@ 2020-03-09 18:01         ` Mike Kravetz
  0 siblings, 0 replies; 24+ messages in thread
From: Mike Kravetz @ 2020-03-09 18:01 UTC (permalink / raw)
  To: Michal Hocko, Kirill A. Shutemov
  Cc: Cannon Matthews, Andrew Morton, Matthew Wilcox, David Rientjes,
	Greg Thelen, Salman Qazi, linux-mm, linux-kernel, ak, x86

On 3/9/20 5:26 AM, Michal Hocko wrote:
> On Mon 09-03-20 14:36:58, Kirill A. Shutemov wrote:
>> On Mon, Mar 09, 2020 at 10:06:30AM +0100, Michal Hocko wrote:
>>> On Mon 09-03-20 03:08:20, Kirill A. Shutemov wrote:
>>>> On Fri, Mar 06, 2020 at 05:03:53PM -0800, Cannon Matthews wrote:
>>>>> Reimplement clear_gigantic_page() to clear gigabytes pages using the
>>>>> non-temporal streaming store instructions that bypass the cache
>>>>> (movnti), since an entire 1GiB region will not fit in the cache anyway.
>>>>>
>>> Gigantic huge pages are a bit different. They are much less dynamic from
>>> the usage POV in my experience. Micro-optimizations for the first access
>>> tends to not matter at all as it is usually pre-allocation scenario.
>>
>> The page got cleared not on reservation, but on allocation, including page
>> fault time. Keeping the page around the fault address can still be
>> beneficial.
> 
> You are right of course. What I meant to say that GB pages backed
> workloads I have seen tend to pre-allocate during the startup so they do
> not realy on lazy initialization duing #PF. This is slightly easier to
> handle for resource that is essentially impossible to get on-demand so
> an early failure is easier to handle.
> 
> If there are workloads which can benefit from page fault
> microptimizations then all good but this can be done on top and
> demonstrate by numbers. It is much more easier to demonstrate the speed
> up on pre-initialization workloads. That's all I wanted to say here.

I tend to agree that use of GB pages is generally going to be in workloads
that will do a one time setup/allocation/initialization of all the pages
it will use.  These improvements will help most in those workloads.

Workloads which include random faults should still benefit.  And, yes
the code could be made even better by taking the faulting address into
account.

I would be happy to see this go forward after addressing the few nits
mentioned by Andrew.  However, I myself can not comment with authority
on the low level x86 specific code.
-- 
Mike Kravetz


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09 15:38     ` Andi Kleen
@ 2020-03-09 18:37       ` Matthew Wilcox
  2020-03-11  0:21         ` Cannon Matthews
  2020-03-11 15:07       ` David Laight
  1 sibling, 1 reply; 24+ messages in thread
From: Matthew Wilcox @ 2020-03-09 18:37 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Michal Hocko, Kirill A. Shutemov, Cannon Matthews, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > Gigantic huge pages are a bit different. They are much less dynamic from
> > the usage POV in my experience. Micro-optimizations for the first access
> > tends to not matter at all as it is usually pre-allocation scenario. On
> > the other hand, speeding up the initialization sounds like a good thing
> > in general. It will be a single time benefit but if the additional code
> > is not hard to maintain then I would be inclined to take it even with
> > "artificial" numbers state above. There really shouldn't be other downsides
> > except for the code maintenance, right?
> 
> There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> were optimized a long time ago for the Pentium1, and stayed around,
> even though the compiler could actually do a better job.
> 
> String instructions are constantly improving in performance (Broadwell is
> very old at this point) Most likely over time (and maybe even today
> on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> (or maybe even AVX-*) to be competitive.

Presumably you have access to current and maybe even some unreleased
CPUs ... I mean, he's posted the patches, so you can test this hypothesis.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09 18:37       ` Matthew Wilcox
@ 2020-03-11  0:21         ` Cannon Matthews
  2020-03-11  0:54           ` Kirill A. Shutemov
  0 siblings, 1 reply; 24+ messages in thread
From: Cannon Matthews @ 2020-03-11  0:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Andi Kleen, Michal Hocko, Kirill A. Shutemov, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > the usage POV in my experience. Micro-optimizations for the first access
> > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > the other hand, speeding up the initialization sounds like a good thing
> > > in general. It will be a single time benefit but if the additional code
> > > is not hard to maintain then I would be inclined to take it even with
> > > "artificial" numbers state above. There really shouldn't be other downsides
> > > except for the code maintenance, right?
> >
> > There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> > were optimized a long time ago for the Pentium1, and stayed around,
> > even though the compiler could actually do a better job.
> >
> > String instructions are constantly improving in performance (Broadwell is
> > very old at this point) Most likely over time (and maybe even today
> > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > (or maybe even AVX-*) to be competitive.
>
> Presumably you have access to current and maybe even some unreleased
> CPUs ... I mean, he's posted the patches, so you can test this hypothesis.

I don't have the data at hand, but could reproduce it if strongly
desired. I've also tested this on Skylake and Cascade Lake, and we've
had success running with this for a while now.

When developing this originally, I tested all of this compared with
AVX-* instructions as well
as the string ops, they all seemed to be functionally equivalent, and
all were beat out by this
MOVNTI thing for large regions of 1G pages.

There is probably room to further optimize the MOVNTI stuff with
better loop unrolling or
optimizations, if anyone has specific suggestions I'm happy to try to
incorporate them, but
this has shown to be effective as written so far, and I think I lack
that assembly expertise to
micro optimize further on my own.

But just in general, while there are probably some ways this could be
made better, it does a
good job so far for the workloads that are more specific to 1G pages.

Making it work for 2MiB in a convincing general purpose way is a
harder problem and feels
out of scope, and further optimizations can always be added later on
for some other things.

I'm working on a v2 of this patch addressing some of the nits
mentioned by Andrew, should have
that hopefully soon.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11  0:21         ` Cannon Matthews
@ 2020-03-11  0:54           ` Kirill A. Shutemov
  2020-03-11  3:35             ` Arvind Sankar
  2020-03-16 10:18             ` Michal Hocko
  0 siblings, 2 replies; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-11  0:54 UTC (permalink / raw)
  To: Cannon Matthews
  Cc: Matthew Wilcox, Andi Kleen, Michal Hocko, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Tue, Mar 10, 2020 at 05:21:30PM -0700, Cannon Matthews wrote:
> On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > > the usage POV in my experience. Micro-optimizations for the first access
> > > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > > the other hand, speeding up the initialization sounds like a good thing
> > > > in general. It will be a single time benefit but if the additional code
> > > > is not hard to maintain then I would be inclined to take it even with
> > > > "artificial" numbers state above. There really shouldn't be other downsides
> > > > except for the code maintenance, right?
> > >
> > > There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> > > were optimized a long time ago for the Pentium1, and stayed around,
> > > even though the compiler could actually do a better job.
> > >
> > > String instructions are constantly improving in performance (Broadwell is
> > > very old at this point) Most likely over time (and maybe even today
> > > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > > (or maybe even AVX-*) to be competitive.
> >
> > Presumably you have access to current and maybe even some unreleased
> > CPUs ... I mean, he's posted the patches, so you can test this hypothesis.
> 
> I don't have the data at hand, but could reproduce it if strongly
> desired, but I've also tested this on skylake and  cascade lake, and
> we've had success running with this for a while now.
> 
> When developing this originally, I tested all of this compared with
> AVX-* instructions as well as the string ops, they all seemed to be
> functionally equivalent, and all were beat out by this MOVNTI thing for
> large regions of 1G pages.
> 
> There is probably room to further optimize the MOVNTI stuff with better
> loop unrolling or optimizations, if anyone has specific suggestions I'm
> happy to try to incorporate them, but this has shown to be effective as
> written so far, and I think I lack that assembly expertise to micro
> optimize further on my own.

Andi's point is that string instructions might be a better bet in the long
run. You may win something with MOVNTI on current CPUs, but it may become
a burden on newer microarchitectures as string instructions improve.
Nobody would realistically re-validate whether the MOVNTI micro-optimization
still makes sense for every new microarchitecture.

> 
> But just in general, while there are probably some ways this could be
> made better, it does a good job so far for the workloads that are more
> specific to 1G pages.
> 
> Making it work for 2MiB in a convincing general purpose way is a harder
> problem and feels out of scope, and further optimizations can always be
> added later on for some other things.
> 
> I'm working on a v2 of this patch addressing some of the nits mentioned
> by Andrew, should have that hopefully soon.

Have you got any data for a macrobenchmark?

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11  0:54           ` Kirill A. Shutemov
@ 2020-03-11  3:35             ` Arvind Sankar
  2020-03-11  8:16               ` Kirill A. Shutemov
  2020-03-16 10:18             ` Michal Hocko
  1 sibling, 1 reply; 24+ messages in thread
From: Arvind Sankar @ 2020-03-11  3:35 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Michal Hocko,
	Mike Kravetz, Andrew Morton, David Rientjes, Greg Thelen,
	Salman Qazi, linux-mm, linux-kernel, x86

On Wed, Mar 11, 2020 at 03:54:47AM +0300, Kirill A. Shutemov wrote:
> On Tue, Mar 10, 2020 at 05:21:30PM -0700, Cannon Matthews wrote:
> > On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > > > the usage POV in my experience. Micro-optimizations for the first access
> > > > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > > > the other hand, speeding up the initialization sounds like a good thing
> > > > > in general. It will be a single time benefit but if the additional code
> > > > > is not hard to maintain then I would be inclined to take it even with
> > > > > "artificial" numbers state above. There really shouldn't be other downsides
> > > > > except for the code maintenance, right?
> > > >
> > > > There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> > > > were optimized a long time ago for the Pentium1, and stayed around,
> > > > even though the compiler could actually do a better job.
> > > >
> > > > String instructions are constantly improving in performance (Broadwell is
> > > > very old at this point) Most likely over time (and maybe even today
> > > > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > > > (or maybe even AVX-*) to be competitive.
> > >
> > > Presumably you have access to current and maybe even some unreleased
> > > CPUs ... I mean, he's posted the patches, so you can test this hypothesis.
> > 
> > I don't have the data at hand, but could reproduce it if strongly
> > desired, but I've also tested this on skylake and  cascade lake, and
> > we've had success running with this for a while now.
> > 
> > When developing this originally, I tested all of this compared with
> > AVX-* instructions as well as the string ops, they all seemed to be
> > functionally equivalent, and all were beat out by this MOVNTI thing for
> > large regions of 1G pages.
> > 
> > There is probably room to further optimize the MOVNTI stuff with better
> > loop unrolling or optimizations, if anyone has specific suggestions I'm
> > happy to try to incorporate them, but this has shown to be effective as
> > written so far, and I think I lack that assembly expertise to micro
> > optimize further on my own.
> 
> Andi's point is that string instructions might be a better bet in a long
> run. You may win something with MOVNTI on current CPUs, but it may become
> a burden on newer microarchitectures when string instructions improves.
> Nobody realistically would re-validate if MOVNTI microoptimazation still
> make sense for every new microarchitecture.
>

The rationale for MOVNTI instruction is supposed to be that it avoids
cache pollution. Aside from the bench that shows MOVNTI to be faster for
the move itself, shouldn't it have an additional benefit in not trashing
the CPU caches?

As string instructions improve, why wouldn't the same improvements be
applied to MOVNTI?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11  3:35             ` Arvind Sankar
@ 2020-03-11  8:16               ` Kirill A. Shutemov
  2020-03-11 18:32                 ` Arvind Sankar
  0 siblings, 1 reply; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-11  8:16 UTC (permalink / raw)
  To: Arvind Sankar
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Michal Hocko,
	Mike Kravetz, Andrew Morton, David Rientjes, Greg Thelen,
	Salman Qazi, linux-mm, linux-kernel, x86

On Tue, Mar 10, 2020 at 11:35:54PM -0400, Arvind Sankar wrote:
> On Wed, Mar 11, 2020 at 03:54:47AM +0300, Kirill A. Shutemov wrote:
> > On Tue, Mar 10, 2020 at 05:21:30PM -0700, Cannon Matthews wrote:
> > > On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > > > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > > > > the usage POV in my experience. Micro-optimizations for the first access
> > > > > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > > > > the other hand, speeding up the initialization sounds like a good thing
> > > > > > in general. It will be a single time benefit but if the additional code
> > > > > > is not hard to maintain then I would be inclined to take it even with
> > > > > > "artificial" numbers state above. There really shouldn't be other downsides
> > > > > > except for the code maintenance, right?
> > > > >
> > > > > There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> > > > > were optimized a long time ago for the Pentium1, and stayed around,
> > > > > even though the compiler could actually do a better job.
> > > > >
> > > > > String instructions are constantly improving in performance (Broadwell is
> > > > > very old at this point) Most likely over time (and maybe even today
> > > > > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > > > > (or maybe even AVX-*) to be competitive.
> > > >
> > > > Presumably you have access to current and maybe even some unreleased
> > > > CPUs ... I mean, he's posted the patches, so you can test this hypothesis.
> > > 
> > > I don't have the data at hand, but could reproduce it if strongly
> > > desired, but I've also tested this on skylake and  cascade lake, and
> > > we've had success running with this for a while now.
> > > 
> > > When developing this originally, I tested all of this compared with
> > > AVX-* instructions as well as the string ops, they all seemed to be
> > > functionally equivalent, and all were beat out by this MOVNTI thing for
> > > large regions of 1G pages.
> > > 
> > > There is probably room to further optimize the MOVNTI stuff with better
> > > loop unrolling or optimizations, if anyone has specific suggestions I'm
> > > happy to try to incorporate them, but this has shown to be effective as
> > > written so far, and I think I lack that assembly expertise to micro
> > > optimize further on my own.
> > 
> > Andi's point is that string instructions might be a better bet in a long
> > run. You may win something with MOVNTI on current CPUs, but it may become
> > a burden on newer microarchitectures when string instructions improves.
> > Nobody realistically would re-validate if MOVNTI microoptimazation still
> > make sense for every new microarchitecture.
> >
> 
> The rationale for MOVNTI instruction is supposed to be that it avoids
> cache pollution. Aside from the bench that shows MOVNTI to be faster for
> the move itself, shouldn't it have an additional benefit in not trashing
> the CPU caches?
> 
> As string instructions improve, why wouldn't the same improvements be
> applied to MOVNTI?

String instructions are inherently more flexible. The implementation can
choose a caching strategy depending on the operation size (cx) and other
factors. For example, if the operation is large enough and the cache is full
of dirty cache lines that are expensive to free up, it can choose to bypass
the cache. MOVNTI is stricter in its semantics and more opaque to the CPU.

And more importantly, string instructions, unlike MOVNTI, are something that
compilers generate often and standard libraries use a lot. They are, and
will remain, a focus of optimization for CPU architects.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-09 15:38     ` Andi Kleen
  2020-03-09 18:37       ` Matthew Wilcox
@ 2020-03-11 15:07       ` David Laight
  1 sibling, 0 replies; 24+ messages in thread
From: David Laight @ 2020-03-11 15:07 UTC (permalink / raw)
  To: 'Andi Kleen', Michal Hocko
  Cc: Kirill A. Shutemov, Cannon Matthews, Mike Kravetz, Andrew Morton,
	Matthew Wilcox, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

From: Andi Kleen
> Sent: 09 March 2020 15:39
...
> There's a cautious tale of the old crappy RAID5 XOR assembler functions which
> were optimized a long time ago for the Pentium1, and stayed around,
> even though the compiler could actually do a better job.

Or the amd64 asm loop for doing the IP checksum.
I doubt it was even the fastest version when it was written.
A whole set of Intel cpus can run twice as fast as that version
with less loop unrolling (and associated code for 'odd' lengths).

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11  8:16               ` Kirill A. Shutemov
@ 2020-03-11 18:32                 ` Arvind Sankar
  2020-03-11 20:32                   ` Arvind Sankar
  2020-03-31  0:40                   ` Elliott, Robert (Servers)
  0 siblings, 2 replies; 24+ messages in thread
From: Arvind Sankar @ 2020-03-11 18:32 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Arvind Sankar, Cannon Matthews, Matthew Wilcox, Andi Kleen,
	Michal Hocko, Mike Kravetz, Andrew Morton, David Rientjes,
	Greg Thelen, Salman Qazi, linux-mm, linux-kernel, x86

On Wed, Mar 11, 2020 at 11:16:07AM +0300, Kirill A. Shutemov wrote:
> On Tue, Mar 10, 2020 at 11:35:54PM -0400, Arvind Sankar wrote:
> > 
> > The rationale for MOVNTI instruction is supposed to be that it avoids
> > cache pollution. Aside from the bench that shows MOVNTI to be faster for
> > the move itself, shouldn't it have an additional benefit in not trashing
> > the CPU caches?
> > 
> > As string instructions improve, why wouldn't the same improvements be
> > applied to MOVNTI?
> 
> String instructions inherently more flexible. Implementation can choose
> caching strategy depending on the operation size (cx) and other factors.
> Like if operation is large enough and cache is full of dirty cache lines
> that expensive to free up, it can choose to bypass cache. MOVNTI is more
> strict on semantics and more opaque to CPU.

But with today's processors, wouldn't writing 1G via the string
operations empty out almost the whole cache? Or are there already
optimizations to prevent one thread from hogging the L3?

If we do want to just use the string operations, it seems like the
clear_page routines should just call memset instead of duplicating it.

> 
> And more importantly string instructions, unlike MOVNTI, is something that
> generated often by compiler and used in standard libraries a lot. It is
> and will be focus of optimization of CPU architects.
> 
> -- 
>  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11 18:32                 ` Arvind Sankar
@ 2020-03-11 20:32                   ` Arvind Sankar
  2020-03-12  0:52                     ` Kirill A. Shutemov
  2020-03-31  0:40                   ` Elliott, Robert (Servers)
  1 sibling, 1 reply; 24+ messages in thread
From: Arvind Sankar @ 2020-03-11 20:32 UTC (permalink / raw)
  To: Arvind Sankar
  Cc: Kirill A. Shutemov, Cannon Matthews, Matthew Wilcox, Andi Kleen,
	Michal Hocko, Mike Kravetz, Andrew Morton, David Rientjes,
	Greg Thelen, Salman Qazi, linux-mm, linux-kernel, x86

On Wed, Mar 11, 2020 at 02:32:41PM -0400, Arvind Sankar wrote:
> On Wed, Mar 11, 2020 at 11:16:07AM +0300, Kirill A. Shutemov wrote:
> > On Tue, Mar 10, 2020 at 11:35:54PM -0400, Arvind Sankar wrote:
> > > 
> > > The rationale for MOVNTI instruction is supposed to be that it avoids
> > > cache pollution. Aside from the bench that shows MOVNTI to be faster for
> > > the move itself, shouldn't it have an additional benefit in not trashing
> > > the CPU caches?
> > > 
> > > As string instructions improve, why wouldn't the same improvements be
> > > applied to MOVNTI?
> > 
> > String instructions inherently more flexible. Implementation can choose
> > caching strategy depending on the operation size (cx) and other factors.
> > Like if operation is large enough and cache is full of dirty cache lines
> > that expensive to free up, it can choose to bypass cache. MOVNTI is more
> > strict on semantics and more opaque to CPU.
> 
> But with today's processors, wouldn't writing 1G via the string
> operations empty out almost the whole cache? Or are there already
> optimizations to prevent one thread from hogging the L3?

Also, currently the stringop is only done 4k at a time, so it would
likely not trigger any future cache-bypassing optimizations in any case.

> 
> If we do want to just use the string operations, it seems like the
> clear_page routines should just call memset instead of duplicating it.
> 
> > 
> > And more importantly string instructions, unlike MOVNTI, is something that
> > generated often by compiler and used in standard libraries a lot. It is
> > and will be focus of optimization of CPU architects.
> > 
> > -- 
> >  Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11 20:32                   ` Arvind Sankar
@ 2020-03-12  0:52                     ` Kirill A. Shutemov
  0 siblings, 0 replies; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-12  0:52 UTC (permalink / raw)
  To: Arvind Sankar
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Michal Hocko,
	Mike Kravetz, Andrew Morton, David Rientjes, Greg Thelen,
	Salman Qazi, linux-mm, linux-kernel, x86

On Wed, Mar 11, 2020 at 04:32:47PM -0400, Arvind Sankar wrote:
> On Wed, Mar 11, 2020 at 02:32:41PM -0400, Arvind Sankar wrote:
> > On Wed, Mar 11, 2020 at 11:16:07AM +0300, Kirill A. Shutemov wrote:
> > > On Tue, Mar 10, 2020 at 11:35:54PM -0400, Arvind Sankar wrote:
> > > > 
> > > > The rationale for MOVNTI instruction is supposed to be that it avoids
> > > > cache pollution. Aside from the bench that shows MOVNTI to be faster for
> > > > the move itself, shouldn't it have an additional benefit in not trashing
> > > > the CPU caches?
> > > > 
> > > > As string instructions improve, why wouldn't the same improvements be
> > > > applied to MOVNTI?
> > > 
> > > String instructions are inherently more flexible. Implementation can choose
> > > caching strategy depending on the operation size (cx) and other factors.
> > > Like if operation is large enough and cache is full of dirty cache lines
> > > that expensive to free up, it can choose to bypass cache. MOVNTI is more
> > > strict on semantics and more opaque to CPU.
> > 
> > But with today's processors, wouldn't writing 1G via the string
> > operations empty out almost the whole cache? Or are there already
> > optimizations to prevent one thread from hogging the L3?
> 
> Also, currently the stringop is only done 4k at a time, so it would
> likely not trigger any future cache-bypassing optimizations in any case.

What I tried to say is that we need to be careful with this kind of
optimizations. We need to see a sizable improvement on something beyond
microbenchmark, ideally across multiple CPU microarchitectures.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11  0:54           ` Kirill A. Shutemov
  2020-03-11  3:35             ` Arvind Sankar
@ 2020-03-16 10:18             ` Michal Hocko
  2020-03-16 12:19               ` Kirill A. Shutemov
  1 sibling, 1 reply; 24+ messages in thread
From: Michal Hocko @ 2020-03-16 10:18 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Wed 11-03-20 03:54:47, Kirill A. Shutemov wrote:
> On Tue, Mar 10, 2020 at 05:21:30PM -0700, Cannon Matthews wrote:
> > On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > > > the usage POV in my experience. Micro-optimizations for the first access
> > > > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > > > the other hand, speeding up the initialization sounds like a good thing
> > > > > in general. It will be a single time benefit but if the additional code
> > > > > is not hard to maintain then I would be inclined to take it even with
> > > > > "artificial" numbers state above. There really shouldn't be other downsides
> > > > > except for the code maintenance, right?
> > > >
> > > > There's a cautionary tale of the old crappy RAID5 XOR assembler functions which
> > > > were optimized a long time ago for the Pentium1, and stayed around,
> > > > even though the compiler could actually do a better job.
> > > >
> > > > String instructions are constantly improving in performance (Broadwell is
> > > > very old at this point) Most likely over time (and maybe even today
> > > > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > > > (or maybe even AVX-*) to be competitive.
> > >
> > > Presumably you have access to current and maybe even some unreleased
> > > CPUs ... I mean, he's posted the patches, so you can test this hypothesis.
> > 
> > I don't have the data at hand, but could reproduce it if strongly
> > desired, but I've also tested this on skylake and  cascade lake, and
> > we've had success running with this for a while now.
> > 
> > When developing this originally, I tested all of this compared with
> > AVX-* instructions as well as the string ops, they all seemed to be
> > functionally equivalent, and all were beat out by this MOVNTI thing for
> > large regions of 1G pages.
> > 
> > There is probably room to further optimize the MOVNTI stuff with better
> > loop unrolling or optimizations, if anyone has specific suggestions I'm
> > happy to try to incorporate them, but this has shown to be effective as
> > written so far, and I think I lack that assembly expertise to micro
> > optimize further on my own.
> 
> Andi's point is that string instructions might be a better bet in the long
> run. You may win something with MOVNTI on current CPUs, but it may become
> a burden on newer microarchitectures when string instructions improve.
> Nobody realistically would re-validate whether the MOVNTI micro-optimization
> still makes sense for every new microarchitecture.

While this might be true, isn't that easily solvable by the existing
ALTERNATIVE and cpu features framework? Can we have a feature bit to
tell that movnti is worthwhile for large data copy routines? Probably
something for x86 maintainers.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-16 10:18             ` Michal Hocko
@ 2020-03-16 12:19               ` Kirill A. Shutemov
  2020-03-26 19:46                 ` Matthew Wilcox
  0 siblings, 1 reply; 24+ messages in thread
From: Kirill A. Shutemov @ 2020-03-16 12:19 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Mon, Mar 16, 2020 at 11:18:56AM +0100, Michal Hocko wrote:
> On Wed 11-03-20 03:54:47, Kirill A. Shutemov wrote:
> > On Tue, Mar 10, 2020 at 05:21:30PM -0700, Cannon Matthews wrote:
> > > On Mon, Mar 9, 2020 at 11:37 AM Matthew Wilcox <willy@infradead.org> wrote:
> > > >
> > > > On Mon, Mar 09, 2020 at 08:38:31AM -0700, Andi Kleen wrote:
> > > > > > Gigantic huge pages are a bit different. They are much less dynamic from
> > > > > > the usage POV in my experience. Micro-optimizations for the first access
> > > > > > tends to not matter at all as it is usually pre-allocation scenario. On
> > > > > > the other hand, speeding up the initialization sounds like a good thing
> > > > > > in general. It will be a single time benefit but if the additional code
> > > > > > is not hard to maintain then I would be inclined to take it even with
> > > > > > "artificial" numbers state above. There really shouldn't be other downsides
> > > > > > except for the code maintenance, right?
> > > > >
> > > > > There's a cautionary tale of the old crappy RAID5 XOR assembler functions which
> > > > > were optimized a long time ago for the Pentium1, and stayed around,
> > > > > even though the compiler could actually do a better job.
> > > > >
> > > > > String instructions are constantly improving in performance (Broadwell is
> > > > > very old at this point) Most likely over time (and maybe even today
> > > > > on newer CPUs) you would need much more sophisticated unrolled MOVNTI variants
> > > > > (or maybe even AVX-*) to be competitive.
> > > >
> > > > Presumably you have access to current and maybe even some unreleased
> > > > CPUs ... I mean, he's posted the patches, so you can test this hypothesis.
> > > 
> > > I don't have the data at hand, but could reproduce it if strongly
> > > desired, but I've also tested this on skylake and  cascade lake, and
> > > we've had success running with this for a while now.
> > > 
> > > When developing this originally, I tested all of this compared with
> > > AVX-* instructions as well as the string ops, they all seemed to be
> > > functionally equivalent, and all were beat out by this MOVNTI thing for
> > > large regions of 1G pages.
> > > 
> > > There is probably room to further optimize the MOVNTI stuff with better
> > > loop unrolling or optimizations, if anyone has specific suggestions I'm
> > > happy to try to incorporate them, but this has shown to be effective as
> > > written so far, and I think I lack that assembly expertise to micro
> > > optimize further on my own.
> > 
> > Andi's point is that string instructions might be a better bet in the long
> > run. You may win something with MOVNTI on current CPUs, but it may become
> > a burden on newer microarchitectures when string instructions improve.
> > Nobody realistically would re-validate whether the MOVNTI micro-optimization
> > still makes sense for every new microarchitecture.
> 
> While this might be true, isn't that easily solvable by the existing
> ALTERNATIVE and cpu features framework? Can we have a feature bit to
> tell that movnti is worthwhile for large data copy routines? Probably
> something for x86 maintainers.

It still needs somebody to test which approach is better for the CPU.
See X86_FEATURE_REP_GOOD.

-- 
 Kirill A. Shutemov


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-16 12:19               ` Kirill A. Shutemov
@ 2020-03-26 19:46                 ` Matthew Wilcox
  0 siblings, 0 replies; 24+ messages in thread
From: Matthew Wilcox @ 2020-03-26 19:46 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Michal Hocko, Cannon Matthews, Andi Kleen, Mike Kravetz,
	Andrew Morton, David Rientjes, Greg Thelen, Salman Qazi,
	linux-mm, linux-kernel, x86

On Mon, Mar 16, 2020 at 03:19:55PM +0300, Kirill A. Shutemov wrote:
> On Mon, Mar 16, 2020 at 11:18:56AM +0100, Michal Hocko wrote:
> > While this might be true, isn't that easily solveable by the existing
> > ALTERNATIVE and cpu features framework. Can we have a feature bit to
> > tell that movnti is worthwile for large data copy routines. Probably
> > something for x86 maintainers.
> 
> It still needs somebody to test which approach is better for the CPU.
> See X86_FEATURE_REP_GOOD.

How about inverting the sense of the bit?  Set it on Broadwell-and-
earlier CPUs, where it definitely improves performance by a huge amount,
and new microarches can set it if it still outperforms rep mov?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* RE: [PATCH] mm: clear 1G pages with streaming stores on x86
  2020-03-11 18:32                 ` Arvind Sankar
  2020-03-11 20:32                   ` Arvind Sankar
@ 2020-03-31  0:40                   ` Elliott, Robert (Servers)
  1 sibling, 0 replies; 24+ messages in thread
From: Elliott, Robert (Servers) @ 2020-03-31  0:40 UTC (permalink / raw)
  To: Arvind Sankar, Kirill A. Shutemov
  Cc: Cannon Matthews, Matthew Wilcox, Andi Kleen, Michal Hocko,
	Mike Kravetz, Andrew Morton, David Rientjes, Greg Thelen,
	Salman Qazi, linux-mm, linux-kernel, x86



> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org <linux-kernel-
> owner@vger.kernel.org> On Behalf Of Arvind Sankar
> Sent: Wednesday, March 11, 2020 1:33 PM
> To: Kirill A. Shutemov <kirill@shutemov.name>
> Cc: Arvind Sankar <nivedita@alum.mit.edu>; Cannon Matthews
> <cannonmatthews@google.com>; Matthew Wilcox <willy@infradead.org>;
> Andi Kleen <ak@linux.intel.com>; Michal Hocko <mhocko@kernel.org>;
> Mike Kravetz <mike.kravetz@oracle.com>; Andrew Morton <akpm@linux-
> foundation.org>; David Rientjes <rientjes@google.com>; Greg Thelen
> <gthelen@google.com>; Salman Qazi <sqazi@google.com>; linux-
> mm@kvack.org; linux-kernel@vger.kernel.org; x86@kernel.org
> Subject: Re: [PATCH] mm: clear 1G pages with streaming stores on x86
> 
> On Wed, Mar 11, 2020 at 11:16:07AM +0300, Kirill A. Shutemov wrote:
> > On Tue, Mar 10, 2020 at 11:35:54PM -0400, Arvind Sankar wrote:
> > >
> > > The rationale for MOVNTI instruction is supposed to be that it
> avoids
> > > cache pollution. Aside from the bench that shows MOVNTI to be
> faster for
> > > the move itself, shouldn't it have an additional benefit in not
> trashing
> > > the CPU caches?
> > >
> > > As string instructions improve, why wouldn't the same
> improvements be
> > > applied to MOVNTI?
> >
> > String instructions inherently more flexible. Implementation can
> choose
> > caching strategy depending on the operation size (cx) and other
> factors.
> > Like if operation is large enough and cache is full of dirty cache
> lines
> > that expensive to free up, it can choose to bypass cache. MOVNTI is
> more
> > strict on semantics and more opaque to CPU.
> 
> But with today's processors, wouldn't writing 1G via the string
> operations empty out almost the whole cache? Or are there already
> optimizations to prevent one thread from hogging the L3?
> 
> If we do want to just use the string operations, it seems like the
> clear_page routines should just call memset instead of duplicating
> it.
> 

The last time I checked, glibc memcpy() chose non-temporal stores based
on transfer size, L3 cache size, and the number of cores.
For example, with glibc-2.216-16.fc27 (August 2017), on a Broadwell
system with E5-2699 36 cores 45 MiB L3 cache, non-temporal stores only
start to be used above 36 MiB.



^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, back to index

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-07  1:03 [PATCH] mm: clear 1G pages with streaming stores on x86 Cannon Matthews
2020-03-07 15:36 ` kbuild test robot
2020-03-07 22:06 ` Andrew Morton
2020-03-09  0:08 ` Kirill A. Shutemov
2020-03-09  9:06   ` Michal Hocko
2020-03-09  9:35     ` Kirill A. Shutemov
2020-03-09 11:36     ` Kirill A. Shutemov
2020-03-09 12:26       ` Michal Hocko
2020-03-09 18:01         ` Mike Kravetz
2020-03-09 15:38     ` Andi Kleen
2020-03-09 18:37       ` Matthew Wilcox
2020-03-11  0:21         ` Cannon Matthews
2020-03-11  0:54           ` Kirill A. Shutemov
2020-03-11  3:35             ` Arvind Sankar
2020-03-11  8:16               ` Kirill A. Shutemov
2020-03-11 18:32                 ` Arvind Sankar
2020-03-11 20:32                   ` Arvind Sankar
2020-03-12  0:52                     ` Kirill A. Shutemov
2020-03-31  0:40                   ` Elliott, Robert (Servers)
2020-03-16 10:18             ` Michal Hocko
2020-03-16 12:19               ` Kirill A. Shutemov
2020-03-26 19:46                 ` Matthew Wilcox
2020-03-11 15:07       ` David Laight
2020-03-09 15:33   ` Andi Kleen

Linux-mm Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-mm/0 linux-mm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-mm linux-mm/ https://lore.kernel.org/linux-mm \
		linux-mm@kvack.org
	public-inbox-index linux-mm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kvack.linux-mm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git