linux-kernel.vger.kernel.org archive mirror
* [PATCH v2 RESEND] x86: optimize memcpy_flushcache
       [not found]     ` <20180524182013.GA59755@redhat.com>
@ 2018-06-18 13:23       ` Mike Snitzer
  2018-06-21 14:31         ` Ingo Molnar
  0 siblings, 1 reply; 7+ messages in thread
From: Mike Snitzer @ 2018-06-18 13:23 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner
  Cc: Mikulas Patocka, Dan Williams, device-mapper development, X86 ML,
	linux-kernel

From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH v2] x86: optimize memcpy_flushcache

In the context of constant short length stores to persistent memory,
memcpy_flushcache suffers from a 2% performance degradation compared to
explicitly using the "movnti" instruction.

Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
movnti instruction with inline assembler.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 arch/x86/include/asm/string_64.h | 28 +++++++++++++++++++++++++++-
 arch/x86/lib/usercopy_64.c       |  4 ++--
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..aaba83478cdc 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -147,7 +147,33 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	if (__builtin_constant_p(cnt)) {
+		switch (cnt) {
+		case 4:
+			asm volatile("movntil %1, %0"
+				     : "=m" (*(u32 *)dst)
+				     : "r" (*(u32 *)src));
+			return;
+		case 8:
+			asm volatile("movntiq %1, %0"
+				     : "=m" (*(u64 *)dst)
+				     : "r" (*(u64 *)src));
+			return;
+		case 16:
+			asm volatile("movntiq %1, %0"
+				     : "=m" (*(u64 *)dst)
+				     : "r" (*(u64 *)src));
+			asm volatile("movntiq %1, %0"
+				     : "=m" (*(u64 *)(dst + 8))
+				     : "r" (*(u64 *)(src + 8)));
+			return;
+		}
+	}
+	__memcpy_flushcache(dst, src, cnt);
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..26f515aa3529 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 	return rc;
 }
 
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
 {
 	unsigned long dest = (unsigned long) _dst;
 	unsigned long source = (unsigned long) _src;
@@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
 		clean_cache_range((void *) dest, size);
 	}
 }
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
 
 void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 		size_t len)
-- 
2.15.0
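
For illustration, here is a minimal sketch of the kind of caller this inline
path targets. The structure and function names below are hypothetical (they
are not taken from the patch or from any in-tree driver); the point is that a
compile-time-constant length lets memcpy_flushcache() collapse into one or
two movnti stores, while the caller still needs a write fence because
non-temporal stores are weakly ordered:

/* Hypothetical 16-byte metadata update to persistent memory. */
#include <linux/string.h>	/* memcpy_flushcache() */
#include <linux/types.h>	/* u64 */
#include <asm/barrier.h>	/* wmb() */

struct example_entry {		/* hypothetical on-media layout, 16 bytes */
	u64 seq;
	u64 block;
};

static void example_commit_entry(struct example_entry *pmem_slot,
				 u64 seq, u64 block)
{
	struct example_entry e = { .seq = seq, .block = block };

	/* cnt == 16 is a compile-time constant: two movntiq stores. */
	memcpy_flushcache(pmem_slot, &e, sizeof(e));

	/*
	 * Non-temporal stores are weakly ordered; fence before anything
	 * that depends on this entry is made visible.
	 */
	wmb();
}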



* Re: [PATCH v2 RESEND] x86: optimize memcpy_flushcache
  2018-06-18 13:23       ` [PATCH v2 RESEND] x86: optimize memcpy_flushcache Mike Snitzer
@ 2018-06-21 14:31         ` Ingo Molnar
  2018-06-22  1:19           ` Mikulas Patocka
  0 siblings, 1 reply; 7+ messages in thread
From: Ingo Molnar @ 2018-06-21 14:31 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Thomas Gleixner, Mikulas Patocka, Dan Williams,
	device-mapper development, X86 ML, linux-kernel


* Mike Snitzer <snitzer@redhat.com> wrote:

> From: Mikulas Patocka <mpatocka@redhat.com>
> Subject: [PATCH v2] x86: optimize memcpy_flushcache
> 
> In the context of constant short length stores to persistent memory,
> memcpy_flushcache suffers from a 2% performance degradation compared to
> explicitly using the "movnti" instruction.
> 
> Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> movnti instruction with inline assembler.

Linus requested asm optimizations to include actual benchmarks, so it would be 
nice to describe how this was tested, on what hardware, and what the before/after 
numbers are.

Thanks,

	Ingo


* Re: [PATCH v2 RESEND] x86: optimize memcpy_flushcache
  2018-06-21 14:31         ` Ingo Molnar
@ 2018-06-22  1:19           ` Mikulas Patocka
  2018-06-22  1:30             ` Ingo Molnar
  0 siblings, 1 reply; 7+ messages in thread
From: Mikulas Patocka @ 2018-06-22  1:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Snitzer, Thomas Gleixner, Dan Williams,
	device-mapper development, X86 ML, linux-kernel



On Thu, 21 Jun 2018, Ingo Molnar wrote:

> 
> * Mike Snitzer <snitzer@redhat.com> wrote:
> 
> > From: Mikulas Patocka <mpatocka@redhat.com>
> > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> > 
> > In the context of constant short length stores to persistent memory,
> > memcpy_flushcache suffers from a 2% performance degradation compared to
> > explicitly using the "movnti" instruction.
> > 
> > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > movnti instruction with inline assembler.
> 
> Linus requested asm optimizations to include actual benchmarks, so it would be 
> nice to describe how this was tested, on what hardware, and what the before/after 
> numbers are.
> 
> Thanks,
> 
> 	Ingo

It was tested on a 4-core Skylake machine, with persistent memory
emulated using the memmap kernel option. The dm-writecache target used
the emulated persistent memory as a cache and a SATA SSD as the backing
device. The patch results in a 2% throughput improvement when writing
data using dd.

I don't have access to the machine anymore.

Mikulas


* Re: [PATCH v2 RESEND] x86: optimize memcpy_flushcache
  2018-06-22  1:19           ` Mikulas Patocka
@ 2018-06-22  1:30             ` Ingo Molnar
  2018-08-08 21:22               ` [PATCH v3 " Mikulas Patocka
  0 siblings, 1 reply; 7+ messages in thread
From: Ingo Molnar @ 2018-06-22  1:30 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Thomas Gleixner, Dan Williams,
	device-mapper development, X86 ML, linux-kernel


* Mikulas Patocka <mpatocka@redhat.com> wrote:

> On Thu, 21 Jun 2018, Ingo Molnar wrote:
> 
> > 
> > * Mike Snitzer <snitzer@redhat.com> wrote:
> > 
> > > From: Mikulas Patocka <mpatocka@redhat.com>
> > > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> > > 
> > > In the context of constant short length stores to persistent memory,
> > > memcpy_flushcache suffers from a 2% performance degradation compared to
> > > explicitly using the "movnti" instruction.
> > > 
> > > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > > movnti instruction with inline assembler.
> > 
> > Linus requested asm optimizations to include actual benchmarks, so it would be 
> > nice to describe how this was tested, on what hardware, and what the before/after 
> > numbers are.
> > 
> > Thanks,
> > 
> > 	Ingo
> 
> It was tested on a 4-core Skylake machine, with persistent memory
> emulated using the memmap kernel option. The dm-writecache target used
> the emulated persistent memory as a cache and a SATA SSD as the backing
> device. The patch results in a 2% throughput improvement when writing
> data using dd.
> 
> I don't have access to the machine anymore.

I think this information is enough, but do we know how well memmap emulation
represents true persistent memory speed and cache management characteristics?
It might be representative - but I don't know for sure, and probably neither
do most readers of the changelog.

So could you please put all this into an updated changelog, and also add a short 
description that outlines exactly which codepaths end up using this method in a 
typical persistent memory setup? All filesystem ops - or only reads, etc?

Thanks,

	Ingo


* [PATCH v3 RESEND] x86: optimize memcpy_flushcache
  2018-06-22  1:30             ` Ingo Molnar
@ 2018-08-08 21:22               ` Mikulas Patocka
  2018-09-10 13:18                 ` Ingo Molnar
  2018-09-11  6:22                 ` [tip:x86/asm] x86/asm: Optimize memcpy_flushcache() tip-bot for Mikulas Patocka
  0 siblings, 2 replies; 7+ messages in thread
From: Mikulas Patocka @ 2018-08-08 21:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Mike Snitzer, Thomas Gleixner, Dan Williams,
	device-mapper development, X86 ML, linux-kernel



On Fri, 22 Jun 2018, Ingo Molnar wrote:

> 
> * Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > On Thu, 21 Jun 2018, Ingo Molnar wrote:
> > 
> > > 
> > > * Mike Snitzer <snitzer@redhat.com> wrote:
> > > 
> > > > From: Mikulas Patocka <mpatocka@redhat.com>
> > > > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> > > > 
> > > > In the context of constant short length stores to persistent memory,
> > > > memcpy_flushcache suffers from a 2% performance degradation compared to
> > > > explicitly using the "movnti" instruction.
> > > > 
> > > > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > > > movnti instruction with inline assembler.
> > > 
> > > Linus requested asm optimizations to include actual benchmarks, so it would be 
> > > nice to describe how this was tested, on what hardware, and what the before/after 
> > > numbers are.
> > > 
> > > Thanks,
> > > 
> > > 	Ingo
> > 
> > It was tested on a 4-core Skylake machine, with persistent memory
> > emulated using the memmap kernel option. The dm-writecache target used
> > the emulated persistent memory as a cache and a SATA SSD as the backing
> > device. The patch results in a 2% throughput improvement when writing
> > data using dd.
> > 
> > I don't have access to the machine anymore.
> 
> I think this information is enough, but do we know how well memmap emulation 
> represents true persistent memory speed and cache management characteristics?
> It might be representative - but I don't know for sure, and probably neither
> do most readers of the changelog.
> 
> So could you please put all this into an updated changelog, and also add a short 
> description that outlines exactly which codepaths end up using this method in a 
> typical persistent memory setup? All filesystem ops - or only reads, etc?
> 
> Thanks,
> 
> 	Ingo

Here I resend it:


From: Mikulas Patocka <mpatocka@redhat.com>
Subject: [PATCH] x86: optimize memcpy_flushcache

I use memcpy_flushcache in my persistent memory driver for metadata
updates; there are many 8-byte and 16-byte updates, and it turns out that
the overhead of memcpy_flushcache causes a 2% performance degradation
compared to the "movnti" instruction explicitly coded using inline assembler.

The tests were done on a Skylake processor with persistent memory emulated
using the "memmap" kernel parameter. dd was used to copy data to the
dm-writecache target.

This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 arch/x86/include/asm/string_64.h |   20 +++++++++++++++++++-
 arch/x86/lib/usercopy_64.c       |    4 ++--
 2 files changed, 21 insertions(+), 3 deletions(-)

Index: linux-2.6/arch/x86/include/asm/string_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/string_64.h
+++ linux-2.6/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	if (__builtin_constant_p(cnt)) {
+		switch (cnt) {
+			case 4:
+				asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+				return;
+			case 8:
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+				return;
+			case 16:
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+				return;
+		}
+	}
+	__memcpy_flushcache(dst, src, cnt);
+}
 #endif
 
 #endif /* __KERNEL__ */
Index: linux-2.6/arch/x86/lib/usercopy_64.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/usercopy_64.c
+++ linux-2.6/arch/x86/lib/usercopy_64.c
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, c
 	return rc;
 }
 
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
 {
 	unsigned long dest = (unsigned long) _dst;
 	unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const
 		clean_cache_range((void *) dest, size);
 	}
 }
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
 
 void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 		size_t len)
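
To illustrate which call shapes take the new inline path versus the
out-of-line copy, consider a hypothetical helper (not part of the patch):

/* Hypothetical helper: constant vs. runtime-variable lengths. */
#include <linux/string.h>	/* memcpy_flushcache() */
#include <linux/types.h>	/* u64, size_t */

static void example_copies(void *dst, const void *src, size_t runtime_len)
{
	/* Literal 8: __builtin_constant_p(cnt) is true -> one movntiq. */
	memcpy_flushcache(dst, src, 8);

	/* sizeof() folds to the constant 16 -> two movntiq stores. */
	memcpy_flushcache(dst, src, sizeof(u64[2]));

	/*
	 * Length known only at run time -> falls back to the exported
	 * __memcpy_flushcache() in arch/x86/lib/usercopy_64.c.
	 */
	memcpy_flushcache(dst, src, runtime_len);
}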


* Re: [PATCH v3 RESEND] x86: optimize memcpy_flushcache
  2018-08-08 21:22               ` [PATCH v3 " Mikulas Patocka
@ 2018-09-10 13:18                 ` Ingo Molnar
  2018-09-11  6:22                 ` [tip:x86/asm] x86/asm: Optimize memcpy_flushcache() tip-bot for Mikulas Patocka
  1 sibling, 0 replies; 7+ messages in thread
From: Ingo Molnar @ 2018-09-10 13:18 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Mike Snitzer, Thomas Gleixner, Dan Williams,
	device-mapper development, X86 ML, linux-kernel


* Mikulas Patocka <mpatocka@redhat.com> wrote:

> Here I resend it:
> 
> 
> From: Mikulas Patocka <mpatocka@redhat.com>
> Subject: [PATCH] x86: optimize memcpy_flushcache
> 
> I use memcpy_flushcache in my persistent memory driver for metadata
> updates; there are many 8-byte and 16-byte updates, and it turns out that
> the overhead of memcpy_flushcache causes a 2% performance degradation
> compared to the "movnti" instruction explicitly coded using inline assembler.
> 
> The tests were done on a Skylake processor with persistent memory emulated
> using the "memmap" kernel parameter. dd was used to copy data to the
> dm-writecache target.
> 
> This patch recognizes memcpy_flushcache calls with constant short length
> and turns them into inline assembler - so that I don't have to use inline
> assembler in the driver.
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> 
> ---
>  arch/x86/include/asm/string_64.h |   20 +++++++++++++++++++-
>  arch/x86/lib/usercopy_64.c       |    4 ++--
>  2 files changed, 21 insertions(+), 3 deletions(-)

Applied to tip:x86/asm, thanks!

I'll push it out later today after some testing.

Thanks,

	Ingo


* [tip:x86/asm] x86/asm: Optimize memcpy_flushcache()
  2018-08-08 21:22               ` [PATCH v3 " Mikulas Patocka
  2018-09-10 13:18                 ` Ingo Molnar
@ 2018-09-11  6:22                 ` tip-bot for Mikulas Patocka
  1 sibling, 0 replies; 7+ messages in thread
From: tip-bot for Mikulas Patocka @ 2018-09-11  6:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, hpa, peterz, tglx, mpatocka, snitzer, linux-kernel,
	torvalds, dan.j.williams, dm-devel

Commit-ID:  02101c45ec5b19d607af7372680f5259050b4e9c
Gitweb:     https://git.kernel.org/tip/02101c45ec5b19d607af7372680f5259050b4e9c
Author:     Mikulas Patocka <mpatocka@redhat.com>
AuthorDate: Wed, 8 Aug 2018 17:22:16 -0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Sep 2018 15:17:12 +0200

x86/asm: Optimize memcpy_flushcache()

I use memcpy_flushcache() in my persistent memory driver for metadata
updates; there are many 8-byte and 16-byte updates, and it turns out that
the overhead of memcpy_flushcache causes a 2% performance degradation
compared to the "movnti" instruction explicitly coded using inline assembler.

The tests were done on a Skylake processor with persistent memory emulated
using the "memmap" kernel parameter. dd was used to copy data to the
dm-writecache target.

This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: device-mapper development <dm-devel@redhat.com>
Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1808081720460.24747@file01.intranet.prod.int.rdu2.redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
 arch/x86/lib/usercopy_64.c       |  4 ++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index d33f92b9fa22..7ad41bfcc16c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	if (__builtin_constant_p(cnt)) {
+		switch (cnt) {
+			case 4:
+				asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+				return;
+			case 8:
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+				return;
+			case 16:
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+				asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+				return;
+		}
+	}
+	__memcpy_flushcache(dst, src, cnt);
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 9c5606d88f61..c50a1d815a37 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 	return rc;
 }
 
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
 {
 	unsigned long dest = (unsigned long) _dst;
 	unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
 		clean_cache_range((void *) dest, size);
 	}
 }
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
 
 void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 		size_t len)


end of thread

Thread overview: 7 messages
     [not found] <20180519052503.325953342@debian.vm>
     [not found] ` <20180519052631.730455475@debian.vm>
     [not found]   ` <CAPcyv4jpY0x7kZtT+afAKnHKy8Uy1AC_N_QM7RrELSj_0iNrRw@mail.gmail.com>
     [not found]     ` <20180524182013.GA59755@redhat.com>
2018-06-18 13:23       ` [PATCH v2 RESEND] x86: optimize memcpy_flushcache Mike Snitzer
2018-06-21 14:31         ` Ingo Molnar
2018-06-22  1:19           ` Mikulas Patocka
2018-06-22  1:30             ` Ingo Molnar
2018-08-08 21:22               ` [PATCH v3 " Mikulas Patocka
2018-09-10 13:18                 ` Ingo Molnar
2018-09-11  6:22                 ` [tip:x86/asm] x86/asm: Optimize memcpy_flushcache() tip-bot for Mikulas Patocka
