From: Nadav Amit <namit@vmware.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	Thomas Gleixner <tglx@linutronix.de>,
	Thomas Garnier <thgarnie@chromium.org>,
	Ingo Molnar <mingo@redhat.com>, Nadav Amit <namit@vmware.com>
Subject: [PATCH 3/7] x86/percpu: Use C for percpu accesses when possible
Date: Fri, 23 Aug 2019 15:44:20 -0700
Message-ID: <20190823224424.15296-4-namit@vmware.com>
In-Reply-To: <20190823224424.15296-1-namit@vmware.com>

The percpu code mostly uses inline assembly. Using segment qualifiers
allows C code to be used instead, which enables the compiler to perform
various optimizations (e.g., CSE). For example, in __schedule() the
following two instructions:

  mov    %gs:0x7e5f1eff(%rip),%edx        # 0x10350 <cpu_number>
  movslq %edx,%rdx

turn with this patch into:

  movslq %gs:0x7e5f2e6e(%rip),%rax        # 0x10350 <cpu_number>
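
For illustration only (not part of the patch), here is a minimal sketch of
what such an access looks like in C, assuming GCC's __seg_gs named address
space; the identifiers below are placeholders rather than the kernel's
actual definitions:

  /* Hypothetical sketch of a %gs-relative per-CPU load written in C. */
  typedef __seg_gs int gs_int;

  extern int cpu_number;		/* placeholder per-CPU variable */

  static inline long read_cpu_number(void)
  {
          /*
           * The compiler sees an ordinary load, so it can merge the load
           * with the sign extension into a single movslq and eliminate
           * redundant reads (CSE), which the inline-asm form prevents.
           */
          return *(gs_int *)&cpu_number;
  }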

In addition, operations that have no guarantee against concurrent
interrupts or preemption, such as __this_cpu_cmpxchg(), can be further
optimized by the compiler when they are implemented in C, for example
in call_timer_fn().
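
As a rough illustration of that second point (again outside the patch,
mirroring the plain-C __raw_cpu_cmpxchg() fallback added below), a
compare-and-store written in C leaves the compiler freedom that an asm
cmpxchg does not:

  /*
   * Sketch of a cmpxchg that gives no atomicity guarantee: because it is
   * plain C, the compiler may fold the comparison into surrounding control
   * flow, keep the value in a register, or drop the store when the result
   * is provably unchanged.
   */
  static inline int sketch_cmpxchg(int *p, int old, int new)
  {
          int ret = *p;

          if (ret == old)
                  *p = new;
          return ret;
  }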

Signed-off-by: Nadav Amit <namit@vmware.com>
---
 arch/x86/include/asm/percpu.h | 115 +++++++++++++++++++++++++++++++---
 1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 1fe348884477..13987f9bc82f 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -439,13 +439,88 @@ do {									\
  */
 #define this_cpu_read_stable(var)	percpu_stable_op("mov", var)
 
+#if USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(qual, pcp)					\
+({									\
+	*(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));		\
+})
+
+#define __raw_cpu_write(qual, pcp, val)					\
+	do {								\
+		*(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+	} while (0)
+
+/*
+ * Performance-wise, C operations are only more efficient than their inline
+ * assembly counterparts for non-volatile variables (__this_*) and for volatile
+ * loads and stores.
+ *
+ * Since we do not use assembly, we are free to define 64-bit operations
+ * on 32-bit architectures.
+ */
+#define __raw_cpu_add(pcp, val)		do { __my_cpu_var(pcp) += (val); } while (0)
+#define __raw_cpu_and(pcp, val)		do { __my_cpu_var(pcp) &= (val); } while (0)
+#define __raw_cpu_or(pcp, val)		do { __my_cpu_var(pcp) |= (val); } while (0)
+#define __raw_cpu_add_return(pcp, val)	({ __my_cpu_var(pcp) += (val); })
+
+#define __raw_cpu_xchg(pcp, val)					\
+({									\
+	typeof(pcp) pxo_ret__ = __my_cpu_var(pcp);			\
+									\
+	__my_cpu_var(pcp) = (val);					\
+	pxo_ret__;							\
+})
+
+#define __raw_cpu_cmpxchg(pcp, oval, nval)				\
+({									\
+	__my_cpu_type(pcp) *__p = __my_cpu_ptr(&(pcp));			\
+									\
+	typeof(pcp) __ret = *__p;					\
+									\
+	if (__ret == (oval))						\
+		*__p = nval;						\
+	__ret;								\
+})
+
+#define raw_cpu_read_1(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_read_2(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_read_4(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_write_1(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_write_2(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_write_4(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_add_1(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_add_2(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_add_4(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_and_1(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_and_2(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_and_4(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_or_1(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_or_2(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_or_4(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_xchg_1(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_xchg_2(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_xchg_4(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_add_return_1(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_2(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_4(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_8(pcp, val)		__raw_cpu_add_return(pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval)	__raw_cpu_cmpxchg(pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval)	__raw_cpu_cmpxchg(pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval)	__raw_cpu_cmpxchg(pcp, oval, nval)
+
+#define this_cpu_read_1(pcp)		__raw_cpu_read(volatile, pcp)
+#define this_cpu_read_2(pcp)		__raw_cpu_read(volatile, pcp)
+#define this_cpu_read_4(pcp)		__raw_cpu_read(volatile, pcp)
+#define this_cpu_write_1(pcp, val)	__raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_2(pcp, val)	__raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_4(pcp, val)	__raw_cpu_write(volatile, pcp, val)
+
+#else
 #define raw_cpu_read_1(pcp)		percpu_from_op(, "mov", pcp)
 #define raw_cpu_read_2(pcp)		percpu_from_op(, "mov", pcp)
 #define raw_cpu_read_4(pcp)		percpu_from_op(, "mov", pcp)
 
-#define raw_cpu_write_1(pcp, val)	percpu_to_op(, "mov", (pcp), val)
-#define raw_cpu_write_2(pcp, val)	percpu_to_op(, "mov", (pcp), val)
-#define raw_cpu_write_4(pcp, val)	percpu_to_op(, "mov", (pcp), val)
 #define raw_cpu_add_1(pcp, val)		percpu_add_op(, (pcp), val)
 #define raw_cpu_add_2(pcp, val)		percpu_add_op(, (pcp), val)
 #define raw_cpu_add_4(pcp, val)		percpu_add_op(, (pcp), val)
@@ -477,6 +552,14 @@ do {									\
 #define this_cpu_write_1(pcp, val)	percpu_to_op(volatile, "mov", (pcp), val)
 #define this_cpu_write_2(pcp, val)	percpu_to_op(volatile, "mov", (pcp), val)
 #define this_cpu_write_4(pcp, val)	percpu_to_op(volatile, "mov", (pcp), val)
+
+#define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(, pcp, val)
+#define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(, pcp, val)
+#define raw_cpu_add_return_4(pcp, val)		percpu_add_return_op(, pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
+#endif
 #define this_cpu_add_1(pcp, val)	percpu_add_op(volatile, (pcp), val)
 #define this_cpu_add_2(pcp, val)	percpu_add_op(volatile, (pcp), val)
 #define this_cpu_add_4(pcp, val)	percpu_add_op(volatile, (pcp), val)
@@ -490,13 +573,6 @@ do {									\
 #define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(volatile, pcp, nval)
 #define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(volatile, pcp, nval)
 
-#define raw_cpu_add_return_1(pcp, val)		percpu_add_return_op(, pcp, val)
-#define raw_cpu_add_return_2(pcp, val)		percpu_add_return_op(, pcp, val)
-#define raw_cpu_add_return_4(pcp, val)		percpu_add_return_op(, pcp, val)
-#define raw_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
-#define raw_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
-#define raw_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(, pcp, oval, nval)
-
 #define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(volatile, pcp, val)
 #define this_cpu_add_return_4(pcp, val)		percpu_add_return_op(volatile, pcp, val)
@@ -527,6 +603,22 @@ do {									\
  * 32 bit must fall back to generic operations.
  */
 #ifdef CONFIG_X86_64
+
+#if USE_X86_SEG_SUPPORT
+
+#define raw_cpu_read_8(pcp)			__raw_cpu_read(, pcp)
+#define raw_cpu_write_8(pcp, val)		__raw_cpu_write(, pcp, val)
+#define raw_cpu_add_8(pcp, val)			__raw_cpu_add(pcp, val)
+#define raw_cpu_and_8(pcp, val)			__raw_cpu_and(pcp, val)
+#define raw_cpu_or_8(pcp, val)			__raw_cpu_or(pcp, val)
+#define raw_cpu_xchg_8(pcp, nval)		__raw_cpu_xchg(pcp, nval)
+#define raw_cpu_cmpxchg_8(pcp, oval, nval)	__raw_cpu_cmpxchg(pcp, oval, nval)
+
+#define this_cpu_read_8(pcp)			__raw_cpu_read(volatile, pcp)
+#define this_cpu_write_8(pcp, val)		__raw_cpu_write(volatile, pcp, val)
+
+#else
+
 #define raw_cpu_read_8(pcp)			percpu_from_op(, "mov", pcp)
 #define raw_cpu_write_8(pcp, val)		percpu_to_op(, "mov", (pcp), val)
 #define raw_cpu_add_8(pcp, val)			percpu_add_op(, (pcp), val)
@@ -538,6 +630,9 @@ do {									\
 
 #define this_cpu_read_8(pcp)			percpu_from_op(volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)		percpu_to_op(volatile, "mov", (pcp), val)
+
+#endif
+
 #define this_cpu_add_8(pcp, val)		percpu_add_op(volatile, (pcp), val)
 #define this_cpu_and_8(pcp, val)		percpu_to_op(volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)			percpu_to_op(volatile, "or", (pcp), val)
-- 
2.17.1


Thread overview: 8+ messages
2019-08-23 22:44 [PATCH 0/7] x86/percpu: Use segment qualifiers Nadav Amit
2019-08-23 22:44 ` [PATCH 1/7] compiler: Report x86 segment support Nadav Amit
2019-08-23 22:44 ` [PATCH 2/7] x86/percpu: Use compiler segment prefix qualifier Nadav Amit
2019-08-23 22:44 ` [PATCH 3/7] x86/percpu: Use C for percpu accesses when possible Nadav Amit [this message]
2019-08-23 22:44 ` [PATCH 4/7] x86: Fix possible caching of current_task Nadav Amit
2019-08-23 22:44 ` [PATCH 5/7] percpu: Assume preemption is disabled on per_cpu_ptr() Nadav Amit
2019-08-23 22:44 ` [PATCH 6/7] x86/percpu: Optimized arch_raw_cpu_ptr() Nadav Amit
2019-08-23 22:44 ` [PATCH 7/7] x86/current: Aggressive caching of current Nadav Amit
