From: Luca Barbieri <luca@luca-barbieri.com>
To: mingo@elte.hu
Cc: hpa@zytor.com, a.p.zijlstra@chello.nl, akpm@linux-foundation.org,
	linux-kernel@vger.kernel.org,
	Luca Barbieri <luca@luca-barbieri.com>
Subject: [PATCH 09/10] x86-32: use SSE for atomic64_read/set if available
Date: Wed, 17 Feb 2010 12:42:41 +0100
Message-ID: <1266406962-17463-10-git-send-email-luca@luca-barbieri.com>
In-Reply-To: <1266406962-17463-1-git-send-email-luca@luca-barbieri.com>

This patch uses SSE movlps to perform 64-bit atomic reads and writes.

According to the Intel manuals, all aligned 64-bit reads and writes are
atomic, which should include those performed by movlps.
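
For illustration, the core of the read path is a single aligned 8-byte
load done through %xmm0 (a sketch only, given some atomic64_t *v; the
real code below also saves/restores %xmm0 and handles CR0.TS):

	long long r;

	asm volatile("movlps %1, %%xmm0\n\t"	/* one atomic 64-bit load */
		     "movlps %%xmm0, %0"	/* spill the value to r */
		     : "=m" (r)
		     : "m" (v->counter));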

To do this, we need to disable preemption, clear TS with clts if it
was set, and restore TS afterwards.
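
In code, both new functions follow this pattern (condensed from the
patch below):

	unsigned long cr0 = 0;

	preempt_disable();
	if (!(current_thread_info()->status & TS_USEDFPU)) {
		cr0 = read_cr0();
		if (cr0 & X86_CR0_TS)
			clts();		/* let SSE run without faulting */
	}
	/* ... save %xmm0, do the movlps access, restore %xmm0 ... */
	if (cr0 & X86_CR0_TS)
		write_cr0(cr0);		/* restore TS as we found it */
	preempt_enable();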

If we don't need to change TS, using SSE is much faster.

Otherwise, it should be essentially even, with the fastest method
depending on the specific architecture.

Another important point is that with SSE atomic64_read can keep the
cacheline in shared state: a plain load needs no exclusive ownership
of the line, unlike the lock cmpxchg8b based implementation, which
writes.

If we could keep TS cleared and re-enable it only when returning to
userspace, this would be even faster, but that is left for a later
patch.

We use SSE because we can save just the low part of %xmm0, whereas
using the FPU or MMX would require at least saving the whole FPU
environment, which seems impossible to do quickly.
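
The user-visible API is unchanged; callers keep using the ordinary
accessors (illustrative usage only):

	atomic64_t v = ATOMIC64_INIT(0);
	long long x;

	atomic64_set(&v, 0x100000002LL);
	x = atomic64_read(&v);	/* a movlps load on SSE-capable CPUs */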

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
---
 arch/x86/include/asm/atomic_32.h |   10 ++++-
 arch/x86/lib/atomic64_32.c       |   67 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 1ab431c..d03e471 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -268,6 +268,9 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
+long long sse_atomic64_read_cx8call(long long, const atomic64_t *v);
+void sse_atomic64_set_cx8call(long long, unsigned high);
+
 long long cx8_atomic64_read_cx8call(long long, const atomic64_t *v);
 long long cx8_atomic64_set_cx8call(long long, const atomic64_t *v);
 long long cx8_atomic64_xchg_cx8call(long long, unsigned high);
@@ -281,8 +284,10 @@ int cx8_atomic64_add_unless(atomic64_t *v, long long a, long long u);
 
 #ifdef CONFIG_X86_CMPXCHG64
 #define ATOMIC64_ALTERNATIVE(f) "call cx8_atomic64_" #f
+#define ATOMIC64_ALTERNATIVE_XMM(f) ALTERNATIVE("call cx8_atomic64_" #f, "call sse_atomic64_" #f, X86_FEATURE_XMM)
 #else
 #define ATOMIC64_ALTERNATIVE(f) ALTERNATIVE("call generic_atomic64_" #f, "call cx8_atomic64_" #f, X86_FEATURE_CX8)
+#define ATOMIC64_ALTERNATIVE_XMM(f) ALTERNATIVE3("call generic_atomic64_" #f, "call cx8_atomic64_" #f, X86_FEATURE_CX8, "call sse_atomic64_" #f, X86_FEATURE_XMM)
 #endif
 
 /**
@@ -349,7 +354,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
 {
 	unsigned high = (unsigned)(i >> 32);
 	unsigned low = (unsigned)i;
-	asm volatile(ATOMIC64_ALTERNATIVE(set_cx8call)
+	asm volatile(ATOMIC64_ALTERNATIVE_XMM(set_cx8call)
 			: "+b" (low), "+c" (high)
 			: "S" (v)
 			: "eax", "edx", "memory"
@@ -365,7 +370,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
 static inline long long atomic64_read(atomic64_t *v)
 {
 	long long r;
-	asm volatile(ATOMIC64_ALTERNATIVE(read_cx8call)
+	asm volatile(ATOMIC64_ALTERNATIVE_XMM(read_cx8call)
 				: "=A" (r), "+c" (v)
 				: : "memory"
 				);
@@ -470,6 +475,7 @@ static inline int atomic64_inc_not_zero(atomic64_t *v)
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 
 #undef ATOMIC64_ALTERNATIVE
+#undef ATOMIC64_ALTERNATIVE_XMM
 
 #include <asm-generic/atomic-long.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index b7edbb3..9ff8589 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -61,6 +61,47 @@ EXPORT_SYMBOL(generic_atomic64_read_cx8call);
 
 #endif /* CONFIG_X86_CMPXCHG64 */
 
+struct sse_atomic64_percpu {
+	long long xmm0_low;
+	long low;
+	long high;
+};
+
+/* we actually only need 8-byte alignment, but using cacheline alignment is the only simple way to do this */
+/* we use a per-CPU variable because we need to disable preemption anyway and this is faster than
+ * aligning the stack pointer to 8 bytes
+ */
+DEFINE_PER_CPU_ALIGNED(struct sse_atomic64_percpu, sse_atomic64_percpu);
+
+/* using the FPU/MMX looks infeasible due to the need to save the FPU environment, which is very slow;
+ * SSE2 is slightly slower on Core 2 and less compatible, so avoid it for now
+ */
+long long sse_atomic64_read_cx8call(long long dummy, const atomic64_t *v)
+{
+	long long res;
+	unsigned long cr0 = 0;
+	struct thread_info *me = current_thread_info();
+	preempt_disable();
+	if (!(me->status & TS_USEDFPU)) {
+		cr0 = read_cr0();
+		if (cr0 & X86_CR0_TS)
+			clts();
+	}
+	asm volatile(
+			"movlps %%xmm0, " __percpu_arg(0) "\n\t"
+			"movlps %3, %%xmm0\n\t"
+			"movlps %%xmm0, " __percpu_arg(1) "\n\t"
+			"movlps " __percpu_arg(0) ", %%xmm0\n\t"
+			    : "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (per_cpu__sse_atomic64_percpu.low), "=m" (per_cpu__sse_atomic64_percpu.high)
+			    : "m" (v->counter));
+	if (cr0 & X86_CR0_TS)
+		write_cr0(cr0);
+	res = (long long)(unsigned)percpu_read(sse_atomic64_percpu.low) | ((long long)(unsigned)percpu_read(sse_atomic64_percpu.high) << 32);
+	preempt_enable();
+	return res;
+}
+EXPORT_SYMBOL(sse_atomic64_read_cx8call);
+
 register unsigned low asm("ebx");
 register atomic64_t *v asm("esi");
 
@@ -121,3 +162,29 @@ int generic_atomic64_inc_not_zero_cx8call(void)
 EXPORT_SYMBOL(generic_atomic64_inc_not_zero_cx8call);
 
 #endif /* CONFIG_X86_CMPXCHG64 */
+
+/* put this here because we need access to the global register variables */
+void sse_atomic64_set_cx8call(long long dummy, unsigned high)
+{
+	struct thread_info *me = current_thread_info();
+	unsigned long cr0 = 0;
+	preempt_disable();
+	percpu_write(sse_atomic64_percpu.low, low);
+	percpu_write(sse_atomic64_percpu.high, high);
+	if (!(me->status & TS_USEDFPU)) {
+		cr0 = read_cr0();
+		if (cr0 & X86_CR0_TS)
+			clts();
+	}
+	asm volatile(
+			"movlps %%xmm0, " __percpu_arg(0) "\n\t"
+			"movlps " __percpu_arg(2) ", %%xmm0\n\t"
+			"movlps %%xmm0, %1\n\t"
+			"movlps " __percpu_arg(0) ", %%xmm0\n\t"
+			    : "+m" (per_cpu__sse_atomic64_percpu.xmm0_low), "=m" (v->counter)
+			    : "m" (per_cpu__sse_atomic64_percpu.low), "m" (per_cpu__sse_atomic64_percpu.high));
+	if (cr0 & X86_CR0_TS)
+		write_cr0(cr0);
+	preempt_enable();
+}
+EXPORT_SYMBOL(sse_atomic64_set_cx8call);
-- 
1.6.6.1.476.g01ddb


