* [PATCH 2/4] x86/xor: improve XMM register spill/fill
@ 2012-09-10 12:38 Jan Beulich
From: Jan Beulich @ 2012-09-10 12:38 UTC
  To: mingo, tglx, hpa; +Cc: linux-kernel

Provided a new enough gcc is in use, we can avoid the potentially much
slower MOVUPS by making sure the stack frame and the variables being
spilled to are suitably aligned.
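
[Editorial illustration, not part of the patch: a minimal stand-alone
sketch of the approach with invented names (xmm0_spill_example, buf),
assuming a gcc new enough to honour these attributes.]

/*
 * On 32-bit, asking gcc to realign the stack on function entry makes a
 * 16-byte aligned local really 16-byte aligned.  On 64-bit, requesting
 * 32-byte alignment forces gcc to guarantee it, since the kernel never
 * promises more than the default on entry.  Either way the aligned
 * MOVAPS is safe; without such a guarantee MOVUPS would be needed.
 */
#ifdef CONFIG_X86_32
static void __attribute__((force_align_arg_pointer)) xmm0_spill_example(void)
{
	char buf[16 * 4] __attribute__((aligned(16)));

	asm volatile("movaps %%xmm0,(%0)" : : "r" (buf) : "memory");
}
#else
static void xmm0_spill_example(void)
{
	char buf[16 * 4] __attribute__((aligned(32)));

	asm volatile("movaps %%xmm0,(%0)" : : "r" (buf) : "memory");
}
#endif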

Signed-off-by: Jan Beulich <jbeulich@suse.com>

---
 arch/x86/include/asm/xor.h |   56 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 16 deletions(-)

--- 3.6-rc5-x86-xor.orig/arch/x86/include/asm/xor.h
+++ 3.6-rc5-x86-xor/arch/x86/include/asm/xor.h
@@ -36,16 +36,37 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
+# ifdef CONFIG_X86_32
+#  define XOR_ALIGN_STACK __attribute__((force_align_arg_pointer))
+#  define XOR_ALIGN 16
+# else
+/*
+ * By forcing the alignment beyond the default of 16 bytes, we make the
+ * compiler guarantee the alignment. Passing -mincoming-stack-boundary=3
+ * (which would have been the better global alternative, as the kernel
+ * never guarantees better stack alignment) isn't permitted on x86-64.
+ */
+#  define XOR_ALIGN_STACK
+#  define XOR_ALIGN 32
+# endif
+# define XOR_MOV "movaps"
+#else
+# define XOR_ALIGN_STACK
+# define XOR_ALIGN 16
+# define XOR_MOV "movups"
+#endif
+
 #define XMMS_SAVE				\
 do {						\
 	preempt_disable();			\
 	cr0 = read_cr0();			\
 	clts();					\
 	asm volatile(				\
-		"movups %%xmm0,(%0)	;\n\t"	\
-		"movups %%xmm1,0x10(%0)	;\n\t"	\
-		"movups %%xmm2,0x20(%0)	;\n\t"	\
-		"movups %%xmm3,0x30(%0)	;\n\t"	\
+		XOR_MOV " %%xmm0,(%0)	;\n\t"	\
+		XOR_MOV " %%xmm1,0x10(%0);\n\t"	\
+		XOR_MOV " %%xmm2,0x20(%0);\n\t"	\
+		XOR_MOV " %%xmm3,0x30(%0);\n\t"	\
 		:				\
 		: "r" (xmm_save) 		\
 		: "memory");			\
@@ -55,10 +76,10 @@ do {						\
 do {						\
 	asm volatile(				\
 		"sfence			;\n\t"	\
-		"movups (%0),%%xmm0	;\n\t"	\
-		"movups 0x10(%0),%%xmm1	;\n\t"	\
-		"movups 0x20(%0),%%xmm2	;\n\t"	\
-		"movups 0x30(%0),%%xmm3	;\n\t"	\
+		XOR_MOV " (%0),%%xmm0	;\n\t"	\
+		XOR_MOV " 0x10(%0),%%xmm1;\n\t"	\
+		XOR_MOV " 0x20(%0),%%xmm2;\n\t"	\
+		XOR_MOV " 0x30(%0),%%xmm3;\n\t"	\
 		:				\
 		: "r" (xmm_save)		\
 		: "memory");			\
@@ -87,11 +108,11 @@ do {						\
 #define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
 #define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
 
-static void
+static void XOR_ALIGN_STACK
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 {
 	unsigned long cr0, lines = bytes >> 8;
-	char xmm_save[16*4] __aligned(16);
+	char xmm_save[16*4] __aligned(XOR_ALIGN);
 
 	XMMS_SAVE;
 
@@ -139,12 +160,12 @@ xor_sse_2(unsigned long bytes, unsigned
 	XMMS_RESTORE;
 }
 
-static void
+static void XOR_ALIGN_STACK
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
 	unsigned long cr0, lines = bytes >> 8;
-	char xmm_save[16*4] __aligned(16);
+	char xmm_save[16*4] __aligned(XOR_ALIGN);
 
 	XMMS_SAVE;
 
@@ -199,12 +220,12 @@ xor_sse_3(unsigned long bytes, unsigned
 	XMMS_RESTORE;
 }
 
-static void
+static void XOR_ALIGN_STACK
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
 	unsigned long cr0, lines = bytes >> 8;
-	char xmm_save[16*4] __aligned(16);
+	char xmm_save[16*4] __aligned(XOR_ALIGN);
 
 	XMMS_SAVE;
 
@@ -266,12 +287,12 @@ xor_sse_4(unsigned long bytes, unsigned
 	XMMS_RESTORE;
 }
 
-static void
+static void XOR_ALIGN_STACK
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
 	unsigned long cr0, lines = bytes >> 8;
-	char xmm_save[16*4] __aligned(16);
+	char xmm_save[16*4] __aligned(XOR_ALIGN);
 
 	XMMS_SAVE;
 
@@ -348,6 +369,9 @@ xor_sse_5(unsigned long bytes, unsigned
 #undef ST
 #undef BLOCK
 
+#undef XOR_ALIGN_STACK
+#undef XOR_ALIGN
+#undef XOR_MOV
 #undef XOR_CONSTANT_CONSTRAINT
 
 #ifdef CONFIG_X86_32





* Re: [PATCH 2/4] x86/xor: improve XMM register spill/fill
From: H. Peter Anvin @ 2012-09-10 14:05 UTC
  To: Jan Beulich; +Cc: mingo, tglx, linux-kernel

On 09/10/2012 05:38 AM, Jan Beulich wrote:
> +/*
> + * By forcing the alignment beyond the default of 16 bytes, we make the
> + * compiler guarantee the alignment. Passing -mincoming-stack-boundary=3
> + * (which would have been the better global alternative, as the kernel
> + * never guarantees better stack alignment) isn't permitted on x86-64.
> + */

The very latest gcc should handle it, and in fact we compile with
-mstack-alignment=3 if gcc accepts it (if it is not yet upstream, it will
be soon). This affects the validity of this patch.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



* Re: [PATCH 2/4] x86/xor: improve XMM register spill/fill
From: Jan Beulich @ 2012-09-10 14:46 UTC
  To: H. Peter Anvin; +Cc: mingo, tglx, linux-kernel

>>> On 10.09.12 at 16:05, "H. Peter Anvin" <hpa@zytor.com> wrote:
> On 09/10/2012 05:38 AM, Jan Beulich wrote:
>> +/*
>> + * By forcing the alignment beyond the default of 16 bytes, we make the
>> + * compiler guarantee the alignment. Passing -mincoming-stack-boundary=3
>> + * (which would have been the better global alternative, as the kernel
>> + * never guarantees better stack alignment) isn't permitted on x86-64.
>> + */
> 
> The very latest gcc should handle it, and in fact we compile with 
> -mstack-alignment=3 if gcc accepts it (if it is not yet upstream it will 
> be soon.)  This affects the validity of this patch.

The comment would be stale with that, but the code should still
be fine - it would merely over-align the stack in that case (to
32 bytes when 16 would suffice). Or did you spot something else
that I'm missing?

Also, I can't spot any use of -mstack-alignment= in today's tip's
arch/x86/Makefile* - where's that hidden?

Jan


