All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync
@ 2007-03-09 15:08 Joerg Roedel
  2007-03-09 15:11 ` [PATCH 1/4 TRY#3] i386: extend alternative instructions framework Joerg Roedel
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-09 15:08 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

This series of patches extend the alternative instructions framework on
i386 and x86_64 architectures to support two alternative instruction
replacements. This code is used together with the introduction of the
X86_FEATURE_SYNC_RDTSC flag on i386 to simplify and optimize the
get_cycles_sync() function. The optimization changes this function to
use RDTSCP instead of CPUID;RDTSC if this instruction is available.
Don't use CPUID there is really important if the kernel runs as a KVM
guest, because this instruction is intercepted and causes an expensive
VMEXIT.

Changes to the previous submit:
 * rebased to current linus git tree
 * replaced RDTSCP usage in get_cycles_sync with the opcode to
   make it compile with older binutils

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/4 TRY#3] i386: extend alternative instructions framework
  2007-03-09 15:08 [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
@ 2007-03-09 15:11 ` Joerg Roedel
  2007-03-09 15:12 ` [PATCH 2/4 TRY#3] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-09 15:11 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 267 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch extends the alternative instructions framework to support 2
alternative instructions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: two-alternatives-i386.patch --]
[-- Type: text/plain, Size: 5024 bytes --]

diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 9eca21b..59f1770 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -153,14 +153,23 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
 	struct alt_instr *a;
-	u8 *instr;
+	u8 *instr, *replacement;
+	u8 replacementlen;
 	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
-		BUG_ON(a->replacementlen > a->instrlen);
-		if (!boot_cpu_has(a->cpuid))
+		if (boot_cpu_has(a->cpuid)) {
+			replacement = a->replacement;
+			replacementlen = a->replacementlen;
+		} else if ((a->replacementlen2 > 0) &&
+			   (boot_cpu_has(a->cpuid2))) {
+			replacement = a->replacement2;
+			replacementlen = a->replacementlen2;
+		} else
 			continue;
+
+		BUG_ON(replacementlen > a->instrlen);
 		instr = a->instr;
 #ifdef CONFIG_X86_64
 		/* vsyscall code is not mapped yet. resolve it manually. */
@@ -170,9 +179,9 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 				__FUNCTION__, a->instr, instr);
 		}
 #endif
-		memcpy(instr, a->replacement, a->replacementlen);
-		diff = a->instrlen - a->replacementlen;
-		nop_out(instr + a->replacementlen, diff);
+		memcpy(instr, replacement, replacementlen);
+		diff = a->instrlen - replacementlen;
+		nop_out(instr + replacementlen, diff);
 	}
 }
 
diff --git a/include/asm-i386/alternative.h b/include/asm-i386/alternative.h
index b8fa955..4a77e93 100644
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -10,11 +10,14 @@
 struct alt_instr {
 	u8 *instr; 		/* original instruction */
 	u8 *replacement;
+	u8 *replacement2;
 	u8  cpuid;		/* cpuid bit set for replacement */
+	u8  cpuid2;		/* cpuid bit set for replacement2 */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen; 	/* length of new instruction, <= instrlen */
-	u8  pad;
-};
+	u8  replacementlen2;
+	u8  pad[3];
+}  __attribute__ ((packed));
 
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
@@ -36,6 +39,12 @@ static inline void alternatives_smp_switch(int smp) {}
 #endif
 
 /*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io_*
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
+/*
  * Alternative instructions for different CPU types or capabilities.
  *
  * This allows to use optimized instructions even on generic binary
@@ -53,9 +62,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 4\n"					\
 		      "  .long 661b\n"            /* label */		\
 		      "  .long 663f\n"		  /* new instruction */	\
+		      "  .long 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */\
@@ -77,14 +89,38 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 4\n"					\
 		      "  .long 661b\n"            /* label */		\
 		      "  .long 663f\n"		  /* new instruction */ \
+		      "  .long 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */ 	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */\
 		      ".previous" :: "i" (feature), ##input)
 
+/* Like alternative_io, but supports 2 possible alternatives */
+#define alternative_io_two(oldinstr, newinstr, feat, newinstr2, feat2,\
+		output, input...) \
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
+		      ".section .altinstructions,\"a\"\n"               \
+		      "  .align 4\n"                                    \
+		      "  .long 661b\n"            /* label */           \
+		      "  .long 663f\n"            /* new instruction */ \
+		      "  .long 665f\n"            /* new instruction 2 */\
+		      "  .byte %c[f]\n"        /* feature bit */        \
+		      "  .byte %c[f2]\n"          /* feature bit 2*/    \
+		      "  .byte 662b-661b\n"       /* sourcelen */       \
+		      "  .byte 664f-663f\n"       /* replacementlen */  \
+		      "  .byte 666f-665f\n"       /* replacementlen 2 */\
+		      ".previous\n"                                     \
+		      ".section .altinstr_replacement,\"ax\"\n"         \
+		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+		      "665:\n\t" newinstr2 "\n666:\n" /* replacement2 */ \
+		      ".previous" : output : [f] "i" (feat), \
+		      [f2] "i" (feat2),##input)
+
 /*
  * Alternative inline assembly for SMP.
  *

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/4 TRY#3] x86_64: changes to x86_64 architecture for alternative instruction improvements
  2007-03-09 15:08 [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
  2007-03-09 15:11 ` [PATCH 1/4 TRY#3] i386: extend alternative instructions framework Joerg Roedel
@ 2007-03-09 15:12 ` Joerg Roedel
  2007-03-09 15:13 ` [PATCH 3/4 TRY#3] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
  2007-03-09 15:15 ` [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync() Joerg Roedel
  3 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-09 15:12 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 277 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

In this patch updates the x86_64 architecture to work with the changes
to alternative instructions in i386

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: two-alternatives-x86_64.patch --]
[-- Type: text/plain, Size: 5805 bytes --]

diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 9a10a78..ab525ee 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -53,7 +53,10 @@ ENDPROC(clear_page)
 	.align 8
 	.quad clear_page
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lclear_page_end - clear_page
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 727a5d4..b4d0329 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -113,7 +113,10 @@ ENDPROC(copy_page)
 	.align 8
 	.quad copy_page
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lcopy_page_end - copy_page
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 70bebd3..d505df3 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -27,9 +27,12 @@
 	.align 8
 	.quad  0b
 	.quad  2b
+	.quad  0
 	.byte  \feature		     /* when feature is set */
+	.byte  0
 	.byte  5
 	.byte  5
+	.byte  0
 	.previous
 	.endm
 
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 0ea0ddc..b1e1686 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -123,7 +123,10 @@ ENDPROC(__memcpy)
 	.align 8
 	.quad memcpy
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lfinal - memcpy
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2c59481..566e179 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -127,7 +127,10 @@ ENDPROC(__memset)
 	.align 8
 	.quad memset
 	.quad 1b
+	.quad 0
 	.byte X86_FEATURE_REP_GOOD
+	.byte 0
 	.byte .Lfinal - memset
 	.byte 2b - 1b
+	.byte 0
 	.previous
diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index a6657b4..63cd8e5 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -10,11 +10,14 @@
 struct alt_instr {
 	u8 *instr; 		/* original instruction */
 	u8 *replacement;
+	u8 *replacement2;
 	u8  cpuid;		/* cpuid bit set for replacement */
+	u8  cpuid2;		/* cpuid bit set for replacement2 */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen; 	/* length of new instruction, <= instrlen */
-	u8  pad[5];
-};
+	u8  replacementlen2;
+	u8  pad[3];
+} __attribute__ ((packed));
 
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
@@ -36,6 +39,12 @@ static inline void alternatives_smp_switch(int smp) {}
 
 #endif
 
+/*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io_*
+ */
+#define ASM_OUTPUT2(a, b) a, b
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
@@ -54,9 +63,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"				       \
 		      "  .quad 661b\n"            /* label */          \
 		      "  .quad 663f\n"		  /* new instruction */ \
+		      "  .quad 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */    \
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */      \
 		      "  .byte 664f-663f\n"       /* replacementlen */ \
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
@@ -78,9 +90,12 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"					\
 		      "  .quad 661b\n"            /* label */		\
 		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 0x00\n"					\
 		      "  .byte %c0\n"             /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
@@ -93,14 +108,37 @@ static inline void alternatives_smp_switch(int smp) {}
 		      "  .align 8\n"					\
 		      "  .quad 661b\n"            /* label */		\
 		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 0x00\n"					\
 		      "  .byte %c[feat]\n"        /* feature bit */	\
+		      "  .byte 0x00\n"					\
 		      "  .byte 662b-661b\n"       /* sourcelen */	\
 		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 0x00\n"					\
 		      ".previous\n"					\
 		      ".section .altinstr_replacement,\"ax\"\n"		\
 		      "663:\n\t" newinstr "\n664:\n"   /* replacement */ \
 		      ".previous" : output : [feat] "i" (feature), ##input)
 
+/* Like alternative_io, but supports 2 possible alternatives */
+#define alternative_io_two(oldinstr, newinstr, feat, newinstr2, feat2,\
+		output, input...) \
+	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
+		      ".section .altinstructions,\"a\"\n"		\
+		      "  .align 8\n"					\
+		      "  .quad 661b\n"            /* label */		\
+		      "  .quad 663f\n"		  /* new instruction */	\
+		      "  .quad 665f\n"		  /* new instruction 2 */\
+		      "  .byte %c[f]\n"        /* feature bit */	\
+		      "  .byte %c[f2]\n"	  /* feature bit 2*/	\
+		      "  .byte 662b-661b\n"       /* sourcelen */	\
+		      "  .byte 664f-663f\n"       /* replacementlen */	\
+		      "  .byte 666f-665f\n"	  /* replacementlen 2 */\
+		      ".previous\n"					\
+		      ".section .altinstr_replacement,\"ax\"\n"		\
+		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
+		      "665:\n\t" newinstr2 "\n666:\n" /* replacement2 */ \
+		      ".previous" : output : [f] "i" (feat), \
+			[f2] "i" (feat2),##input)
 /*
  * Alternative inline assembly for SMP.
  *

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 3/4 TRY#3] i386: add the X86_FEATURE_SYNC_RDTSC flag
  2007-03-09 15:08 [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
  2007-03-09 15:11 ` [PATCH 1/4 TRY#3] i386: extend alternative instructions framework Joerg Roedel
  2007-03-09 15:12 ` [PATCH 2/4 TRY#3] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
@ 2007-03-09 15:13 ` Joerg Roedel
  2007-03-09 15:15 ` [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync() Joerg Roedel
  3 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-09 15:13 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 337 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch adds the  X86_FEATURE_SYNC_RDTSC to the i386 architecture.
This is very helpfull to simplify the get_cycles_sync() function and
remove the #ifdefs from it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: i386-x86_sync_rdtsc.patch --]
[-- Type: text/plain, Size: 1673 bytes --]

diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 41cfea5..11f5730 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -241,6 +241,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	if (cpuid_eax(0x80000000) >= 0x80000006)
 		num_cache_leaves = 3;
+
+	clear_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
 }
 
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 56fe265..403a495 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -188,8 +188,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	if (c->x86 == 15)
+	if (c->x86 == 15) {
 		set_bit(X86_FEATURE_P4, c->x86_capability);
+		set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
+	} else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
 	if (c->x86 == 6) 
 		set_bit(X86_FEATURE_P3, c->x86_capability);
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 3f92b94..a9f1f01 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -75,6 +76,7 @@
 #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
 #define X86_FEATURE_PEBS	(3*32+12)  /* Precise-Event Based Sampling */
 #define X86_FEATURE_BTS		(3*32+13)  /* Branch Trace Store */
+#define X86_FEATURE_SYNC_RDTSC  (3*32+14)  /* RDTSC is serializing */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* Streaming SIMD Extensions-3 */

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-09 15:08 [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
                   ` (2 preceding siblings ...)
  2007-03-09 15:13 ` [PATCH 3/4 TRY#3] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
@ 2007-03-09 15:15 ` Joerg Roedel
  2007-03-09 18:10   ` Avi Kivity
  3 siblings, 1 reply; 13+ messages in thread
From: Joerg Roedel @ 2007-03-09 15:15 UTC (permalink / raw)
  To: discuss; +Cc: Andi Kleen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 518 bytes --]

From: Joerg Roedel <joerg.roedel@amd.com>

This patch simplifies the get_cycles_sync() function by removing
the #ifdefs from it. Further it introduces an optimization for AMD
processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
which is helpfull if the kernel runs as a KVM guest. Running as a guest
makes CPUID very expensive because it causes an intercept of the guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG

[-- Attachment #2: optimized-get_cycles_sync.patch --]
[-- Type: text/plain, Size: 1807 bytes --]

diff --git a/include/asm-i386/cpufeature.h b/include/asm-i386/cpufeature.h
index 3f92b94..a9f1f01 100644
--- a/include/asm-i386/cpufeature.h
+++ b/include/asm-i386/cpufeature.h
@@ -49,6 +49,7 @@
 #define X86_FEATURE_MP		(1*32+19) /* MP Capable. */
 #define X86_FEATURE_NX		(1*32+20) /* Execute Disable */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_RDTSCP      (1*32+27) /* RDTSCP */
 #define X86_FEATURE_LM		(1*32+29) /* Long Mode (x86-64) */
 #define X86_FEATURE_3DNOWEXT	(1*32+30) /* AMD 3DNow! extensions */
 #define X86_FEATURE_3DNOW	(1*32+31) /* 3DNow! */
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index 84016ff..0b769ad 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -7,6 +7,7 @@
 #define _ASM_i386_TSC_H
 
 #include <asm/processor.h>
+#include <asm/alternative.h>
 
 /*
  * Standard way to access the cycle counter.
@@ -34,22 +35,16 @@ static inline cycles_t get_cycles(void)
 /* Like get_cycles, but make sure the CPU is synchronized. */
 static __always_inline cycles_t get_cycles_sync(void)
 {
-	unsigned long long ret;
-#ifdef X86_FEATURE_SYNC_RDTSC
-	unsigned eax;
+	unsigned int a, d;
 
-	/*
-	 * Don't do an additional sync on CPUs where we know
-	 * RDTSC is already synchronous:
-	 */
-	alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
-			  "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
-#else
-	sync_core();
-#endif
-	rdtscll(ret);
+#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
+	alternative_io_two("cpuid\nrdtsc",
+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
+			   ASM_OUTPUT2("=a" (a), "=d" (d)),
+			   "0" (1) : "ecx", "memory");
 
-	return ret;
+	return ((unsigned long long)a) | (((unsigned long long)d)<<32);
 }
 
 extern void tsc_init(void);

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-09 15:15 ` [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync() Joerg Roedel
@ 2007-03-09 18:10   ` Avi Kivity
  2007-03-12 13:02     ` Joerg Roedel
  0 siblings, 1 reply; 13+ messages in thread
From: Avi Kivity @ 2007-03-09 18:10 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: discuss, Andi Kleen, linux-kernel

Joerg Roedel wrote:
> From: Joerg Roedel <joerg.roedel@amd.com>
>
> This patch simplifies the get_cycles_sync() function by removing
> the #ifdefs from it. Further it introduces an optimization for AMD
> processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> which is helpfull if the kernel runs as a KVM guest. Running as a guest
> makes CPUID very expensive because it causes an intercept of the guest.
>
>   
> +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> +	alternative_io_two("cpuid\nrdtsc",
> +			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> +			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
>   

why not use the RDTSCP macro here?


-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-09 18:10   ` Avi Kivity
@ 2007-03-12 13:02     ` Joerg Roedel
  2007-03-12 13:09       ` [discuss] " Andi Kleen
                         ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-12 13:02 UTC (permalink / raw)
  To: Avi Kivity; +Cc: discuss, Andi Kleen, linux-kernel

On Fri, Mar 09, 2007 at 08:10:03PM +0200, Avi Kivity wrote:
> Joerg Roedel wrote:
> >From: Joerg Roedel <joerg.roedel@amd.com>
> >
> >This patch simplifies the get_cycles_sync() function by removing
> >the #ifdefs from it. Further it introduces an optimization for AMD
> >processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> >which is helpfull if the kernel runs as a KVM guest. Running as a guest
> >makes CPUID very expensive because it causes an intercept of the guest.
> >
> >  +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> >+	alternative_io_two("cpuid\nrdtsc",
> >+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> >+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> >  
> 
> why not use the RDTSCP macro here?

Does this macro exist? I couldn't found it in the current git tree. And
the rdtscp macros in msr.h use the plain opcode too.

Regards,
Joerg

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 13:02     ` Joerg Roedel
@ 2007-03-12 13:09       ` Andi Kleen
  2007-03-12 13:21         ` Joerg Roedel
  2007-03-12 13:29       ` Michael Matz
  2007-03-12 16:08       ` Avi Kivity
  2 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2007-03-12 13:09 UTC (permalink / raw)
  To: discuss; +Cc: Joerg Roedel, Avi Kivity, linux-kernel

On Monday 12 March 2007 14:02, Joerg Roedel wrote:
> On Fri, Mar 09, 2007 at 08:10:03PM +0200, Avi Kivity wrote:
> > Joerg Roedel wrote:
> > >From: Joerg Roedel <joerg.roedel@amd.com>
> > >
> > >This patch simplifies the get_cycles_sync() function by removing
> > >the #ifdefs from it. Further it introduces an optimization for AMD
> > >processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> > >which is helpfull if the kernel runs as a KVM guest. Running as a guest
> > >makes CPUID very expensive because it causes an intercept of the guest.
> > >
> > >  +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> > >+	alternative_io_two("cpuid\nrdtsc",
> > >+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> > >+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> > >  
> > 
> > why not use the RDTSCP macro here?
> 
> Does this macro exist? I couldn't found it in the current git tree. And
> the rdtscp macros in msr.h use the plain opcode too.

It doesn't exist. The rdtscp macros are also not used currently, that
is why nobody's binutils complained.

Doing the .bytes is ok

I still don't like the alternative() record complications though.

-Andi

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 13:09       ` [discuss] " Andi Kleen
@ 2007-03-12 13:21         ` Joerg Roedel
  0 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-12 13:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: discuss, Avi Kivity, linux-kernel

On Mon, Mar 12, 2007 at 02:09:18PM +0100, Andi Kleen wrote:
> On Monday 12 March 2007 14:02, Joerg Roedel wrote:
> > On Fri, Mar 09, 2007 at 08:10:03PM +0200, Avi Kivity wrote:
> > > Joerg Roedel wrote:
> > > >From: Joerg Roedel <joerg.roedel@amd.com>
> > > >
> > > >This patch simplifies the get_cycles_sync() function by removing
> > > >the #ifdefs from it. Further it introduces an optimization for AMD
> > > >processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> > > >which is helpfull if the kernel runs as a KVM guest. Running as a guest
> > > >makes CPUID very expensive because it causes an intercept of the guest.
> > > >
> > > >  +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> > > >+	alternative_io_two("cpuid\nrdtsc",
> > > >+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> > > >+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> > > >  
> > > 
> > > why not use the RDTSCP macro here?
> > 
> > Does this macro exist? I couldn't found it in the current git tree. And
> > the rdtscp macros in msr.h use the plain opcode too.
> 
> It doesn't exist. The rdtscp macros are also not used currently, that
> is why nobody's binutils complained.
> 
> Doing the .bytes is ok
> 
> I still don't like the alternative() record complications though.

Do you think of another way to make use of RDTSCP in the get_cycles_sync
function? Using CPUID in a function called such often is bad when
running Linux as a  virtualization guest...
So using RDTSCP there might be a goog idea.

Regards,
Joerg

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 13:02     ` Joerg Roedel
  2007-03-12 13:09       ` [discuss] " Andi Kleen
@ 2007-03-12 13:29       ` Michael Matz
  2007-03-12 13:45         ` Joerg Roedel
  2007-03-12 16:08       ` Avi Kivity
  2 siblings, 1 reply; 13+ messages in thread
From: Michael Matz @ 2007-03-12 13:29 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: Avi Kivity, discuss, Andi Kleen, linux-kernel

Hi Joerg,

On Mon, 12 Mar 2007, Joerg Roedel wrote:

> > >+#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> > >+	alternative_io_two("cpuid\nrdtsc",
> > >+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> > >+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> > >  
> > 
> > why not use the RDTSCP macro here?
> 
> Does this macro exist?

Look carefully at your patch again, or at least the four quoted lines 
above.  You've added it yourself, in exactly the form you'd need in the 
alternative_io_two() call :-)

> I couldn't found it in the current git tree. And the rdtscp macros in 
> msr.h use the plain opcode too.


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 13:29       ` Michael Matz
@ 2007-03-12 13:45         ` Joerg Roedel
  0 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-12 13:45 UTC (permalink / raw)
  To: Michael Matz; +Cc: Avi Kivity, discuss, Andi Kleen, linux-kernel

On Mon, Mar 12, 2007 at 02:29:43PM +0100, Michael Matz wrote:
> Hi Joerg,
> 
> On Mon, 12 Mar 2007, Joerg Roedel wrote:
> 
> > > >+#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> > > >+	alternative_io_two("cpuid\nrdtsc",
> > > >+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> > > >+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> > > >  
> > > 
> > > why not use the RDTSCP macro here?
> > 
> > Does this macro exist?
> 
> Look carefully at your patch again, or at least the four quoted lines 
> above.  You've added it yourself, in exactly the form you'd need in the 
> alternative_io_two() call :-)

Hmmkay, thanks for opening my eyes :-)
I considered defining this macro while writing this patch, but decided
against this because the X86_FEATURE_RDTSCP on the same line should
documenting the opcode sufficiently. I just forgot to remove that
#define :)

Thanks again,
Joerg

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 13:02     ` Joerg Roedel
  2007-03-12 13:09       ` [discuss] " Andi Kleen
  2007-03-12 13:29       ` Michael Matz
@ 2007-03-12 16:08       ` Avi Kivity
  2007-03-12 16:24         ` Joerg Roedel
  2 siblings, 1 reply; 13+ messages in thread
From: Avi Kivity @ 2007-03-12 16:08 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: discuss, Andi Kleen, linux-kernel

Joerg Roedel wrote:
> On Fri, Mar 09, 2007 at 08:10:03PM +0200, Avi Kivity wrote:
>   
>> Joerg Roedel wrote:
>>     
>>> From: Joerg Roedel <joerg.roedel@amd.com>
>>>
>>> This patch simplifies the get_cycles_sync() function by removing
>>> the #ifdefs from it. Further it introduces an optimization for AMD
>>> processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
>>> which is helpfull if the kernel runs as a KVM guest. Running as a guest
>>> makes CPUID very expensive because it causes an intercept of the guest.
>>>
>>>  +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
>>> +	alternative_io_two("cpuid\nrdtsc",
>>> +			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
>>> +			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
>>>  
>>>       
>> why not use the RDTSCP macro here?
>>     
>
> Does this macro exist? I couldn't found it in the current git tree. And
> the rdtscp macros in msr.h use the plain opcode too.
>   

-ENOCOFFEE?  You defined the macro 3 lines above the .byte directive...


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync()
  2007-03-12 16:08       ` Avi Kivity
@ 2007-03-12 16:24         ` Joerg Roedel
  0 siblings, 0 replies; 13+ messages in thread
From: Joerg Roedel @ 2007-03-12 16:24 UTC (permalink / raw)
  To: Avi Kivity; +Cc: discuss, Andi Kleen, linux-kernel

On Mon, Mar 12, 2007 at 06:08:11PM +0200, Avi Kivity wrote:
> Joerg Roedel wrote:
> >On Fri, Mar 09, 2007 at 08:10:03PM +0200, Avi Kivity wrote:
> >  
> >>Joerg Roedel wrote:
> >>    
> >>>From: Joerg Roedel <joerg.roedel@amd.com>
> >>>
> >>>This patch simplifies the get_cycles_sync() function by removing
> >>>the #ifdefs from it. Further it introduces an optimization for AMD
> >>>processors. There the RDTSCP instruction is used instead of CPUID;RDTSC
> >>>which is helpfull if the kernel runs as a KVM guest. Running as a guest
> >>>makes CPUID very expensive because it causes an intercept of the guest.
> >>>
> >>> +#define RDTSCP ".byte 0x0f, 0x01, 0xf9"
> >>>+	alternative_io_two("cpuid\nrdtsc",
> >>>+			   "rdtsc", X86_FEATURE_SYNC_RDTSC,
> >>>+			   ".byte 0x0f, 0x01, 0xf9", X86_FEATURE_RDTSCP,
> >>>       
> >>why not use the RDTSCP macro here?
> >>    
> >
> >Does this macro exist? I couldn't found it in the current git tree. And
> >the rdtscp macros in msr.h use the plain opcode too.
> >  
> 
> -ENOCOFFEE?  You defined the macro 3 lines above the .byte directive...

Yes, lack of coffee, and even worse -EBLINDHACKER :-(

-- 
Joerg Roedel
Operating System Research Center
AMD Saxony LLC & Co. KG



^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2007-03-12 16:26 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-09 15:08 [PATCH 0/4 TRY#3] improve alternative instruction code and optimize get_cycles_sync Joerg Roedel
2007-03-09 15:11 ` [PATCH 1/4 TRY#3] i386: extend alternative instructions framework Joerg Roedel
2007-03-09 15:12 ` [PATCH 2/4 TRY#3] x86_64: changes to x86_64 architecture for alternative instruction improvements Joerg Roedel
2007-03-09 15:13 ` [PATCH 3/4 TRY#3] i386: add the X86_FEATURE_SYNC_RDTSC flag Joerg Roedel
2007-03-09 15:15 ` [PATCH 4/4 TRY#3] optimize and simplify get_cycles_sync() Joerg Roedel
2007-03-09 18:10   ` Avi Kivity
2007-03-12 13:02     ` Joerg Roedel
2007-03-12 13:09       ` [discuss] " Andi Kleen
2007-03-12 13:21         ` Joerg Roedel
2007-03-12 13:29       ` Michael Matz
2007-03-12 13:45         ` Joerg Roedel
2007-03-12 16:08       ` Avi Kivity
2007-03-12 16:24         ` Joerg Roedel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.