From: Andy Lutomirski <luto@MIT.EDU>
To: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@elte.hu>,
	Andi Kleen <andi@firstfloor.org>,
	linux-kernel@vger.kernel.org, Andy Lutomirski <luto@MIT.EDU>
Subject: [RFT/PATCH v2 5/6] x86-64: Move vread_tsc into a new file with sensible options
Date: Wed,  6 Apr 2011 22:04:02 -0400
Message-ID: <6fc6093ec1acb82860cb941d63b2b9aa288bae4c.1302137785.git.luto@mit.edu>
In-Reply-To: <cover.1302137785.git.luto@mit.edu>

vread_tsc is short and hot, and it's userspace code, so the usual
reasons to keep frame pointers around, enable -pg, and turn off
sibling calls don't apply: nothing in the kernel can trace or
unwind code that executes in user mode, and an mcount call
emitted by -pg would fault there anyway.

(OK, turning off sibling calls has no effect.  But it might
someday...)

As an added benefit, tsc.c is profilable now.

Signed-off-by: Andy Lutomirski <luto@mit.edu>
---
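
A note for reviewers: the barrier logic below leans on
rdtsc_barrier(), which is defined elsewhere (asm/system.h).  As a
rough sketch from memory -- check your tree for the authoritative
definition -- it uses the alternatives machinery to patch in
whichever fence the CPU needs to order rdtsc:

	/*
	 * Approximate definition of rdtsc_barrier(), for reference
	 * only.  At boot, the alternatives code rewrites the nops:
	 * mfence where the CPU sets X86_FEATURE_MFENCE_RDTSC (AMD),
	 * lfence where it sets X86_FEATURE_LFENCE_RDTSC (Intel).
	 */
	static __always_inline void rdtsc_barrier(void)
	{
		alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
		alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
	}
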
 arch/x86/include/asm/tsc.h     |    4 +++
 arch/x86/kernel/Makefile       |    8 +++--
 arch/x86/kernel/tsc.c          |   53 --------------------------------------
 arch/x86/kernel/vread_tsc_64.c |   55 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 56 deletions(-)
 create mode 100644 arch/x86/kernel/vread_tsc_64.c

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132f..8f2b3c6 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
 extern int check_tsc_unstable(void);
 extern unsigned long native_calibrate_tsc(void);
 
+#ifdef CONFIG_X86_64
+extern cycles_t vread_tsc(void);
+#endif
+
 /*
  * Boot-time check whether the TSCs are synchronized across
  * all CPUs/cores:
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34244b2..7626fb8 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
-CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_pvclock.o = -pg
@@ -24,13 +23,16 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
-CFLAGS_tsc.o		:= $(nostackp)
+CFLAGS_vread_tsc_64.o	:= $(nostackp)
 CFLAGS_paravirt.o	:= $(nostackp)
 GCOV_PROFILE_vsyscall_64.o	:= n
 GCOV_PROFILE_hpet.o		:= n
 GCOV_PROFILE_tsc.o		:= n
 GCOV_PROFILE_paravirt.o		:= n
 
+# vread_tsc_64 is hot and should be fully optimized:
+CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-omit-frame-pointer -fno-optimize-sibling-calls
+
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
@@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o vread_tsc_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 69ff619..5346381 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,59 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
 		ret : clocksource_tsc.cycle_last;
 }
 
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret;
-	u64 zero, last;
-
-	/*
-	 * rdtsc is unordered, and we want it to be ordered like
-	 * a load with respect to other CPUs (and we don't want
-	 * it to execute absurdly early wrt code on this CPU).
-	 * rdtsc_barrier() is a barrier that provides this ordering
-	 * with respect to *earlier* loads.  (Which barrier to use
-	 * depends on the CPU.)
-	 */
-	rdtsc_barrier();
-
-	asm volatile ("rdtsc\n\t"
-		      "shl $0x20,%%rdx\n\t"
-		      "or %%rdx,%%rax\n\t"
-		      "shl $0x20,%%rdx"
-		      : "=a" (ret), "=d" (zero) : : "cc");
-
-	/*
-	 * zero == 0, but as far as the processor is concerned, zero
-	 * depends on the output of rdtsc.  So we can use it as a
-	 * load barrier by loading something that depends on it.
-	 * x86-64 keeps all loads in order wrt each other, so this
-	 * ensures that rdtsc is ordered wrt all later loads.
-	 */
-
-	/*
-	 * This doesn't multiply 'zero' by anything, which generates
-	 * very slightly nicer code than multiplying it by 8.
-	 */
-	last = *( (cycle_t *)
-		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
-
-	if (likely(ret >= last))
-		return ret;
-
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a funciton of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead.  I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-#endif
-
 static void resume_tsc(struct clocksource *cs)
 {
 	clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 0000000..856330e
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,55 @@
+/* This code runs in userspace. */
+
+#define DISABLE_BRANCH_PROFILING
+#include <asm/vgtod.h>
+
+notrace cycle_t __vsyscall_fn vread_tsc(void)
+{
+	cycle_t ret;
+	u64 zero, last;
+
+	/*
+	 * rdtsc is unordered, and we want it to be ordered like
+	 * a load with respect to other CPUs (and we don't want
+	 * it to execute absurdly early wrt code on this CPU).
+	 * rdtsc_barrier() is a barrier that provides this ordering
+	 * with respect to *earlier* loads.  (Which barrier to use
+	 * depends on the CPU.)
+	 */
+	rdtsc_barrier();
+
+	asm volatile ("rdtsc\n\t"
+		      "shl $0x20,%%rdx\n\t"
+		      "or %%rdx,%%rax\n\t"
+		      "shl $0x20,%%rdx"
+		      : "=a" (ret), "=d" (zero) : : "cc");
+
+	/*
+	 * zero == 0, but as far as the processor is concerned, zero
+	 * depends on the output of rdtsc.  So we can use it as a
+	 * load barrier by loading something that depends on it.
+	 * x86-64 keeps all loads in order wrt each other, so this
+	 * ensures that rdtsc is ordered wrt all later loads.
+	 */
+
+	/*
+	 * This doesn't multiply 'zero' by anything, which generates
+	 * very slightly nicer code than multiplying it by 8.
+	 */
+	last = *( (cycle_t *)
+		  ((char *)&VVAR(vsyscall_gtod_data).clock.cycle_last + zero) );
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time, and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
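
(Aside, for anyone who wants to experiment with the data-dependence
trick outside the kernel: a minimal standalone sketch, with a made-up
function name -- this is illustration, not kernel code.)

	#include <stdint.h>

	/*
	 * Read the TSC, then load *p with the load forced to be
	 * data-dependent on the rdtsc result.  'zero' is always 0
	 * after the second shift, but the CPU still treats the load
	 * address as depending on rdtsc's output, so the load can't
	 * be reordered ahead of the rdtsc.
	 */
	static inline uint64_t load_after_rdtsc(const uint64_t *p,
						uint64_t *tsc)
	{
		uint64_t ret, zero;

		asm volatile ("rdtsc\n\t"
			      "shl $0x20,%%rdx\n\t"
			      "or %%rdx,%%rax\n\t"
			      "shl $0x20,%%rdx"
			      : "=a" (ret), "=d" (zero) : : "cc");
		*tsc = ret;
		return *(const uint64_t *)((const char *)p + zero);
	}

The empty asm volatile ("") near the end of vread_tsc is a separate
trick: it acts as an optimization barrier that splits the control
flow, so GCC emits the (highly predictable) conditional branch
instead of a cmov, which would add a data dependence on the compare.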
-- 
1.7.4



Thread overview: 27+ messages
2011-04-07  2:03 [RFT/PATCH v2 0/6] Micro-optimize vclock_gettime Andy Lutomirski
2011-04-07  2:03 ` [RFT/PATCH v2 1/6] x86-64: Clean up vdso/kernel shared variables Andy Lutomirski
2011-04-07  8:08   ` Ingo Molnar
2011-04-07  2:03 ` [RFT/PATCH v2 2/6] x86-64: Optimize vread_tsc's barriers Andy Lutomirski
2011-04-07  8:25   ` Ingo Molnar
2011-04-07 11:44     ` Andrew Lutomirski
2011-04-07 15:23     ` Andi Kleen
2011-04-07 17:28       ` Ingo Molnar
2011-04-07 16:18   ` Linus Torvalds
2011-04-07 16:42     ` Andi Kleen
2011-04-07 17:20       ` Linus Torvalds
2011-04-07 18:15         ` Andi Kleen
2011-04-07 18:30           ` Linus Torvalds
2011-04-07 21:26             ` Andrew Lutomirski
2011-04-08 17:59               ` Andrew Lutomirski
2011-04-09 11:51                 ` Ingo Molnar
2011-04-07 21:43         ` Raghavendra D Prabhu
2011-04-07 22:52           ` Andi Kleen
2011-04-07  2:04 ` [RFT/PATCH v2 3/6] x86-64: Don't generate cmov in vread_tsc Andy Lutomirski
2011-04-07  7:54   ` Ingo Molnar
2011-04-07 11:25     ` Andrew Lutomirski
2011-04-07  2:04 ` [RFT/PATCH v2 4/6] x86-64: vclock_gettime(CLOCK_MONOTONIC) can't ever see nsec < 0 Andy Lutomirski
2011-04-07  7:57   ` Ingo Molnar
2011-04-07 11:27     ` Andrew Lutomirski
2011-04-07  2:04 ` Andy Lutomirski [this message]
2011-04-07  2:04 ` [RFT/PATCH v2 6/6] x86-64: Turn off -pg and turn on -foptimize-sibling-calls for vDSO Andy Lutomirski
2011-04-07  8:03   ` Ingo Molnar
