All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM
@ 2009-09-16 12:45 Avi Kivity
  2009-09-16 12:45 ` [PATCH 1/4] core, x86: Add user return notifiers Avi Kivity
                   ` (4 more replies)
  0 siblings, 5 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 12:45 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

The various x86 syscall related MSRs (MSR_LSTAR and friends, EFER when SCE
needs to be updated) are fairly expensive to read or write.  Since different
operating systems can set different values for these MSRs, KVM needs to reload
them when switching to a different guest or to the host.

Switching on every guest entry/exit is too expensive, so KVM reloads on
guest preemption, which is a lot rarer.  Even so, preemption based reload
is suboptimal:

- if we're switching to a kernel thread and back, there's no need to reload
  the MSRs.  Examples of kernel threads we're likely to switch to are:

  - the idle task
  - a threaded interrupt handler
  - a kernel-mode virtio server (vhost-net)

- if we're switching to a guest running the same OS, the MSRs will have the
  same values and there's no need to reload them

- if the guest and host run the same OS, again the MSRs need not be reloaded.

This patchset implements just-in-time reloads to defer them to the last
possible instant.  When we do reload, we check whether the values have in
fact changed and reload conditionally.

For the just-in-time reloads the first patch implements "user return
notifiers", a callback invoked just before return to userspace.  This has
been written so that there is no code impact if KVM is not configured, and
no runtime impact if KVM is not running.

The patchset improves guest/idle/guest switches by about 2000 cycles.

Avi Kivity (4):
  core, x86: Add user return notifiers
  KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area
  KVM: x86 shared msr infrastructure
  KVM: VMX: Use shared msr infrastructure

 arch/Kconfig                       |   10 +++
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/kvm_host.h    |    3 +
 arch/x86/include/asm/thread_info.h |    7 +-
 arch/x86/kernel/process.c          |    2 +
 arch/x86/kernel/signal.c           |    3 +
 arch/x86/kvm/Kconfig               |    1 +
 arch/x86/kvm/vmx.c                 |  142 +++++++++++++++--------------------
 arch/x86/kvm/x86.c                 |   79 ++++++++++++++++++++
 kernel/Makefile                    |    1 +
 10 files changed, 166 insertions(+), 83 deletions(-)


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH 1/4] core, x86: Add user return notifiers
  2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
@ 2009-09-16 12:45 ` Avi Kivity
  2009-09-18 18:14   ` H. Peter Anvin
  2009-09-16 12:45 ` [PATCH 2/4] KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area Avi Kivity
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 12:45 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/Kconfig                       |   10 ++++++++++
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/thread_info.h |    7 +++++--
 arch/x86/kernel/process.c          |    2 ++
 arch/x86/kernel/signal.c           |    3 +++
 kernel/Makefile                    |    1 +
 6 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 99193b1..2275b46 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -70,6 +70,13 @@ config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+	bool
+	depends on HAVE_USER_RETURN_NOTIFIER
+	help
+	  Provide a kernel-internal notification when a cpu is about to
+	  switch to user mode.
+
 config HAVE_IOREMAP_PROT
 	bool
 
@@ -113,4 +120,7 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_USER_RETURN_NOTIFIER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 13ffa5d..5fc4fae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fad7d40..a8f46de 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -141,13 +143,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a..7ea6972 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -227,6 +228,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4c57875..bc0db10 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -870,6 +871,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a69..11c5de5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-- 
1.6.4.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 2/4] KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area
  2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
  2009-09-16 12:45 ` [PATCH 1/4] core, x86: Add user return notifiers Avi Kivity
@ 2009-09-16 12:45 ` Avi Kivity
  2009-09-16 12:45 ` [PATCH 3/4] KVM: x86 shared msr infrastructure Avi Kivity
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 12:45 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

Currently MSR_KERNEL_GS_BASE is saved and restored as part of the
guest/host msr reloading.  Since we wish to lazy-restore all the other
msrs, save and reload MSR_KERNEL_GS_BASE explicitly instead of using
the common code.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c |   34 +++++++++++++++++++++-------------
 1 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d3213ac..547881a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -80,7 +80,8 @@ struct vcpu_vmx {
 	int                   save_nmsrs;
 	int                   msr_offset_efer;
 #ifdef CONFIG_X86_64
-	int                   msr_offset_kernel_gs_base;
+	u64 		      msr_host_kernel_gs_base;
+	u64 		      msr_guest_kernel_gs_base;
 #endif
 	struct vmcs          *vmcs;
 	struct {
@@ -183,7 +184,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
  */
 static const u32 vmx_msr_index[] = {
 #ifdef CONFIG_X86_64
-	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
+	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
 #endif
 	MSR_EFER, MSR_K6_STAR,
 };
@@ -649,10 +650,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 #endif
 
 #ifdef CONFIG_X86_64
-	if (is_long_mode(&vmx->vcpu))
-		save_msrs(vmx->host_msrs +
-			  vmx->msr_offset_kernel_gs_base, 1);
-
+	if (is_long_mode(&vmx->vcpu)) {
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+	}
 #endif
 	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 	load_transition_efer(vmx);
@@ -686,6 +687,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
 	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
 	reload_host_efer(vmx);
+#ifdef CONFIG_X86_64
+	if (is_long_mode(&vmx->vcpu)) {
+		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+	}
+#endif
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -915,9 +922,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		index = __find_msr_index(vmx, MSR_CSTAR);
 		if (index >= 0)
 			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
 		/*
 		 * MSR_K6_STAR is only needed on long mode guests, and only
 		 * if efer.sce is enabled.
@@ -929,10 +933,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 #endif
 	vmx->save_nmsrs = save_nmsrs;
 
-#ifdef CONFIG_X86_64
-	vmx->msr_offset_kernel_gs_base =
-		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
-#endif
 	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
 
 	if (cpu_has_vmx_msr_bitmap()) {
@@ -990,6 +990,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 	case MSR_GS_BASE:
 		data = vmcs_readl(GUEST_GS_BASE);
 		break;
+	case MSR_KERNEL_GS_BASE:
+		vmx_load_host_state(to_vmx(vcpu));
+		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+		break;
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_index, pdata);
 #endif
@@ -1043,6 +1047,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 	case MSR_GS_BASE:
 		vmcs_writel(GUEST_GS_BASE, data);
 		break;
+	case MSR_KERNEL_GS_BASE:
+		vmx_load_host_state(vmx);
+		vmx->msr_guest_kernel_gs_base = data;
+		break;
 #endif
 	case MSR_IA32_SYSENTER_CS:
 		vmcs_write32(GUEST_SYSENTER_CS, data);
-- 
1.6.4.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 3/4] KVM: x86 shared msr infrastructure
  2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
  2009-09-16 12:45 ` [PATCH 1/4] core, x86: Add user return notifiers Avi Kivity
  2009-09-16 12:45 ` [PATCH 2/4] KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area Avi Kivity
@ 2009-09-16 12:45 ` Avi Kivity
  2009-09-16 21:21   ` Marcelo Tosatti
  2009-09-16 12:45 ` [PATCH 4/4] KVM: VMX: Use " Avi Kivity
  2009-10-25 12:23 ` [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
  4 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 12:45 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

The various syscall-related MSRs are fairly expensive to switch.  Currently
we switch them on every vcpu preemption, which is far too often:

- if we're switching to a kernel thread (idle task, threaded interrupt,
  kernel-mode virtio server (vhost-net), for example) and back, then
  there's no need to switch those MSRs since kernel threads won't
  be exiting to userspace.

- if we're switching to another guest running an identical OS, most likely
  those MSRs will have the same value, so there's little point in reloading
  them.

- if we're running the same OS on the guest and host, the MSRs will have
  identical values and reloading is unnecessary.

This patch uses the new user return notifiers to implement last-minute
switching, and checks the msr values to avoid unnecessary reloading.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |    3 +
 arch/x86/kvm/Kconfig            |    1 +
 arch/x86/kvm/x86.c              |   79 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 45226f0..863bde8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -799,4 +799,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 
+void kvm_define_shared_msr(unsigned index, u32 msr);
+void kvm_set_shared_msr(unsigned index, u64 val);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571..4cd4983 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
+	select USER_RETURN_NOTIFIER
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d454d9..0e7c40c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,6 +37,7 @@
 #include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 #include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/kvm.h>
 #undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
@@ -87,6 +88,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
 int ignore_msrs = 0;
 module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+	int nr;
+	struct kvm_shared_msr {
+		u32 msr;
+		u64 value;
+	} msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+	struct user_return_notifier urn;
+	bool registered;
+	u64 current_value[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
 	{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +143,64 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+	unsigned slot;
+	struct kvm_shared_msr *global;
+	struct kvm_shared_msrs *locals
+		= container_of(urn, struct kvm_shared_msrs, urn);
+
+	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+		global = &shared_msrs_global.msrs[slot];
+		if (global->value != locals->current_value[slot]) {
+			wrmsrl(global->msr, global->value);
+			locals->current_value[slot] = global->value;
+		}
+	}
+	locals->registered = false;
+	user_return_notifier_unregister(urn);
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+	int cpu;
+	u64 value;
+
+	if (slot >= shared_msrs_global.nr)
+		shared_msrs_global.nr = slot + 1;
+	shared_msrs_global.msrs[slot].msr = msr;
+	rdmsrl_safe(msr, &value);
+	shared_msrs_global.msrs[slot].value = value;
+	for_each_online_cpu(cpu)
+		per_cpu(shared_msrs, cpu).current_value[slot] = value;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+	unsigned i;
+	struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs);
+
+	for (i = 0; i < shared_msrs_global.nr; ++i)
+		locals->current_value[i] = shared_msrs_global.msrs[i].value;
+}
+
+void kvm_set_shared_msr(unsigned slot, u64 value)
+{
+	struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+
+	if (value == smsr->current_value[slot])
+		return;
+	smsr->current_value[slot] = value;
+	wrmsrl(shared_msrs_global.msrs[slot].msr, value);
+	if (!smsr->registered) {
+		smsr->urn.on_user_return = kvm_on_user_return;
+		user_return_notifier_register(&smsr->urn);
+		smsr->registered = true;
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
 unsigned long segment_base(u16 selector)
 {
 	struct descriptor_table gdt;
@@ -4696,6 +4774,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 void kvm_arch_hardware_enable(void *garbage)
 {
+	kvm_shared_msr_cpu_online();
 	kvm_x86_ops->hardware_enable(garbage);
 }
 
-- 
1.6.4.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH 4/4] KVM: VMX: Use shared msr infrastructure
  2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
                   ` (2 preceding siblings ...)
  2009-09-16 12:45 ` [PATCH 3/4] KVM: x86 shared msr infrastructure Avi Kivity
@ 2009-09-16 12:45 ` Avi Kivity
  2009-10-25 12:23 ` [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
  4 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 12:45 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

Instead of reloading syscall MSRs on every preemption, use the new shared
msr infrastructure to reload them at the last possible minute (just before
exit to userspace).

Improves vcpu/idle/vcpu switches by about 2000 cycles (when EFER needs to be
reloaded as well).

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c |  108 +++++++++++++++++++--------------------------------
 1 files changed, 40 insertions(+), 68 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 547881a..8dae1dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -67,6 +67,11 @@ struct vmcs {
 	char data[0];
 };
 
+struct shared_msr_entry {
+	unsigned index;
+	u64 data;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct list_head      local_vcpus_link;
@@ -74,8 +79,7 @@ struct vcpu_vmx {
 	int                   launched;
 	u8                    fail;
 	u32                   idt_vectoring_info;
-	struct kvm_msr_entry *guest_msrs;
-	struct kvm_msr_entry *host_msrs;
+	struct shared_msr_entry *guest_msrs;
 	int                   nmsrs;
 	int                   save_nmsrs;
 	int                   msr_offset_efer;
@@ -89,7 +93,6 @@ struct vcpu_vmx {
 		u16           fs_sel, gs_sel, ldt_sel;
 		int           gs_ldt_reload_needed;
 		int           fs_reload_needed;
-		int           guest_efer_loaded;
 	} host_state;
 	struct {
 		int vm86_active;
@@ -176,6 +179,8 @@ static struct kvm_vmx_segment_field {
 	VMX_SEGMENT_FIELD(LDTR),
 };
 
+static u64 host_efer;
+
 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
 
 /*
@@ -190,22 +195,6 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static void load_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		wrmsrl(e[i].index, e[i].data);
-}
-
-static void save_msrs(struct kvm_msr_entry *e, int n)
-{
-	int i;
-
-	for (i = 0; i < n; ++i)
-		rdmsrl(e[i].index, e[i].data);
-}
-
 static inline int is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -348,7 +337,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 	int i;
 
 	for (i = 0; i < vmx->nmsrs; ++i)
-		if (vmx->guest_msrs[i].index == msr)
+		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
 			return i;
 	return -1;
 }
@@ -379,7 +368,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 			: : "a" (&operand), "c" (ext) : "cc", "memory");
 }
 
-static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
 
@@ -570,17 +559,15 @@ static void reload_tss(void)
 	load_TR_desc();
 }
 
-static void load_transition_efer(struct vcpu_vmx *vmx)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
 {
 	int efer_offset = vmx->msr_offset_efer;
-	u64 host_efer;
 	u64 guest_efer;
 	u64 ignore_bits;
 
 	if (efer_offset < 0)
-		return;
-	host_efer = vmx->host_msrs[efer_offset].data;
-	guest_efer = vmx->guest_msrs[efer_offset].data;
+		return false;
+	guest_efer = vmx->vcpu.arch.shadow_efer;
 
 	/*
 	 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
@@ -594,26 +581,18 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
 		ignore_bits &= ~(u64)EFER_SCE;
 #endif
 	if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
-		return;
+		return false;
 
-	vmx->host_state.guest_efer_loaded = 1;
 	guest_efer &= ~ignore_bits;
 	guest_efer |= host_efer & ignore_bits;
-	wrmsrl(MSR_EFER, guest_efer);
-	vmx->vcpu.stat.efer_reload++;
-}
-
-static void reload_host_efer(struct vcpu_vmx *vmx)
-{
-	if (vmx->host_state.guest_efer_loaded) {
-		vmx->host_state.guest_efer_loaded = 0;
-		load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
-	}
+	vmx->guest_msrs[efer_offset].data = guest_efer;
+	return true;
 }
 
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int i;
 
 	if (vmx->host_state.loaded)
 		return;
@@ -655,8 +634,8 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 	}
 #endif
-	load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	load_transition_efer(vmx);
+	for (i = 0; i < vmx->save_nmsrs; ++i)
+		kvm_set_shared_msr(i, vmx->guest_msrs[i].data);
 }
 
 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +663,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 		local_irq_restore(flags);
 	}
 	reload_tss();
-	save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
-	load_msrs(vmx->host_msrs, vmx->save_nmsrs);
-	reload_host_efer(vmx);
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
 		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
@@ -886,14 +862,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 #ifdef CONFIG_X86_64
 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 {
-	struct kvm_msr_entry tmp;
+	struct shared_msr_entry tmp;
 
 	tmp = vmx->guest_msrs[to];
 	vmx->guest_msrs[to] = vmx->guest_msrs[from];
 	vmx->guest_msrs[from] = tmp;
-	tmp = vmx->host_msrs[to];
-	vmx->host_msrs[to] = vmx->host_msrs[from];
-	vmx->host_msrs[from] = tmp;
 }
 #endif
 
@@ -904,15 +877,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
  */
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
-	int save_nmsrs;
+	int save_nmsrs, index;
 	unsigned long *msr_bitmap;
 
 	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
-		int index;
-
 		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
 		if (index >= 0)
 			move_msr_up(vmx, index, save_nmsrs++);
@@ -931,9 +902,11 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 			move_msr_up(vmx, index, save_nmsrs++);
 	}
 #endif
-	vmx->save_nmsrs = save_nmsrs;
+	vmx->msr_offset_efer = index = __find_msr_index(vmx, MSR_EFER);
+	if (index >= 0 && update_transition_efer(vmx))
+		move_msr_up(vmx, index, save_nmsrs++);
 
-	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+	vmx->save_nmsrs = save_nmsrs;
 
 	if (cpu_has_vmx_msr_bitmap()) {
 		if (is_long_mode(&vmx->vcpu))
@@ -975,7 +948,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 {
 	u64 data;
-	struct kvm_msr_entry *msr;
+	struct shared_msr_entry *msr;
 
 	if (!pdata) {
 		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -994,9 +967,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		vmx_load_host_state(to_vmx(vcpu));
 		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
 		break;
+#endif
 	case MSR_EFER:
 		return kvm_get_msr_common(vcpu, msr_index, pdata);
-#endif
 	case MSR_IA32_TSC:
 		data = guest_read_tsc();
 		break;
@@ -1010,6 +983,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	default:
+		vmx_load_host_state(to_vmx(vcpu));
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1031,7 +1005,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_msr_entry *msr;
+	struct shared_msr_entry *msr;
 	u64 host_tsc;
 	int ret = 0;
 
@@ -1543,8 +1517,10 @@ continue_rmode:
 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
 
+	if (!msr)
+		return;
 	vcpu->arch.shadow_efer = efer;
 	if (!msr)
 		return;
@@ -2383,10 +2359,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		if (wrmsr_safe(index, data_low, data_high) < 0)
 			continue;
 		data = data_low | ((u64)data_high << 32);
-		vmx->host_msrs[j].index = index;
-		vmx->host_msrs[j].reserved = 0;
-		vmx->host_msrs[j].data = data;
-		vmx->guest_msrs[j] = vmx->host_msrs[j];
+		vmx->guest_msrs[j].index = i;
+		vmx->guest_msrs[j].data = 0;
 		++vmx->nmsrs;
 	}
 
@@ -3774,7 +3748,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
 	vmx_free_vmcs(vcpu);
-	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3801,10 +3774,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto uninit_vcpu;
 	}
 
-	vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!vmx->host_msrs)
-		goto free_guest_msrs;
-
 	vmx->vmcs = alloc_vmcs();
 	if (!vmx->vmcs)
 		goto free_msrs;
@@ -3835,8 +3804,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 free_vmcs:
 	free_vmcs(vmx->vmcs);
 free_msrs:
-	kfree(vmx->host_msrs);
-free_guest_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu:
 	kvm_vcpu_uninit(&vmx->vcpu);
@@ -3986,7 +3953,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-	int r;
+	int r, i;
+
+	rdmsrl_safe(MSR_EFER, &host_efer);
+
+	for (i = 0; i < NR_VMX_MSR; ++i)
+		kvm_define_shared_msr(i, vmx_msr_index[i]);
 
 	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_io_bitmap_a)
-- 
1.6.4.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/4] KVM: x86 shared msr infrastructure
  2009-09-16 12:45 ` [PATCH 3/4] KVM: x86 shared msr infrastructure Avi Kivity
@ 2009-09-16 21:21   ` Marcelo Tosatti
  2009-09-16 21:24     ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Marcelo Tosatti @ 2009-09-16 21:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, kvm, linux-kernel

On Wed, Sep 16, 2009 at 03:45:33PM +0300, Avi Kivity wrote:
> The various syscall-related MSRs are fairly expensive to switch.  Currently
> we switch them on every vcpu preemption, which is far too often:
> 
> - if we're switching to a kernel thread (idle task, threaded interrupt,
>   kernel-mode virtio server (vhost-net), for example) and back, then
>   there's no need to switch those MSRs since kernel threasd won't
>   be exiting to userspace.
> 
> - if we're switching to another guest running an identical OS, most likely
>   those MSRs will have the same value, so there's little point in reloading
>   them.
> 
> - if we're running the same OS on the guest and host, the MSRs will have
>   identical values and reloading is unnecessary.
> 
> This patch uses the new user return notifiers to implement last-minute
> switching, and checks the msr values to avoid unnecessary reloading.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    3 +
>  arch/x86/kvm/Kconfig            |    1 +
>  arch/x86/kvm/x86.c              |   79 +++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 45226f0..863bde8 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -799,4 +799,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>  
> +void kvm_define_shared_msr(unsigned index, u32 msr);
> +void kvm_set_shared_msr(unsigned index, u64 val);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index b84e571..4cd4983 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -28,6 +28,7 @@ config KVM
>  	select HAVE_KVM_IRQCHIP
>  	select HAVE_KVM_EVENTFD
>  	select KVM_APIC_ARCHITECTURE
> +	select USER_RETURN_NOTIFIER
>  	---help---
>  	  Support hosting fully virtualized guest machines using hardware
>  	  virtualization extensions.  You will need a fairly recent
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1d454d9..0e7c40c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -37,6 +37,7 @@
>  #include <linux/iommu.h>
>  #include <linux/intel-iommu.h>
>  #include <linux/cpufreq.h>
> +#include <linux/user-return-notifier.h>
>  #include <trace/events/kvm.h>
>  #undef TRACE_INCLUDE_FILE
>  #define CREATE_TRACE_POINTS
> @@ -87,6 +88,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
>  int ignore_msrs = 0;
>  module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
>  
> +#define KVM_NR_SHARED_MSRS 16
> +
> +struct kvm_shared_msrs_global {
> +	int nr;
> +	struct kvm_shared_msr {
> +		u32 msr;
> +		u64 value;
> +	} msrs[KVM_NR_SHARED_MSRS];
> +};
> +
> +struct kvm_shared_msrs {
> +	struct user_return_notifier urn;
> +	bool registered;
> +	u64 current_value[KVM_NR_SHARED_MSRS];
> +};
> +
> +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;

Does this assume the MSRs in question are consistent across CPUs?

I guess that is not true with arch_prctl(ARCH_SET_GS/ARCH_GET_GS) ?


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 3/4] KVM: x86 shared msr infrastructure
  2009-09-16 21:21   ` Marcelo Tosatti
@ 2009-09-16 21:24     ` Avi Kivity
  0 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-16 21:24 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Ingo Molnar, kvm, linux-kernel

On 09/17/2009 12:21 AM, Marcelo Tosatti wrote:
>> +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
>>      
> Does this assume the MSRs in question are consistent across CPUs?
>    

Yes.  And they are.

> I guess that is not true with arch_prctl(ARCH_SET_GS/ARCH_GET_GS) ?
>    

These aren't shared MSRs and are saved and restored independently.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/4] core, x86: Add user return notifiers
  2009-09-16 12:45 ` [PATCH 1/4] core, x86: Add user return notifiers Avi Kivity
@ 2009-09-18 18:14   ` H. Peter Anvin
  2009-09-19  6:40     ` [PATCH v2] " Avi Kivity
  2009-09-19  6:40     ` [PATCH 1/4] " Avi Kivity
  0 siblings, 2 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-18 18:14 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, Marcelo Tosatti, kvm, linux-kernel

On 09/16/2009 05:45 AM, Avi Kivity wrote:
> Add a general per-cpu notifier that is called whenever the kernel is
> about to return to userspace.  The notifier uses a thread_info flag
> and existing checks, so there is no impact on user return or context
> switch fast paths.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>

The concept of this patch looks entirely sane, as does the code,
*except* for:

+#include <linux/user-return-notifier.h>

... missing header file, and ...

+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o

... missing source file.

Could you fix the patch, please?

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v2] core, x86: Add user return notifiers
  2009-09-18 18:14   ` H. Peter Anvin
@ 2009-09-19  6:40     ` Avi Kivity
  2009-09-22  9:25       ` Avi Kivity
  2009-11-02 10:12       ` [tip:x86/entry] " tip-bot for Avi Kivity
  2009-09-19  6:40     ` [PATCH 1/4] " Avi Kivity
  1 sibling, 2 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-19  6:40 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, kvm, Marcelo Tosatti

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

Signed-off-by: Avi Kivity <avi@redhat.com>
---

v2: include new files in patch


 arch/Kconfig                         |   10 +++++++
 arch/x86/Kconfig                     |    1 +
 arch/x86/include/asm/thread_info.h   |    7 +++-
 arch/x86/kernel/process.c            |    2 +
 arch/x86/kernel/signal.c             |    3 ++
 include/linux/user-return-notifier.h |   42 +++++++++++++++++++++++++++++++
 kernel/Makefile                      |    1 +
 kernel/user-return-notifier.c        |   46 ++++++++++++++++++++++++++++++++++
 8 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/user-return-notifier.h
 create mode 100644 kernel/user-return-notifier.c

diff --git a/arch/Kconfig b/arch/Kconfig
index beea3cc..b1d0757 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -82,6 +82,13 @@ config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+	bool
+	depends on HAVE_USER_RETURN_NOTIFIER
+	help
+	  Provide a kernel-internal notification when a cpu is about to
+	  switch to user mode.
+
 config HAVE_IOREMAP_PROT
 	bool
 
@@ -125,4 +132,7 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_USER_RETURN_NOTIFIER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fc20fdc..ed21d6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..375c917 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a..7ea6972 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -227,6 +228,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 81e5823..13aa99c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		if (current->replacement_session_keyring)
 			key_replace_session_keyring();
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 0000000..ef04e2e
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+	void (*on_user_return)(struct user_return_notifier *urn);
+	struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+	if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+		clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+		set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+	}
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 961379c..f6abe84 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -98,6 +98,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
+obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 0000000..530ccb8
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+	hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and from the same cpu registration occurred in.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+	hlist_del(&urn->link);
+	if (hlist_empty(&URN_LIST_HEAD))
+		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/* Calls registered user return notifiers */
+void fire_user_return_notifiers(void)
+{
+	struct user_return_notifier *urn;
+	struct hlist_node *tmp1, *tmp2;
+	struct hlist_head *head;
+
+	head = &get_cpu_var(return_notifier_list);
+	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+		urn->on_user_return(urn);
+	put_cpu_var(return_notifier_list);
+}
-- 
1.6.4.1


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH 1/4] core, x86: Add user return notifiers
  2009-09-18 18:14   ` H. Peter Anvin
  2009-09-19  6:40     ` [PATCH v2] " Avi Kivity
@ 2009-09-19  6:40     ` Avi Kivity
  1 sibling, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-19  6:40 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: Ingo Molnar, Marcelo Tosatti, kvm, linux-kernel

On 09/18/2009 09:14 PM, H. Peter Anvin wrote:
> On 09/16/2009 05:45 AM, Avi Kivity wrote:
>    
>> Add a general per-cpu notifier that is called whenever the kernel is
>> about to return to userspace.  The notifier uses a thread_info flag
>> and existing checks, so there is no impact on user return or context
>> switch fast paths.
>>
>> Signed-off-by: Avi Kivity<avi@redhat.com>
>>      
> The concept of this patch looks entirely sane, as does the code,
> *except* for:
>
> +#include<linux/user-return-notifier.h>
>
> ... missing header file, and ...
>
> +obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
>
> ... missing source file.
>
> Could you fix the patch, please?
>
>    

Doh.  Sent out fixed patch.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-19  6:40     ` [PATCH v2] " Avi Kivity
@ 2009-09-22  9:25       ` Avi Kivity
  2009-09-22  9:37         ` Arjan van de Ven
  2009-09-22 14:32         ` Ingo Molnar
  2009-11-02 10:12       ` [tip:x86/entry] " tip-bot for Avi Kivity
  1 sibling, 2 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-22  9:25 UTC (permalink / raw)
  To: Ingo Molnar, H. Peter Anvin; +Cc: linux-kernel, kvm, Marcelo Tosatti

On 09/19/2009 09:40 AM, Avi Kivity wrote:
> Add a general per-cpu notifier that is called whenever the kernel is
> about to return to userspace.  The notifier uses a thread_info flag
> and existing checks, so there is no impact on user return or context
> switch fast paths.
>    

Ingo/Peter?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22  9:25       ` Avi Kivity
@ 2009-09-22  9:37         ` Arjan van de Ven
  2009-09-22  9:48           ` Avi Kivity
  2009-09-22 14:32         ` Ingo Molnar
  1 sibling, 1 reply; 29+ messages in thread
From: Arjan van de Ven @ 2009-09-22  9:37 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On Tue, 22 Sep 2009 12:25:33 +0300
Avi Kivity <avi@redhat.com> wrote:

> On 09/19/2009 09:40 AM, Avi Kivity wrote:
> > Add a general per-cpu notifier that is called whenever the kernel is
> > about to return to userspace.  The notifier uses a thread_info flag
> > and existing checks, so there is no impact on user return or context
> > switch fast paths.
> >    
> 
> Ingo/Peter?

isn't this like really expensive when used ?



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22  9:37         ` Arjan van de Ven
@ 2009-09-22  9:48           ` Avi Kivity
  0 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-22  9:48 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 12:37 PM, Arjan van de Ven wrote:
> On Tue, 22 Sep 2009 12:25:33 +0300
> Avi Kivity<avi@redhat.com>  wrote:
>
>    
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>      
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace.  The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>
>>>        
>> Ingo/Peter?
>>      
> isn't this like really expensive when used ?
>    

No, why?  It triggers a call to do_notify_resume() and walks a list of 
length 1.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22  9:25       ` Avi Kivity
  2009-09-22  9:37         ` Arjan van de Ven
@ 2009-09-22 14:32         ` Ingo Molnar
  2009-09-22 14:45           ` Avi Kivity
                             ` (3 more replies)
  1 sibling, 4 replies; 29+ messages in thread
From: Ingo Molnar @ 2009-09-22 14:32 UTC (permalink / raw)
  To: Avi Kivity, Peter Zijlstra
  Cc: H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti


* Avi Kivity <avi@redhat.com> wrote:

> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>> Add a general per-cpu notifier that is called whenever the kernel is
>> about to return to userspace.  The notifier uses a thread_info flag
>> and existing checks, so there is no impact on user return or context
>> switch fast paths.
>>    
>
> Ingo/Peter?

Would be nice to convert some existing open-coded return-to-user-space 
logic to this facility. One such candidate would be lockdep_sys_exit?

	Ingo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 14:32         ` Ingo Molnar
@ 2009-09-22 14:45           ` Avi Kivity
  2009-09-22 15:50             ` Avi Kivity
  2009-09-22 15:19           ` H. Peter Anvin
                             ` (2 subsequent siblings)
  3 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-09-22 14:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 05:32 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
>    
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>      
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace.  The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>
>>>        
>> Ingo/Peter?
>>      
> Would be nice to convert some existing open-coded return-to-user-space
> logic to this facility. One such candidate would be lockdep_sys_exit?
>    

I only implemented this for x86, while lockdep is arch independent.  If 
arch support is added, it should be trivial.

I think perf counters could use preempt notifiers though, these are arch 
independent.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 14:32         ` Ingo Molnar
  2009-09-22 14:45           ` Avi Kivity
@ 2009-09-22 15:19           ` H. Peter Anvin
  2009-10-01 15:21             ` Avi Kivity
  2009-09-22 16:50           ` Peter Zijlstra
  2009-09-22 18:06           ` Peter Zijlstra
  3 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-22 15:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, Peter Zijlstra, linux-kernel, kvm, Marcelo Tosatti

Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
> 
>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>> Add a general per-cpu notifier that is called whenever the kernel is
>>> about to return to userspace.  The notifier uses a thread_info flag
>>> and existing checks, so there is no impact on user return or context
>>> switch fast paths.
>>>    
>> Ingo/Peter?
> 
> Would be nice to convert some existing open-coded return-to-user-space 
> logic to this facility. One such candidate would be lockdep_sys_exit?
> 
> 	Ingo

Sorry, limited bandwidth due to LinuxCon, but I like the concept, and 
the previous (partial) patch was really clean.  I agree with Ingo that 
arch support so we can use this as a general facility would be nice, but 
I don't consider that as a prerequisite for merging.

	-hpa


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 14:45           ` Avi Kivity
@ 2009-09-22 15:50             ` Avi Kivity
  2009-09-22 17:08               ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-09-22 15:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 05:45 PM, Avi Kivity wrote:
>> Would be nice to convert some existing open-coded return-to-user-space
>> logic to this facility. One such candidate would be lockdep_sys_exit?
>
> I only implemented this for x86, while lockdep is arch independent.  
> If arch support is added, it should be trivial.
>

The lockdep_sys_exit bit is actually x86/s390 only, and can easily be 
adapted to use the new functionality on x86 only.  I'll try it out.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 14:32         ` Ingo Molnar
  2009-09-22 14:45           ` Avi Kivity
  2009-09-22 15:19           ` H. Peter Anvin
@ 2009-09-22 16:50           ` Peter Zijlstra
  2009-09-22 16:52             ` Avi Kivity
  2009-09-22 18:06           ` Peter Zijlstra
  3 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2009-09-22 16:50 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On Tue, 2009-09-22 at 16:32 +0200, Ingo Molnar wrote:
> * Avi Kivity <avi@redhat.com> wrote:
> 
> > On 09/19/2009 09:40 AM, Avi Kivity wrote:
> >> Add a general per-cpu notifier that is called whenever the kernel is
> >> about to return to userspace.  The notifier uses a thread_info flag
> >> and existing checks, so there is no impact on user return or context
> >> switch fast paths.
> >>    
> >
> > Ingo/Peter?
> 
> Would be nice to convert some existing open-coded return-to-user-space 
> logic to this facility. One such candidate would be lockdep_sys_exit?

And here I was thinking this was one of the hottest code paths in the
whole kernel...


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 16:50           ` Peter Zijlstra
@ 2009-09-22 16:52             ` Avi Kivity
  2009-09-22 16:55               ` Peter Zijlstra
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-09-22 16:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 07:50 PM, Peter Zijlstra wrote:
>> Would be nice to convert some existing open-coded return-to-user-space
>> logic to this facility. One such candidate would be lockdep_sys_exit?
>>      
> And here I was thinking this was one of the hottest code paths in the
> whole kernel...
>    

If you're using lockdep, surely that's not your biggest worry?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 16:52             ` Avi Kivity
@ 2009-09-22 16:55               ` Peter Zijlstra
  2009-09-22 17:05                 ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: Peter Zijlstra @ 2009-09-22 16:55 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On Tue, 2009-09-22 at 19:52 +0300, Avi Kivity wrote:
> On 09/22/2009 07:50 PM, Peter Zijlstra wrote:
> >> Would be nice to convert some existing open-coded return-to-user-space
> >> logic to this facility. One such candidate would be lockdep_sys_exit?
> >>      
> > And here I was thinking this was one of the hottest code paths in the
> > whole kernel...
> >    
> 
> If you're using lockdep, surely that's not your biggest worry?

No, but that's all under #ifdef and fully disappears when not enabled.
Generic return-to-user notifiers don't sound like they will though.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 16:55               ` Peter Zijlstra
@ 2009-09-22 17:05                 ` Avi Kivity
  0 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-22 17:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 07:55 PM, Peter Zijlstra wrote:
>> If you're using lockdep, surely that's not your biggest worry?
>>      
> No, but that's all under #ifdef and fully disappears when not enabled.
> Generic return-tu-user notifiers don't sound like they will though.
>    

They will if not selected.  If selected and not armed, they will have 
zero runtime impact since they piggyback on existing branches 
(_TIF_DO_NOTIFY_MASK and near relatives).  If selected and armed they'll 
cause __switch_to_xtra() on every context switch and do_notity_resume() 
on syscall exit until disarmed, but then you've asked for it.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 15:50             ` Avi Kivity
@ 2009-09-22 17:08               ` Avi Kivity
  0 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-09-22 17:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 06:50 PM, Avi Kivity wrote:
> On 09/22/2009 05:45 PM, Avi Kivity wrote:
>>> Would be nice to convert some existing open-coded return-to-user-space
>>> logic to this facility. One such candidate would be lockdep_sys_exit?
>>
>> I only implemented this for x86, while lockdep is arch independent.  
>> If arch support is added, it should be trivial.
>>
>
> The lockdep_sys_exit bit is actually x86/s390 only, and can easily be 
> adapted to use the new functionality on x86 only.  I'll try it out.

Unfortunately it doesn't work out well.  The notifier is called until 
explicitly unregistered (since it relies on a bit in TIF_NOTIFY_MASK), 
so we have to disarm it on the first return to userspace or it spins 
forever.  We could re-arm it on the next kernel entry, but we don't have 
a kernel entry notifier so we'll just be moving hooks from one point to 
another.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 14:32         ` Ingo Molnar
                             ` (2 preceding siblings ...)
  2009-09-22 16:50           ` Peter Zijlstra
@ 2009-09-22 18:06           ` Peter Zijlstra
  3 siblings, 0 replies; 29+ messages in thread
From: Peter Zijlstra @ 2009-09-22 18:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, H. Peter Anvin, linux-kernel, kvm, Marcelo Tosatti

On Tue, 2009-09-22 at 16:32 +0200, Ingo Molnar wrote:

> Would be nice to convert some existing open-coded return-to-user-space 
> logic to this facility. One such candidate would be lockdep_sys_exit?

I don't really like lockdep_sys_exit() in such a call,
lockdep_sys_exit() is currently placed such that it will execute as the
last C code before returning to user-space.

Put it any earlier and you've got a window where people can leak a lock
to userspace undetected.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-09-22 15:19           ` H. Peter Anvin
@ 2009-10-01 15:21             ` Avi Kivity
  2009-10-01 15:25               ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-10-01 15:21 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Peter Zijlstra, linux-kernel, kvm, Marcelo Tosatti

On 09/22/2009 06:19 PM, H. Peter Anvin wrote:
> Ingo Molnar wrote:
>> * Avi Kivity <avi@redhat.com> wrote:
>>
>>> On 09/19/2009 09:40 AM, Avi Kivity wrote:
>>>> Add a general per-cpu notifier that is called whenever the kernel is
>>>> about to return to userspace.  The notifier uses a thread_info flag
>>>> and existing checks, so there is no impact on user return or context
>>>> switch fast paths.
>>> Ingo/Peter?
>>
>> Would be nice to convert some existing open-coded 
>> return-to-user-space logic to this facility. One such candidate would 
>> be lockdep_sys_exit?
>>
>>     Ingo
>
> Sorry, limited bandwidth due to LinuxCon, but I like the concept, and 
> the previous (partial) patch was really clean.  I agree with Ingo that 
> arch support so we can use this as a general facility would be nice, 
> but I don't consider that as a prerequisite for merging.

Re-ping?

If accepted, please merge just the core patch and I will carry all of 
them in parallel.  Once tip is merged I'll drop my copy of the first patch.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-10-01 15:21             ` Avi Kivity
@ 2009-10-01 15:25               ` H. Peter Anvin
  2009-10-01 15:30                 ` Avi Kivity
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2009-10-01 15:25 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, Peter Zijlstra, linux-kernel, kvm, Marcelo Tosatti

On 10/01/2009 08:21 AM, Avi Kivity wrote:
> 
> Re-ping?
> 
> If accepted, please merge just the core patch and I will carry all of
> them in parallel.  Once tip is merged I'll drop my copy of the first patch.
> 

I was just talking to Ingo about this... would you consider this .32 or
.33 material at this time?  It came in awfully late for .32...

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-10-01 15:25               ` H. Peter Anvin
@ 2009-10-01 15:30                 ` Avi Kivity
  2009-10-01 18:33                   ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2009-10-01 15:30 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Peter Zijlstra, linux-kernel, kvm, Marcelo Tosatti

On 10/01/2009 05:25 PM, H. Peter Anvin wrote:
> On 10/01/2009 08:21 AM, Avi Kivity wrote:
>    
>> Re-ping?
>>
>> If accepted, please merge just the core patch and I will carry all of
>> them in parallel.  Once tip is merged I'll drop my copy of the first patch.
>>
>>      
> I was just talking to Ingo about this... would you consider this .32 or
> .33 material at his time?  It came in awfully late for .32...
>    

While I'd prefer it to be in .32, as there's much churn in both trees, I 
guess we should follow the process and schedule it for .33.

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v2] core, x86: Add user return notifiers
  2009-10-01 15:30                 ` Avi Kivity
@ 2009-10-01 18:33                   ` H. Peter Anvin
  0 siblings, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-10-01 18:33 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, Peter Zijlstra, linux-kernel, kvm, Marcelo Tosatti

On 10/01/2009 08:30 AM, Avi Kivity wrote:
> 
> While I'd prefer it to be in .32, as there's much churn in both trees, I 
> guess we should follow the process and schedule it for .33.
> 

OK, sounds like a plan.  I'll pull it into -tip.

	-hpa

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM
  2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
                   ` (3 preceding siblings ...)
  2009-09-16 12:45 ` [PATCH 4/4] KVM: VMX: Use " Avi Kivity
@ 2009-10-25 12:23 ` Avi Kivity
  4 siblings, 0 replies; 29+ messages in thread
From: Avi Kivity @ 2009-10-25 12:23 UTC (permalink / raw)
  To: Ingo Molnar, Marcelo Tosatti; +Cc: kvm, linux-kernel

On 09/16/2009 03:45 PM, Avi Kivity wrote:
> The various x86 syscall related MSRs (MSR_LSTAR and friends, EFER when SCE
> needs to be updated) are fairly expensive to read or write.  Since different
> operating systems can set different values for these MSRs, KVM needs to reload
> them when switching to a different guest or to the host.
>
> Switching on every guest entry/exit is too expensive, so KVM reloads on
> guest preemption, which is a lot rarer.  Even so, preemption based reload
> is suboptimal:
>
> - if we're switching to a kernel thread and back, there's no need to reload
>    the MSRs.  Examples of kernel threads we're likely to switch to are:
>
>    - the idle task
>    - a threaded interrupt handler
>    - a kernel-mode virtio server (vhost-net)
>
> - if we're switching to a guest running the same OS, the MSRs will have the
>    same values and there's no need to reload them
>
> - if the guest and host run the same OS, again the MSRs need not be reloaded.
>
> This patchset implements just-in-time reloads to defer them to the last
> possible instant.  When we do reload, we check whether the values have in
> fact changed and reload conditionally.
>
> For the just-in-time reloads the first patch implements "user return
> notifiers", a callback invoked just before return to userspace.  This has
> been written so that there is no code impact if KVM is not configured, and
> no runtime impact if KVM is not running.
>
> The patchset improves guest/idle/guest switches by about 2000 cycles.
>
>    

I've applied this to kvm.git master.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [tip:x86/entry] core, x86: Add user return notifiers
  2009-09-19  6:40     ` [PATCH v2] " Avi Kivity
  2009-09-22  9:25       ` Avi Kivity
@ 2009-11-02 10:12       ` tip-bot for Avi Kivity
  1 sibling, 0 replies; 29+ messages in thread
From: tip-bot for Avi Kivity @ 2009-11-02 10:12 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, tglx, avi

Commit-ID:  7c68af6e32c73992bad24107311f3433c89016e2
Gitweb:     http://git.kernel.org/tip/7c68af6e32c73992bad24107311f3433c89016e2
Author:     Avi Kivity <avi@redhat.com>
AuthorDate: Sat, 19 Sep 2009 09:40:22 +0300
Committer:  H. Peter Anvin <hpa@zytor.com>
CommitDate: Thu, 1 Oct 2009 12:12:18 -0700

core, x86: Add user return notifiers

Add a general per-cpu notifier that is called whenever the kernel is
about to return to userspace.  The notifier uses a thread_info flag
and existing checks, so there is no impact on user return or context
switch fast paths.

This will be used initially to speed up KVM task switching by lazily
updating MSRs.

Signed-off-by: Avi Kivity <avi@redhat.com>
LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/Kconfig                         |   10 +++++++
 arch/x86/Kconfig                     |    1 +
 arch/x86/include/asm/thread_info.h   |    7 +++-
 arch/x86/kernel/process.c            |    2 +
 arch/x86/kernel/signal.c             |    3 ++
 include/linux/user-return-notifier.h |   42 +++++++++++++++++++++++++++++++
 kernel/user-return-notifier.c        |   46 ++++++++++++++++++++++++++++++++++
 7 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 7f418bb..4e312ff 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -83,6 +83,13 @@ config KRETPROBES
 	def_bool y
 	depends on KPROBES && HAVE_KRETPROBES
 
+config USER_RETURN_NOTIFIER
+	bool
+	depends on HAVE_USER_RETURN_NOTIFIER
+	help
+	  Provide a kernel-internal notification when a cpu is about to
+	  switch to user mode.
+
 config HAVE_IOREMAP_PROT
 	bool
 
@@ -126,4 +133,7 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
 	bool
 
+config HAVE_USER_RETURN_NOTIFIER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8da9374..1df175d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -50,6 +50,7 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_USER_RETURN_NOTIFIER
 
 config OUTPUT_FORMAT
 	string
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..375c917 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,6 +108,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -142,13 +144,14 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |	\
+	 _TIF_USER_RETURN_NOTIFY)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
 
-#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
 #define PREEMPT_ACTIVE		0x10000000
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..e51b056 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/user-return-notifier.h>
 #include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
+	propagate_user_return_notify(prev_p, next_p);
 }
 
 int sys_fork(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76..c49f90f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		if (current->replacement_session_keyring)
 			key_replace_session_keyring();
 	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
 
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
diff --git a/include/linux/user-return-notifier.h b/include/linux/user-return-notifier.h
new file mode 100644
index 0000000..b6ac056
--- /dev/null
+++ b/include/linux/user-return-notifier.h
@@ -0,0 +1,42 @@
+#ifndef _LINUX_USER_RETURN_NOTIFIER_H
+#define _LINUX_USER_RETURN_NOTIFIER_H
+
+#ifdef CONFIG_USER_RETURN_NOTIFIER
+
+#include <linux/list.h>
+#include <linux/sched.h>
+
+struct user_return_notifier {
+	void (*on_user_return)(struct user_return_notifier *urn);
+	struct hlist_node link;
+};
+
+
+void user_return_notifier_register(struct user_return_notifier *urn);
+void user_return_notifier_unregister(struct user_return_notifier *urn);
+
+/*
+ * Called from __switch_to_xtra() on context switch: if @prev has a
+ * user-return notification pending, carry the TIF_USER_RETURN_NOTIFY flag
+ * over to @next so the (per-cpu) notifiers still fire on the next return
+ * to userspace.
+ */
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+	if (test_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY)) {
+		clear_tsk_thread_flag(prev, TIF_USER_RETURN_NOTIFY);
+		set_tsk_thread_flag(next, TIF_USER_RETURN_NOTIFY);
+	}
+}
+
+void fire_user_return_notifiers(void);
+
+#else
+
+struct user_return_notifier {};
+
+static inline void propagate_user_return_notify(struct task_struct *prev,
+						struct task_struct *next)
+{
+}
+
+static inline void fire_user_return_notifiers(void) {}
+
+#endif
+
+#endif
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
new file mode 100644
index 0000000..530ccb8
--- /dev/null
+++ b/kernel/user-return-notifier.c
@@ -0,0 +1,46 @@
+
+#include <linux/user-return-notifier.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
+
+#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
+
+/*
+ * Request a notification when the current cpu returns to userspace.  Must be
+ * called in atomic context.  The notifier will also be called in atomic
+ * context.  Sets TIF_USER_RETURN_NOTIFY on current so the userspace-return
+ * path (do_notify_resume) knows to call fire_user_return_notifiers().
+ */
+void user_return_notifier_register(struct user_return_notifier *urn)
+{
+	set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+	hlist_add_head(&urn->link, &URN_LIST_HEAD);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_register);
+
+/*
+ * Removes a registered user return notifier.  Must be called from atomic
+ * context, and on the same cpu the registration occurred on.  Clears
+ * TIF_USER_RETURN_NOTIFY on current once the cpu's notifier list drains.
+ */
+void user_return_notifier_unregister(struct user_return_notifier *urn)
+{
+	hlist_del(&urn->link);
+	if (hlist_empty(&URN_LIST_HEAD))
+		clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
+
+/*
+ * Calls the registered user return notifiers for the current cpu.
+ * Runs in atomic context: get_cpu_var() disables preemption until the
+ * matching put_cpu_var() below.
+ */
+void fire_user_return_notifiers(void)
+{
+	struct user_return_notifier *urn;
+	struct hlist_node *tmp1, *tmp2;
+	struct hlist_head *head;
+
+	head = &get_cpu_var(return_notifier_list);
+	/* _safe iteration: a notifier may unregister itself in its callback */
+	hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link)
+		urn->on_user_return(urn);
+	/*
+	 * Bug fix: put_cpu_var() is a macro that takes the per-cpu variable
+	 * as its argument; the bare put_cpu_var() call would not compile.
+	 */
+	put_cpu_var(return_notifier_list);
+}

^ permalink raw reply related	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2009-11-02 10:12 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-16 12:45 [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity
2009-09-16 12:45 ` [PATCH 1/4] core, x86: Add user return notifiers Avi Kivity
2009-09-18 18:14   ` H. Peter Anvin
2009-09-19  6:40     ` [PATCH v2] " Avi Kivity
2009-09-22  9:25       ` Avi Kivity
2009-09-22  9:37         ` Arjan van de Ven
2009-09-22  9:48           ` Avi Kivity
2009-09-22 14:32         ` Ingo Molnar
2009-09-22 14:45           ` Avi Kivity
2009-09-22 15:50             ` Avi Kivity
2009-09-22 17:08               ` Avi Kivity
2009-09-22 15:19           ` H. Peter Anvin
2009-10-01 15:21             ` Avi Kivity
2009-10-01 15:25               ` H. Peter Anvin
2009-10-01 15:30                 ` Avi Kivity
2009-10-01 18:33                   ` H. Peter Anvin
2009-09-22 16:50           ` Peter Zijlstra
2009-09-22 16:52             ` Avi Kivity
2009-09-22 16:55               ` Peter Zijlstra
2009-09-22 17:05                 ` Avi Kivity
2009-09-22 18:06           ` Peter Zijlstra
2009-11-02 10:12       ` [tip:x86/entry] " tip-bot for Avi Kivity
2009-09-19  6:40     ` [PATCH 1/4] " Avi Kivity
2009-09-16 12:45 ` [PATCH 2/4] KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area Avi Kivity
2009-09-16 12:45 ` [PATCH 3/4] KVM: x86 shared msr infrastructure Avi Kivity
2009-09-16 21:21   ` Marcelo Tosatti
2009-09-16 21:24     ` Avi Kivity
2009-09-16 12:45 ` [PATCH 4/4] KVM: VMX: Use " Avi Kivity
2009-10-25 12:23 ` [PATCH 0/4] User return notifiers / just-in-time MSR switching for KVM Avi Kivity

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.