From: KarimAllah Ahmed <karahmed@amazon.de>
To: kvm@vger.kernel.org, x86@kernel.org
Cc: KarimAllah Ahmed <karahmed@amazon.de>
Subject: [RFC 1/2] KVM/nVMX: Cleanly exit from L2 to L1 on user-space exit
Date: Fri, 16 Feb 2018 15:23:49 +0100
Message-ID: <1518791030-31765-2-git-send-email-karahmed@amazon.de>
In-Reply-To: <1518791030-31765-1-git-send-email-karahmed@amazon.de>

On exit to L0 user-space, always exit from L2 to L1 and synchronize the
state properly for L1. This ensures that user-space only ever sees L1
state. It also allows L1 to be saved and resumed properly. Obviously,
horrible things will still happen to the L2 guest; this will be handled
in a separate patch.

There is only a single case that requires a bit of extra care: when the
decision to switch to user space happens while handling an L1
VMRESUME/VMLAUNCH (i.e. nested_run_pending is set). In order to handle
this as cleanly as possible without major restructuring, we simply do
not exit to user-space in this case and give L2 another chance to
actually run. We also request an immediate exit to ensure that an exit
to user space will still happen for the L2.
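
For reference, the immediate exit request is serviced just before VM
entry in vcpu_enter_guest() (the "if (req_immediate_exit)" line is
visible as context in the x86.c hunk below); at the time of this patch
it amounts to a self-IPI, roughly:

	if (req_immediate_exit) {
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		smp_send_reschedule(vcpu->cpu);
	}

The IPI forces a VM exit almost immediately after VM entry, so the
still-pending signal is noticed on the next iteration of vcpu_run(),
this time outside the nested_run_pending window.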

The only reason I can see for an exit to user space occurring while L2
is running is a pending signal; this is how user space preempts KVM_RUN
in order to save the state. L2 exits are either handled in the L0
kernel or reflected to L1, and are never handled in L0 user-space.
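
As a concrete example of that flow, a VMM's save path might look like
the (hypothetical, heavily simplified) sketch below; the signal choice,
thread layout, and helper names are assumptions for illustration, not
part of this patch:

	#include <errno.h>
	#include <pthread.h>
	#include <signal.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* No-op handler; delivery alone makes KVM_RUN return -EINTR. */
	static void sig_ipi(int sig) { }

	/* vCPU thread: run the guest until preempted by the signal. */
	static void *vcpu_thread(void *arg)
	{
		int vcpu_fd = *(int *)arg;
		struct kvm_regs regs;

		signal(SIGUSR1, sig_ipi);
		while (ioctl(vcpu_fd, KVM_RUN, 0) == 0)
			; /* dispatch on kvm_run->exit_reason here */
		if (errno == EINTR)
			/* With this patch, only L1 state is visible. */
			ioctl(vcpu_fd, KVM_GET_REGS, &regs);
		return NULL;
	}

	/* Control thread: preempt KVM_RUN to snapshot the vCPU. */
	static void request_save(pthread_t vcpu_tid)
	{
		pthread_kill(vcpu_tid, SIGUSR1);
	}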

Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/vmx.c              | 39 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 33 ++++++++++++++++++++++++++++-----
 3 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 318a414..2c8be56 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -961,6 +961,8 @@ struct kvm_x86_ops {
 			      struct msr_bitmap_range *whitelist);
 
 	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+	void (*prepare_exit_user)(struct kvm_vcpu *vcpu);
+	bool (*allow_exit_user)(struct kvm_vcpu *vcpu);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 52539be..22eb0dc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2130,6 +2130,42 @@ static unsigned long segment_base(u16 selector)
 }
 #endif
 
+static bool vmx_allow_exit_user(struct kvm_vcpu *vcpu)
+{
+	return !to_vmx(vcpu)->nested.nested_run_pending;
+}
+
+static void vmx_prepare_exit_user(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.current_vmptr == -1ull)
+		return;
+
+	/*
+	 * If L2 is running, there is no need to update vmcs12 from the
+	 * shadow VMCS; just force an exit from L2 to L1.
+	 */
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * Pretend that an external interrupt occurred while L2 is
+		 * running to cleanly exit into L1.
+		 */
+		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+		/* Switch from L2 MMU to L1 MMU */
+		kvm_mmu_reset_context(vcpu);
+	} else if (enable_shadow_vmcs) {
+		copy_shadow_to_vmcs12(vmx);
+	}
+
+	/* Flush VMCS12 to guest memory */
+	kvm_write_guest(vcpu->kvm, vmx->nested.current_vmptr,
+			get_vmcs12(vcpu), sizeof(*vmx->nested.cached_vmcs12));
+
+	return;
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -12440,6 +12476,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.whitelist_msrs = vmx_whitelist_msrs,
 
+	.prepare_exit_user = vmx_prepare_exit_user,
+	.allow_exit_user = vmx_allow_exit_user,
+
 	.prepare_guest_switch = vmx_save_host_state,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2cfbf39..8256a2d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -996,6 +996,12 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_rdpmc);
 
+static __always_inline bool should_exit_user(struct kvm_vcpu *vcpu)
+{
+	return signal_pending(current) && (kvm_x86_ops->allow_exit_user ?
+					   kvm_x86_ops->allow_exit_user(vcpu) : true);
+}
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -7187,8 +7193,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
 		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
-	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
-	    || need_resched() || signal_pending(current)) {
+	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched()) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
 		smp_wmb();
 		local_irq_enable();
@@ -7198,6 +7203,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		goto cancel_injection;
 	}
 
+	if (signal_pending(current)) {
+		if (kvm_x86_ops->allow_exit_user &&
+		    kvm_x86_ops->allow_exit_user(vcpu)) {
+			vcpu->mode = OUTSIDE_GUEST_MODE;
+			smp_wmb();
+			local_irq_enable();
+			preempt_enable();
+			vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = 1;
+			goto cancel_injection;
+		} else
+			req_immediate_exit = true;
+	}
+
 	kvm_load_guest_xcr0(vcpu);
 
 	if (req_immediate_exit) {
@@ -7364,7 +7383,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 
 		kvm_check_async_pf_completion(vcpu);
 
-		if (signal_pending(current)) {
+		if (should_exit_user(vcpu)) {
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.signal_exits;
@@ -7506,11 +7525,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-	if (kvm_run->immediate_exit)
+	if (kvm_run->immediate_exit) {
 		r = -EINTR;
-	else
+	} else {
 		r = vcpu_run(vcpu);
 
+		if (kvm_x86_ops->prepare_exit_user)
+			kvm_x86_ops->prepare_exit_user(vcpu);
+	}
+
 out:
 	kvm_put_guest_fpu(vcpu);
 	post_kvm_run_save(vcpu);
-- 
2.7.4
