From mboxrd@z Thu Jan  1 00:00:00 1970
From: "Nadav Har'El" <nyh@il.ibm.com>
Subject: [PATCH 08/31] nVMX: Fix local_vcpus_link handling
Date: Mon, 16 May 2011 22:48:01 +0300
Message-ID: <201105161948.p4GJm1as001742@rice.haifa.ibm.com>
References: <1305575004-nyh@il.ibm.com>
Cc: gleb@redhat.com, avi@redhat.com
To: kvm@vger.kernel.org
Return-path: <kvm-owner@vger.kernel.org>
Received: from mtagate7.uk.ibm.com ([194.196.100.167]:53709 "EHLO
	mtagate7.uk.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751013Ab1EPTsF (ORCPT <rfc822;kvm@vger.kernel.org>);
	Mon, 16 May 2011 15:48:05 -0400
Received: from d06nrmr1707.portsmouth.uk.ibm.com (d06nrmr1707.portsmouth.uk.ibm.com [9.149.39.225])
	by mtagate7.uk.ibm.com (8.13.1/8.13.1) with ESMTP id p4GJm39W006686
	for <kvm@vger.kernel.org>; Mon, 16 May 2011 19:48:03 GMT
Received: from d06av05.portsmouth.uk.ibm.com (d06av05.portsmouth.uk.ibm.com [9.149.37.229])
	by d06nrmr1707.portsmouth.uk.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id p4GJm3lY1925282
	for <kvm@vger.kernel.org>; Mon, 16 May 2011 20:48:03 +0100
Received: from d06av05.portsmouth.uk.ibm.com (loopback [127.0.0.1])
	by d06av05.portsmouth.uk.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id p4GJm2go021086
	for <kvm@vger.kernel.org>; Mon, 16 May 2011 13:48:03 -0600
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
because (at least in theory) the processor might not have written all of its
content back to memory. Since a patch from June 26, 2008, this is done using
a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.

The problem is that with nested VMX, we no longer have the concept of a
vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
L2s), and each of those may be have been last loaded on a different cpu.

Our solution is to hold, in addition to vcpus_on_cpu, a second linked list
saved_vmcss_on_cpu, which holds the current list of "saved" VMCSs, i.e.,
VMCSs which are loaded on this CPU but are not the vmx->vmcs of any of
the vcpus. These saved VMCSs include L1's VMCS while L2 is running
(saved_vmcs01), and L2 VMCSs not currently used - because L1 is running or
because the vmcs02_pool contains more than one entry.

When we will switch between L1's and L2's VMCSs, they need to be moved
between vcpus_on_cpu and saved_vmcs_on_cpu lists and vice versa. A new
function, nested_maintain_per_cpu_lists(), takes care of that.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
---
 arch/x86/kvm/vmx.c |   67 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

--- .before/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
+++ .after/arch/x86/kvm/vmx.c	2011-05-16 22:36:47.000000000 +0300
@@ -181,6 +181,7 @@ struct saved_vmcs {
 	struct vmcs *vmcs;
 	int cpu;
 	int launched;
+	struct list_head local_saved_vmcss_link; /* see saved_vmcss_on_cpu */
 };
 
 /* Used to remember the last vmcs02 used for some recently used vmcs12s */
@@ -315,7 +316,20 @@ static int vmx_set_tss_addr(struct kvm *
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked-list vcpus_on_cpu, holding for each CPU a list
+ * of vcpus whose VMCS are loaded on that CPU. This is needed when a CPU is
+ * brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ *
+ * With nested VMX, we have additional VMCSs which are not the current
+ * vmx->vmcs of any vcpu, but may also be loaded on some CPU: While L2 is
+ * running, L1's VMCS is loaded but not the VMCS of any vcpu; While L1 is
+ * running, a previously used L2 VMCS might still be around and loaded on some
+ * CPU, somes even more than one such L2 VMCSs is kept (see VMCS02_POOL_SIZE).
+ * The list of these additional VMCSs is kept on cpu saved_vmcss_on_cpu.
+ */
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct list_head, saved_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -1818,6 +1832,7 @@ static int hardware_enable(void *garbage
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(saved_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1860,10 +1875,13 @@ static void kvm_cpu_vmxoff(void)
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 }
 
+static void vmclear_local_saved_vmcss(void);
+
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
 		vmclear_local_vcpus();
+		vmclear_local_saved_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -4248,6 +4266,8 @@ static void __nested_free_saved_vmcs(voi
 	vmcs_clear(saved_vmcs->vmcs);
 	if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs)
 		per_cpu(current_vmcs, saved_vmcs->cpu) = NULL;
+	list_del(&saved_vmcs->local_saved_vmcss_link);
+	saved_vmcs->cpu = -1;
 }
 
 /*
@@ -4265,6 +4285,21 @@ static void nested_free_saved_vmcs(struc
 	free_vmcs(saved_vmcs->vmcs);
 }
 
+/*
+ * VMCLEAR all the currently unused (not vmx->vmcs on any vcpu) saved_vmcss
+ * which were loaded on the current CPU. See also vmclear_load_vcpus(), which
+ * does the same for VMCS currently used in vcpus.
+ */
+static void vmclear_local_saved_vmcss(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct saved_vmcs *v, *n;
+
+	list_for_each_entry_safe(v, n, &per_cpu(saved_vmcss_on_cpu, cpu),
+				 local_saved_vmcss_link)
+		__nested_free_saved_vmcs(v);
+}
+
 /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
 static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
 {
@@ -5143,6 +5178,38 @@ static void vmx_set_supported_cpuid(u32 
 {
 }
 
+/*
+ * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and
+ * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1).
+ *
+ * nested_maintain_per_cpu_lists should be called after the VMCS was switched
+ * to the new one, with parameters giving both the new on (after the entry
+ * or exit) and the old one, in that order.
+ */
+static void nested_maintain_per_cpu_lists(struct vcpu_vmx *vmx,
+		struct saved_vmcs *new_vmcs,
+		struct saved_vmcs *old_vmcs)
+{
+	/*
+	 * When a vcpus's old vmcs is saved, we need to drop it from
+	 * vcpus_on_cpu and put it on saved_vmcss_on_cpu.
+	 */
+	if (old_vmcs->cpu != -1) {
+		list_del(&vmx->local_vcpus_link);
+		list_add(&old_vmcs->local_saved_vmcss_link,
+			 &per_cpu(saved_vmcss_on_cpu, old_vmcs->cpu));
+	}
+	/*
+	 * When a saved vmcs becomes a vcpu's new vmcs, we need to drop it
+	 * from saved_vmcss_on_cpu and put it on vcpus_on_cpu.
+	 */
+	if (new_vmcs->cpu != -1) {
+		list_del(&new_vmcs->local_saved_vmcss_link);
+		list_add(&vmx->local_vcpus_link,
+			 &per_cpu(vcpus_on_cpu, new_vmcs->cpu));
+	}
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)