* [PATCH 0/2] Expose VMFUNC to the nested hypervisor
@ 2017-06-29 23:29 Bandan Das
  2017-06-29 23:29 ` [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor Bandan Das
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Bandan Das @ 2017-06-29 23:29 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, linux-kernel

These patches expose eptp switching/vmfunc to the nested hypervisor. Testing with
kvm-unit-tests seems to work ok.

If the guest hypervisor enables vmfunc/eptp switching, the address of a
"shadow" eptp list page is written to vmcs02. Initially, this list is
unpopulated, so the first vmfunc call from L2 causes a vmexit with exit
reason 59 (VMFUNC). handle_vmfunc() then rewrites vmcs12->ept_pointer from
the L1 eptp list and reloads the mmu to obtain a new root hpa. The resulting
shadow ept pointer is written to the shadow eptp list at the given index,
so a subsequent vmfunc call switching to that index succeeds without an
exit.
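
For reference, a minimal sketch of how a guest running under the (nested)
hypervisor would invoke the EPTP switching function once it is exposed
(illustrative only; the wrapper name and inline asm below are assumptions,
not part of these patches):

	/* VMFUNC is encoded as 0f 01 d4; EAX selects the function
	 * (0 = EPTP switching) and ECX the index into the EPTP list.
	 */
	static inline void vmfunc_switch_eptp(u32 index)
	{
		asm volatile(".byte 0x0f, 0x01, 0xd4"
			     : : "a" (0), "c" (index) : "memory");
	}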

Bandan Das (2):
  KVM: nVMX: Implement EPTP switching for the L1 hypervisor
  KVM: nVMX: Advertise VMFUNC to L1 hypervisor

 arch/x86/include/asm/vmx.h |   9 ++++
 arch/x86/kvm/vmx.c         | 122 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)

-- 
2.9.4


* [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
  2017-06-29 23:29 [PATCH 0/2] Expose VMFUNC to the nested hypervisor Bandan Das
@ 2017-06-29 23:29 ` Bandan Das
  2017-06-30  7:29   ` Paolo Bonzini
  2017-06-29 23:29 ` [PATCH 2/2] KVM: nVMX: Advertise VMFUNC to " Bandan Das
  2017-06-30 17:06 ` [PATCH 0/2] Expose VMFUNC to the nested hypervisor Jim Mattson
  2 siblings, 1 reply; 9+ messages in thread
From: Bandan Das @ 2017-06-29 23:29 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, linux-kernel

This is a mix of emulation and passthrough to implement EPTP
switching for the nested hypervisor.

If the shadow EPT structures are absent, a vmexit occurs with exit
reason 59 (VMFUNC). L0 then builds shadow structures for the entry
the guest invoked vmfunc with, obtains a new root_hpa that is
written to the shadow list, and reloads the mmu to resume L2.
On the next vmfunc(0, index), however, the processor loads the
entry directly without an exit.

Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |   5 +++
 arch/x86/kvm/vmx.c         | 104 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 35cd06f..e06783e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,6 +72,7 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_RDRAND			0x00000800
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC            0x00002000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_RDSEED			0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
@@ -114,6 +115,10 @@
 #define VMX_MISC_SAVE_EFER_LMA			0x00000020
 #define VMX_MISC_ACTIVITY_HLT			0x00000040
 
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
+#define VMFUNC_EPTP_ENTRIES  512
+
 static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
 {
 	return vmx_basic & GENMASK_ULL(30, 0);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b9..75049c0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -240,11 +240,13 @@ struct __packed vmcs12 {
 	u64 virtual_apic_page_addr;
 	u64 apic_access_addr;
 	u64 posted_intr_desc_addr;
+	u64 vm_function_control;
 	u64 ept_pointer;
 	u64 eoi_exit_bitmap0;
 	u64 eoi_exit_bitmap1;
 	u64 eoi_exit_bitmap2;
 	u64 eoi_exit_bitmap3;
+	u64 eptp_list_address;
 	u64 xss_exit_bitmap;
 	u64 guest_physical_address;
 	u64 vmcs_link_pointer;
@@ -441,6 +443,7 @@ struct nested_vmx {
 	struct page *apic_access_page;
 	struct page *virtual_apic_page;
 	struct page *pi_desc_page;
+	struct page *shadow_eptp_list;
 	struct pi_desc *pi_desc;
 	bool pi_pending;
 	u16 posted_intr_nv;
@@ -481,6 +484,7 @@ struct nested_vmx {
 	u64 nested_vmx_cr4_fixed0;
 	u64 nested_vmx_cr4_fixed1;
 	u64 nested_vmx_vmcs_enum;
+	u64 nested_vmx_vmfunc_controls;
 };
 
 #define POSTED_INTR_ON  0
@@ -1314,6 +1318,22 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
 		SECONDARY_EXEC_TSC_SCALING;
 }
 
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+	return vmcs_config.cpu_based_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
+static inline u64 vmx_vmfunc_controls(void)
+{
+	u64 controls = 0;
+
+	if (cpu_has_vmx_vmfunc())
+		rdmsrl(MSR_IA32_VMX_VMFUNC, controls);
+
+	return controls;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -1388,6 +1408,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 }
 
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has_vmfunc(vmcs12) &&
+		(vmcs12->vm_function_control &
+		 VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -3143,6 +3175,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = vmx->nested.nested_vmx_ept_caps |
 			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
 		break;
+	case MSR_IA32_VMX_VMFUNC:
+		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
+		break;
 	default:
 		return 1;
 	}
@@ -6959,6 +6994,14 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 	}
 
+	if (vmx_vmfunc_controls() & 1) {
+		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+		if (!page)
+			goto out_shadow_vmcs;
+		vmx->nested.shadow_eptp_list = page;
+	}
+
 	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
 	vmx->nested.vmcs02_num = 0;
 
@@ -7128,6 +7171,11 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->vmcs01.shadow_vmcs = NULL;
 	}
 	kfree(vmx->nested.cached_vmcs12);
+
+	if (vmx->nested.shadow_eptp_list) {
+		__free_page(vmx->nested.shadow_eptp_list);
+		vmx->nested.shadow_eptp_list = NULL;
+	}
 	/* Unpin physical memory we referred to in current vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		nested_release_page(vmx->nested.apic_access_page);
@@ -7740,6 +7788,61 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12;
+	struct page *page = NULL,
+		*shadow_page = vmx->nested.shadow_eptp_list;
+	u64 *l1_eptp_list, *shadow_eptp_list;
+	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+	if (is_guest_mode(vcpu)) {
+		vmcs12 = get_vmcs12(vcpu);
+		if (!nested_cpu_has_ept(vmcs12) ||
+		    !nested_cpu_has_eptp_switching(vmcs12))
+			goto fail;
+
+		/*
+		 * Only function 0 is valid; everything up to 63 injects a
+		 * VMFUNC exit to L1, and anything beyond that raises #UD.
+		 */
+		if (function || !vmcs12->eptp_list_address ||
+		    index >= VMFUNC_EPTP_ENTRIES)
+			goto fail;
+
+		page = nested_get_page(vcpu, vmcs12->eptp_list_address);
+		if (!page)
+			goto fail;
+
+		l1_eptp_list = kmap(page);
+		if (!l1_eptp_list[index])
+			goto fail;
+
+		kvm_mmu_unload(vcpu);
+		/*
+		 * TODO: Verify that guest ept satisfies vmentry prereqs
+		 */
+		vmcs12->ept_pointer = l1_eptp_list[index];
+		shadow_eptp_list = phys_to_virt(page_to_phys(shadow_page));
+		kvm_mmu_reload(vcpu);
+		shadow_eptp_list[index] =
+			construct_eptp(vcpu->arch.mmu.root_hpa);
+		kunmap(page);
+
+		return kvm_skip_emulated_instruction(vcpu);
+	}
+
+fail:
+	if (page)
+		kunmap(page);
+	nested_vmx_vmexit(vcpu, vmx->exit_reason,
+			  vmcs_read32(VM_EXIT_INTR_INFO),
+			  vmcs_readl(EXIT_QUALIFICATION));
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7790,6 +7893,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
+	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
-- 
2.9.4


* [PATCH 2/2] KVM: nVMX: Advertise VMFUNC to L1 hypervisor
  2017-06-29 23:29 [PATCH 0/2] Expose VMFUNC to the nested hypervisor Bandan Das
  2017-06-29 23:29 ` [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor Bandan Das
@ 2017-06-29 23:29 ` Bandan Das
  2017-06-30 17:06 ` [PATCH 0/2] Expose VMFUNC to the nested hypervisor Jim Mattson
  2 siblings, 0 replies; 9+ messages in thread
From: Bandan Das @ 2017-06-29 23:29 UTC (permalink / raw)
  To: kvm; +Cc: pbonzini, linux-kernel

Advertise the VMFUNC capability and the EPTP switching function to
the L1 hypervisor. Change nested_vmx_exit_handled() to return false
for VMFUNC exits so that L0 handles them.

Signed-off-by: Bandan Das <bsd@redhat.com>
---
 arch/x86/include/asm/vmx.h |  4 ++++
 arch/x86/kvm/vmx.c         | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index e06783e..5f63a2e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -192,6 +192,8 @@ enum vmcs_field {
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
 	POSTED_INTR_DESC_ADDR           = 0x00002016,
 	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
+	VM_FUNCTION_CONTROL             = 0x00002018,
+	VM_FUNCTION_CONTROL_HIGH        = 0x00002019,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
@@ -202,6 +204,8 @@ enum vmcs_field {
 	EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
 	EOI_EXIT_BITMAP3                = 0x00002022,
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+	EPTP_LIST_ADDRESS               = 0x00002024,
+	EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
 	VMREAD_BITMAP                   = 0x00002026,
 	VMWRITE_BITMAP                  = 0x00002028,
 	XSS_EXIT_BITMAP                 = 0x0000202C,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 75049c0..bf06bef 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -767,11 +767,13 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
 	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
 	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+	FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
 	FIELD64(EPT_POINTER, ept_pointer),
 	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
 	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
 	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
 	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+	FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
 	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
 	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
 	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -2806,6 +2808,13 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
+	if (cpu_has_vmx_vmfunc()) {
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_ENABLE_VMFUNC;
+		vmx->nested.nested_vmx_vmfunc_controls =
+			vmx_vmfunc_controls() & 1;
+	}
+
 	/*
 	 * Old versions of KVM use the single-context version without
 	 * checking for support, so declare that it is supported even
@@ -8215,6 +8224,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_PML_FULL:
 		/* We emulate PML support to L1. */
 		return false;
+	case EXIT_REASON_VMFUNC:
+		return false;
 	default:
 		return true;
 	}
@@ -10309,6 +10320,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmx_flush_tlb_ept_only(vcpu);
 	}
 
+	if (nested_cpu_has_eptp_switching(vmcs12)) {
+		vmcs_write64(VM_FUNCTION_CONTROL,
+			     vmcs12->vm_function_control & 1);
+		vmcs_write64(EPTP_LIST_ADDRESS,
+			     page_to_phys(vmx->nested.shadow_eptp_list));
+	}
+
 	/*
 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
 	 * bits which we consider mandatory enabled.
-- 
2.9.4


* Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
  2017-06-29 23:29 ` [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor Bandan Das
@ 2017-06-30  7:29   ` Paolo Bonzini
  2017-06-30 17:20     ` Bandan Das
  0 siblings, 1 reply; 9+ messages in thread
From: Paolo Bonzini @ 2017-06-30  7:29 UTC (permalink / raw)
  To: Bandan Das; +Cc: kvm, linux-kernel



----- Original Message -----
> From: "Bandan Das" <bsd@redhat.com>
> To: kvm@vger.kernel.org
> Cc: pbonzini@redhat.com, linux-kernel@vger.kernel.org
> Sent: Friday, June 30, 2017 1:29:55 AM
> Subject: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
> 
> This is a mix of emulation and passthrough to implement EPTP
> switching for the nested hypervisor.
> 
> If the shadow EPT structures are absent, a vmexit occurs with exit
> reason 59 (VMFUNC). L0 then builds shadow structures for the entry
> the guest invoked vmfunc with, obtains a new root_hpa that is
> written to the shadow list, and reloads the mmu to resume L2.
> On the next vmfunc(0, index), however, the processor loads the
> entry directly without an exit.

What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
what removes it from the shadow EPT list.

For a first version of the patch, I would prefer a less optimized
version that always goes through L0 when L2 executes VMFUNC.
To achieve this effect, you can copy the "enable VM functions" secondary
execution control from vmcs12 to vmcs02, but leave the VM-function
controls at 0 in vmcs02.
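
Something along these lines in prepare_vmcs02() would be enough (rough
sketch only, reusing the helpers this series introduces; the exec_control
handling is assumed to follow the existing secondary-controls code):

	if (nested_cpu_has_vmfunc(vmcs12)) {
		/* Let L2 see "enable VM functions"... */
		exec_control |= SECONDARY_EXEC_ENABLE_VMFUNC;
		/* ...but enable no functions, so every VMFUNC exits to L0. */
		vmcs_write64(VM_FUNCTION_CONTROL, 0);
	}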

Paolo


* Re: [PATCH 0/2] Expose VMFUNC to the nested hypervisor
  2017-06-29 23:29 [PATCH 0/2] Expose VMFUNC to the nested hypervisor Bandan Das
  2017-06-29 23:29 ` [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor Bandan Das
  2017-06-29 23:29 ` [PATCH 2/2] KVM: nVMX: Advertise VMFUNC to " Bandan Das
@ 2017-06-30 17:06 ` Jim Mattson
  2017-06-30 17:58   ` Bandan Das
  2017-06-30 19:06   ` Paolo Bonzini
  2 siblings, 2 replies; 9+ messages in thread
From: Jim Mattson @ 2017-06-30 17:06 UTC (permalink / raw)
  To: Bandan Das; +Cc: kvm list, Paolo Bonzini, LKML

Isn't McAfee DeepSAFE defunct? Are there any other consumers of EPTP switching?


* Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
  2017-06-30  7:29   ` Paolo Bonzini
@ 2017-06-30 17:20     ` Bandan Das
  2017-07-01  5:34       ` Paolo Bonzini
  0 siblings, 1 reply; 9+ messages in thread
From: Bandan Das @ 2017-06-30 17:20 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: kvm, linux-kernel

Hi Paolo,

Paolo Bonzini <pbonzini@redhat.com> writes:

> ----- Original Message -----
>> From: "Bandan Das" <bsd@redhat.com>
>> To: kvm@vger.kernel.org
>> Cc: pbonzini@redhat.com, linux-kernel@vger.kernel.org
>> Sent: Friday, June 30, 2017 1:29:55 AM
>> Subject: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
>> 
>> This is a mix of emulation and passthrough to implement EPTP
>> switching for the nested hypervisor.
>> 
>> If the shadow EPT structures are absent, a vmexit occurs with exit
>> reason 59 (VMFUNC). L0 then builds shadow structures for the entry
>> the guest invoked vmfunc with, obtains a new root_hpa that is
>> written to the shadow list, and reloads the mmu to resume L2.
>> On the next vmfunc(0, index), however, the processor loads the
>> entry directly without an exit.
>
> What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
> what removes it from the shadow EPT list.

That would result in a vmfunc vmexit, which jumps to handle_vmfunc() and
then to mmu_alloc_shadow_roots(), which overwrites the stale shadow eptp
entry with the new one.

I believe part of your question is also about the case where root_hpa is
valid but no longer tracks the current l1 eptp; the processor could then
jump to some other guest's page tables, which is a problem. In that case,
I think it should be possible to invalidate that entry in the shadow eptp
list.
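
Roughly something like this (hypothetical helper, not in this series; the
hook point where L0 drops the root would still need to be wired up):

	static void nested_invalidate_shadow_eptp(struct vcpu_vmx *vmx,
						  hpa_t root_hpa)
	{
		u64 *list = page_address(vmx->nested.shadow_eptp_list);
		int i;

		/* Clear any entry advertising the dropped root so that the
		 * next VMFUNC(0, index) from L2 exits to L0 and repopulates
		 * it via handle_vmfunc().
		 */
		for (i = 0; i < VMFUNC_EPTP_ENTRIES; i++)
			if (list[i] && (list[i] & PAGE_MASK) == root_hpa)
				list[i] = 0;
	}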

> For a first version of the patch, I would prefer a less optimized
> version that always goes through L0 when L2 executes VMFUNC.
> To achieve this effect, you can copy the "enable VM functions" secondary
> execution control from vmcs12 to vmcs02, but leave the VM-function
> controls at 0 in vmcs02.

Is the current approach prone to other undesired corner cases like the one
you pointed out above? :) I would be uncomfortable having this in if you
feel that letting the cpu jump to a new eptp opens up an exploitable
interface; however, I think it would be nice to have L2 execute vmfunc
without exiting to L0.

Thanks for the quick review!


* Re: [PATCH 0/2] Expose VMFUNC to the nested hypervisor
  2017-06-30 17:06 ` [PATCH 0/2] Expose VMFUNC to the nested hypervisor Jim Mattson
@ 2017-06-30 17:58   ` Bandan Das
  2017-06-30 19:06   ` Paolo Bonzini
  1 sibling, 0 replies; 9+ messages in thread
From: Bandan Das @ 2017-06-30 17:58 UTC (permalink / raw)
  To: Jim Mattson; +Cc: kvm list, Paolo Bonzini, LKML

Jim Mattson <jmattson@google.com> writes:

> Isn't McAfee DeepSAFE defunct? Are there any other consumers of EPTP switching?

I don't know of any real users, but I think we should be providing this
functionality to the L1 hypervisor :)

IIRC, Xen lets you use EPTP switching as part of VM introspection?

Bandan



* Re: [PATCH 0/2] Expose VMFUNC to the nested hypervisor
  2017-06-30 17:06 ` [PATCH 0/2] Expose VMFUNC to the nested hypervisor Jim Mattson
  2017-06-30 17:58   ` Bandan Das
@ 2017-06-30 19:06   ` Paolo Bonzini
  1 sibling, 0 replies; 9+ messages in thread
From: Paolo Bonzini @ 2017-06-30 19:06 UTC (permalink / raw)
  To: Jim Mattson; +Cc: Bandan Das, kvm list, LKML



----- Original Message -----
> From: "Jim Mattson" <jmattson@google.com>
> To: "Bandan Das" <bsd@redhat.com>
> Cc: "kvm list" <kvm@vger.kernel.org>, "Paolo Bonzini" <pbonzini@redhat.com>, "LKML" <linux-kernel@vger.kernel.org>
> Sent: Friday, June 30, 2017 7:06:43 PM
> Subject: Re: [PATCH 0/2] Expose VMFUNC to the nested hypervisor
> 
> Isn't McAfee DeepSAFE defunct? Are there any other consumers of EPTP
> switching?

Xen can use it optionally, and #VE as well.

Paolo


* Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
  2017-06-30 17:20     ` Bandan Das
@ 2017-07-01  5:34       ` Paolo Bonzini
  0 siblings, 0 replies; 9+ messages in thread
From: Paolo Bonzini @ 2017-07-01  5:34 UTC (permalink / raw)
  To: Bandan Das; +Cc: kvm, linux-kernel


> > What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
> > what removes it from the shadow EPT list.
> 
> That would result in a vmfunc vmexit, which jumps to handle_vmfunc() and
> then to mmu_alloc_shadow_roots(), which overwrites the stale shadow eptp
> entry with the new one.
> 
> I believe part of your question is also about the case where root_hpa is
> valid but no longer tracks the current l1 eptp; the processor could then
> jump to some other guest's page tables, which is a problem. In that case,
> I think it should be possible to invalidate that entry in the shadow eptp
> list.

Exactly.  However, I'm not sure it's worth fixing.  There's already
enough complication in the MMU, and there are way more important things
to optimize in nested VMX!

Paolo

