* [PATCH 0/2] KVM: nVMX: nested EPT fixes
From: Paolo Bonzini @ 2017-05-11 11:23 UTC
  To: linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li

These two patches fix a couple of corner cases identified by the new
tests in vmx.flat.  See the individual patches for more information.

Paolo

Paolo Bonzini (2):
  KVM: nVMX: fix EPT permissions as reported in exit qualification
  KVM: nVMX: fix nEPT handling of guest page table accesses

 arch/x86/kvm/paging_tmpl.h | 35 +++++++++++++++++++++--------------
 arch/x86/kvm/vmx.c         | 36 +++++++++++++++++++++++-------------
 2 files changed, 44 insertions(+), 27 deletions(-)

-- 
1.8.3.1


* [PATCH 1/2] KVM: nVMX: fix EPT permissions as reported in exit qualification
From: Paolo Bonzini @ 2017-05-11 11:23 UTC
  To: linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li

This fixes the new ept_access_test_read_only and ept_access_test_read_write
testcases from vmx.flat.

The problem is that gpte_access moves bits around to switch from EPT
bit order (XWR) to ACC_*_MASK bit order (RWX).  This results in an
incorrect exit qualification.  To fix this, make pt_access and
pte_access operate on raw PTE values (only with NX flipped to mean
"can execute") and call gpte_access at the end of the walk.  This
lets us use pte_access to compute the exit qualification with XWR
bit order.
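
For illustration (editor's sketch, not part of the patch): EPT PTEs and the
EPT violation exit qualification keep permissions in XWR order (bit 0 = read),
while KVM's internal ACC_*_MASK flags keep execute in bit 0.  The EPT flavor
of gpte_access reshuffles the bits roughly as below (values mirror vmx.h and
mmu.h), which is why a value that has been through it can no longer be ORed
into the exit qualification directly:

  #define VMX_EPT_READABLE_MASK   0x1ULL  /* bit 0: read    */
  #define VMX_EPT_WRITABLE_MASK   0x2ULL  /* bit 1: write   */
  #define VMX_EPT_EXECUTABLE_MASK 0x4ULL  /* bit 2: execute */

  #define ACC_EXEC_MASK  1        /* bit 0: execute */
  #define ACC_WRITE_MASK (1 << 1) /* bit 1: write   */
  #define ACC_USER_MASK  (1 << 2) /* bit 2: user (read, for EPT) */

  /* Reorder XWR-order EPT PTE bits into ACC_*_MASK order. */
  static unsigned ept_gpte_access(u64 gpte)
  {
  	return ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
  	       ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
  	       ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
  }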

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 56241746abbd..b0454c7e4cff 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -283,11 +283,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+	u64 pt_access, pte_access;
+	unsigned index, accessed_dirty, pte_pkey;
 	unsigned nested_access;
 	gpa_t pte_gpa;
 	bool have_ad;
 	int offset;
+	u64 walk_nx_mask = 0;
 	const int write_fault = access & PFERR_WRITE_MASK;
 	const int user_fault  = access & PFERR_USER_MASK;
 	const int fetch_fault = access & PFERR_FETCH_MASK;
@@ -302,6 +304,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
+	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
@@ -313,8 +316,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	walker->max_level = walker->level;
 	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
 
-	accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
-
 	/*
 	 * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
 	 * by the MOV to CR instruction are treated as reads and do not cause the
@@ -322,14 +323,14 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	 */
 	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
 
-	pt_access = pte_access = ACC_ALL;
+	pte_access = ~0;
 	++walker->level;
 
 	do {
 		gfn_t real_gfn;
 		unsigned long host_addr;
 
-		pt_access &= pte_access;
+		pt_access = pte_access;
 		--walker->level;
 
 		index = PT_INDEX(addr, walker->level);
@@ -371,6 +372,12 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
+		/*
+		 * Inverting the NX bit lets us AND it like the other
+		 * permission bits.
+		 */
+		pte_access = pt_access & (pte ^ walk_nx_mask);
+
 		if (unlikely(!FNAME(is_present_gpte)(pte)))
 			goto error;
 
@@ -379,14 +386,16 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 			goto error;
 		}
 
-		accessed_dirty &= pte;
-		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
 		walker->ptes[walker->level - 1] = pte;
 	} while (!is_last_gpte(mmu, walker->level, pte));
 
 	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
-	errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
+	walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
+	walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
 	if (unlikely(errcode))
 		goto error;
 
@@ -403,7 +412,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	walker->gfn = real_gpa >> PAGE_SHIFT;
 
 	if (!write_fault)
-		FNAME(protect_clean_gpte)(mmu, &pte_access, pte);
+		FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
 	else
 		/*
 		 * On a write fault, fold the dirty bit into accessed_dirty.
@@ -421,10 +430,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 			goto retry_walk;
 	}
 
-	walker->pt_access = pt_access;
-	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-		 __func__, (u64)pte, pte_access, pt_access);
+		 __func__, (u64)pte, walker->pte_access, walker->pt_access);
 	return 1;
 
 error:
@@ -452,7 +459,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	 */
 	if (!(errcode & PFERR_RSVD_MASK)) {
 		vcpu->arch.exit_qualification &= 0x187;
-		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
+		vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
 	}
 #endif
 	walker->fault.address = addr;
-- 
1.8.3.1


* [PATCH 2/2] KVM: nVMX: fix nEPT handling of guest page table accesses
From: Paolo Bonzini @ 2017-05-11 11:23 UTC
  To: linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li

The new ept_access_test_paddr_read_only_ad_disabled testcase
caused an infinite stream of EPT violations because KVM did not
find anything bad in the page tables and kept re-executing the
faulting instruction.

This is because the exit qualification said we were reading from
the page tables, when in fact the cause of the EPT violation was
writing the A/D bits.  This happened even with eptad=0, quite
surprisingly.

Thus, always treat guest page table accesses as read+write operations,
even if the exit qualification says otherwise.  This fixes the
testcase.
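
For reference, a sketch of the exit qualification bits involved (the names
mirror arch/x86/include/asm/vmx.h; shown here for illustration only):

  #define EPT_VIOLATION_ACC_READ       (1UL << 0)  /* read access       */
  #define EPT_VIOLATION_ACC_WRITE      (1UL << 1)  /* write access      */
  #define EPT_VIOLATION_ACC_INSTR      (1UL << 2)  /* instruction fetch */
  /*
   * Bit 8 clear means the violation occurred while the CPU was accessing
   * a guest paging-structure entry, i.e. during a guest page table walk.
   */
  #define EPT_VIOLATION_GVA_TRANSLATED (1UL << 8)

The patch below keys off bit 8 being clear and forces EPT_VIOLATION_ACC_WRITE
on, since any such walk may write the accessed/dirty bits.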

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6f4ad44aa95..c868cbdad29a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6209,17 +6209,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	u32 error_code;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	trace_kvm_page_fault(gpa, exit_qualification);
 
-	if (is_guest_mode(vcpu)
-	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
-		/*
-		 * Fix up exit_qualification according to whether guest
-		 * page table accesses are reads or writes.
-		 */
-		u64 eptp = nested_ept_get_cr3(vcpu);
-		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
-			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
-	}
+	/*
+	 * All guest page table accesses are potential writes to A/D bits,
+	 * but EPT microcode only reports them as such when EPT A/D is
+	 * enabled.  Tracing ept_access_test_paddr_read_only_ad_disabled (from
+	 * kvm-unit-tests) with eptad=0 and eptad=1 shows that the processor
+	 * does not change its behavior when EPTP enables A/D bits; the only
+	 * difference is in the exit qualification.  So fix this up here.
+	 */
+	if (!(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED))
+		exit_qualification |= EPT_VIOLATION_ACC_WRITE;
 
 	/*
 	 * EPT violation happened while executing iret from NMI,
@@ -6231,9 +6233,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
 		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
 
-	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	trace_kvm_page_fault(gpa, exit_qualification);
-
 	/* Is it a read fault? */
 	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
 		     ? PFERR_USER_MASK : 0;
@@ -6250,6 +6249,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 		      ? PFERR_PRESENT_MASK : 0;
 
 	vcpu->arch.gpa_available = true;
+
+	if (is_guest_mode(vcpu)
+	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
+		/*
+		 * Now fix up exit_qualification according to what the
+		 * L1 hypervisor expects to see.
+		 */
+		u64 eptp = nested_ept_get_cr3(vcpu);
+		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
+			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
+	}
 	vcpu->arch.exit_qualification = exit_qualification;
 
 	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
-- 
1.8.3.1


* Re: [PATCH 1/2] KVM: nVMX: fix EPT permissions as reported in exit qualification
From: Xiao Guangrong @ 2017-05-12  3:59 UTC
  To: Paolo Bonzini, linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li



On 05/11/2017 07:23 PM, Paolo Bonzini wrote:
> [...]
> @@ -302,6 +304,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>   	have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
>   
>   #if PTTYPE == 64
> +	walk_nx_mask = 1ULL << PT64_NX_SHIFT;

We can always set walk_nx_mask = 1ULL << PT64_NX_SHIFT, because:
- for EPT, this bit is unused
- for 32-bit paging, bit 63 is always zero, so the final result is zero too
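
A tiny standalone demo of the invert-and-AND trick (editor's sketch in plain
C, not kernel code), including how a 32-bit PTE, which can never set bit 63,
degenerates to "always executable" as it should:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
  	const uint64_t nx = 1ULL << 63;		/* walk_nx_mask */
  	/* A 64-bit walk where only the leaf PTE sets NX. */
  	uint64_t ptes[] = { 0x7, 0x7, 0x7, 0x7 | nx };
  	uint64_t acc = ~0ULL;
  	int i;

  	for (i = 0; i < 4; i++)
  		acc &= ptes[i] ^ nx;	/* AND with NX inverted */
  	/* Flip back: NX set in the result means "not executable". */
  	printf("NX effective: %d\n", !!((acc ^ nx) & nx));	/* prints 1 */

  	/* 32-bit PTE: bit 63 is always zero, so NX never takes effect. */
  	uint32_t pte32 = 0x7;
  	acc = ~0ULL & ((uint64_t)pte32 ^ nx);
  	printf("NX effective: %d\n", !!((acc ^ nx) & nx));	/* prints 0 */
  	return 0;
  }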

>   	if (walker->level == PT32E_ROOT_LEVEL) {
>   		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
>   		trace_kvm_mmu_paging_element(pte, walker->level);
> [...]
> @@ -452,7 +459,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>   	 */
>   	if (!(errcode & PFERR_RSVD_MASK)) {
>   		vcpu->arch.exit_qualification &= 0x187;
> -		vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
                                                                 ^ here, the original code
is buggy because pt_access and pte have different bit orders; fortunately, this
patch fixes that too. :)

Otherwise it looks good to me, thanks for your fix.

Reviewed-by: Xiao Guangrong <xiaoguangrong@tencent.com>


* Re: [PATCH 1/2] KVM: nVMX: fix EPT permissions as reported in exit qualification
From: Xiao Guangrong @ 2017-05-12  5:13 UTC
  To: Paolo Bonzini, linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li



On 05/12/2017 11:59 AM, Xiao Guangrong wrote:
>> @@ -452,7 +459,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
>>        */
>>       if (!(errcode & PFERR_RSVD_MASK)) {
>>           vcpu->arch.exit_qualification &= 0x187;
>> -        vcpu->arch.exit_qualification |= ((pt_access & pte) & 0x7) << 3;
>                                                                  ^ here, the original code
> is buggy because pt_access and pte have different bit orders; fortunately,
> this patch fixes that too. :)
> 

That's exactly what this patch aims at; I do not know what I was thinking
when I wrote that down. :(


* Re: [PATCH 2/2] KVM: nVMX: fix nEPT handling of guest page table accesses
From: Xiao Guangrong @ 2017-05-12  7:38 UTC
  To: Paolo Bonzini, linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li, kevin.tian


CC Kevin, as I am not sure whether Intel is aware of this issue; it
breaks other hypervisors, e.g. Xen, as well.

On 05/11/2017 07:23 PM, Paolo Bonzini wrote:
> [...]
> @@ -6250,6 +6249,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
>   		      ? PFERR_PRESENT_MASK : 0;
>   
>   	vcpu->arch.gpa_available = true;
> +
> +	if (is_guest_mode(vcpu)
> +	    && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
> +		/*
> +		 * Now fix up exit_qualification according to what the
> +		 * L1 hypervisor expects to see.
> +		 */
> +		u64 eptp = nested_ept_get_cr3(vcpu);
> +		if (!(eptp & VMX_EPT_AD_ENABLE_BIT))
> +			exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
> +	}

I am not sure this is really needed; it (PFEC.W = 0 even though A/D bits need
to be set in the page structures) is not what we expect.

Maybe always reporting the right behavior is better? Especially since Intel
may fix its microcode, as this hurts the newest CPUs as well.

Thanks!


* Re: [PATCH 2/2] KVM: nVMX: fix nEPT handling of guest page table accesses
From: Paolo Bonzini @ 2017-05-12  8:39 UTC
  To: Xiao Guangrong, linux-kernel, kvm
  Cc: Peter Feiner, David Matlack, Radim Krčmář,
	Xiao Guangrong, Wanpeng Li, kevin.tian

On 12/05/2017 09:38, Xiao Guangrong wrote:
> CC Kevin as i am not sure if Intel is aware of this issue, it
> breaks other hypervisors, e.g, Xen, as swell.

It's actually more complicated.

When EPT A/D bits are disabled, reads of the page tables behave as
described in the manual; writes have both bit 0 and bit 1 set, while the
manual suggests only bit 1 is set.

Peter and David convinced me that it's a hypervisor bug, and I'm not
surprised that Xen has the same issue.  You have to disable EPT A/D bits
for shadow EPT page tables when the L1 hypervisor is not using them.
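
As a rough sketch of that direction (hypothetical code, not the actual fix;
nested_ept_ad_enabled is a helper made up here for illustration):

  /*
   * Enable A/D bits in the shadow (vmcs02) EPTP only when L1's own EPTP
   * enables them, so that L0's use of A/D bits does not leak into the
   * exit qualifications L1 observes.
   */
  static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
  {
  	return nested_ept_get_cr3(vcpu) & VMX_EPT_AD_ENABLE_BIT;
  }

  static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
  {
  	u64 eptp = VMX_EPT_DEFAULT_MT |
  		   VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;

  	if (enable_ept_ad_bits &&
  	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
  		eptp |= VMX_EPT_AD_ENABLE_BIT;

  	return eptp | (root_hpa & PAGE_MASK);
  }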

Paolo


* Re: [PATCH 0/2] KVM: nVMX: nested EPT fixes
From: Radim Krčmář @ 2017-05-16 14:03 UTC
  To: Paolo Bonzini
  Cc: linux-kernel, kvm, Peter Feiner, David Matlack, Xiao Guangrong,
	Wanpeng Li

2017-05-11 13:23+0200, Paolo Bonzini:
> These two patches fix a couple of corner cases identified by the new
> tests in vmx.flat.  See the individual patches for more information.
> 
> Paolo
> 
> Paolo Bonzini (2):
>   KVM: nVMX: fix EPT permissions as reported in exit qualification
>   KVM: nVMX: fix nEPT handling of guest page table accesses

Applied the first one to kvm/master, thanks.

(The second problem is expected to have a different fix.)

