Re: [PATCH v2 4/4] KVM: nVMX: Add support for capturing highest observable L2 TSC

From: Liran Alon <liran.alon@oracle.com>
To: Aaron Lewis <aaronlewis@google.com>
Cc: kvm@vger.kernel.org, Paolo Bonzini <pbonzini@redhat.com>,
	Jim Mattson <jmattson@google.com>
Subject: Re: [PATCH v2 4/4] KVM: nVMX: Add support for capturing highest observable L2 TSC
Date: Tue, 5 Nov 2019 23:58:13 +0200	[thread overview]
Message-ID: <E7BEE4C5-F129-43B2-A70C-F7143E8665C9@oracle.com> (raw)
In-Reply-To: <20191105191910.56505-5-aaronlewis@google.com>

> On 5 Nov 2019, at 21:19, Aaron Lewis <aaronlewis@google.com> wrote:
> 
> The L1 hypervisor may include the IA32_TIME_STAMP_COUNTER MSR in the
> vmcs12 MSR VM-exit MSR-store area as a way of determining the highest
> TSC value that might have been observed by L2 prior to VM-exit. The
> current implementation does not capture a very tight bound on this
> value.  To tighten the bound, add the IA32_TIME_STAMP_COUNTER MSR to the
> vmcs02 VM-exit MSR-store area whenever it appears in the vmcs12 VM-exit
> MSR-store area.  When L0 processes the vmcs12 VM-exit MSR-store area
> during the emulation of an L2->L1 VM-exit, special-case the
> IA32_TIME_STAMP_COUNTER MSR, using the value stored in the vmcs02
> VM-exit MSR-store area to derive the value to be stored in the vmcs12
> VM-exit MSR-store area.
> 
> Reviewed-by: Jim Mattson <jmattson@google.com>
> Signed-off-by: Aaron Lewis <aaronlewis@google.com>

The patch looks correct to me and I had only some minor style comments below.
Reviewed-by: Liran Alon <liran.alon@oracle.com>

I think you may also consider to separate this patch into two:
First patch add all framework code without still using it specifically for MSR_IA32_TSC
and a second patch to use the framework for MSR_IA32_TSC case.

Just out of curiosity, may I ask which L1 hypervisor use this technique that you encountered this issue?

-Liran

> ---
> arch/x86/kvm/vmx/nested.c | 92 ++++++++++++++++++++++++++++++++++++---
> arch/x86/kvm/vmx/vmx.h    |  4 ++
> 2 files changed, 90 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 7b058d7b9fcc..cb2a92341eab 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -929,6 +929,37 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
> 	return i + 1;
> }
> 
> +static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
> +					    u32 msr_index,
> +					    u64 *data)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	/*
> +	 * If the L0 hypervisor stored a more accurate value for the TSC that
> +	 * does not include the time taken for emulation of the L2->L1
> +	 * VM-exit in L0, use the more accurate value.
> +	 */
> +	if (msr_index == MSR_IA32_TSC) {
> +		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
> +					       MSR_IA32_TSC);
> +
> +		if (index >= 0) {
> +			u64 val = vmx->msr_autostore.guest.val[index].value;
> +

Nit: I would remove the new-line here.

> +			*data = kvm_read_l1_tsc(vcpu, val);
> +			return true;
> +		}
> +	}
> +
> +	if (kvm_get_msr(vcpu, msr_index, data)) {
> +		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
> +			msr_index);
> +		return false;
> +	}
> +	return true;
> +}
> +
> static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
> 				     struct vmx_msr_entry *e)
> {
> @@ -963,12 +994,9 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
> 		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
> 			return -EINVAL;
> 
> -		if (kvm_get_msr(vcpu, e.index, &data)) {
> -			pr_debug_ratelimited(
> -				"%s cannot read MSR (%u, 0x%x)\n",
> -				__func__, i, e.index);
> +		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
> 			return -EINVAL;
> -		}
> +
> 		if (kvm_vcpu_write_guest(vcpu,
> 					 gpa + i * sizeof(e) +
> 					     offsetof(struct vmx_msr_entry, value),
> @@ -982,6 +1010,51 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
> 	return 0;
> }
> 
> +static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +	u32 count = vmcs12->vm_exit_msr_store_count;
> +	u64 gpa = vmcs12->vm_exit_msr_store_addr;
> +	struct vmx_msr_entry e;
> +	u32 i;
> +
> +	for (i = 0; i < count; i++) {
> +		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
> +			return false;
> +
> +		if (e.index == msr_index)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
> +					   u32 msr_index)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
> +	int i = vmx_find_msr_index(autostore, msr_index);

Nit: I would rename “i" to “msr_autostore_index”.

> +	bool in_autostore_list = i >= 0;
> +	bool in_vmcs12_store_list;
> +	int last;
> +
> +	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

Nit: Why is “i" and “in_autostore_list” initialised at declaration but “in_vmcs12_store_list” initialised here?
I would prefer to just first declare variables and then have logic of method after declarations.

> +
> +	if (in_vmcs12_store_list && !in_autostore_list) {
> +		if (autostore->nr == NR_MSR_ENTRIES) {
> +			pr_warn_ratelimited(
> +				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
> +				msr_index);
> +			return;

I think it’s worth to add a comment here explaining that we don’t fail emulated VMEntry in this case
because on emulated VMExit, we will just get less accurate value by nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
instead of reading value from vmcs02 VMExit MSR-store. But we are still able to emulate VMEntry properly.

> +		}
> +		last = autostore->nr++;
> +		autostore->val[last].index = msr_index;
> +	} else if (!in_vmcs12_store_list && in_autostore_list) {
> +		last = --autostore->nr;
> +		autostore->val[i] = autostore->val[last];
> +	}
> +}
> +
> static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
> {
> 	unsigned long invalid_mask;
> @@ -2027,7 +2100,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
> 	 * addresses are constant (for vmcs02), the counts can change based
> 	 * on L2's behavior, e.g. switching to/from long mode.
> 	 */
> -	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
> +	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
> 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
> 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
> 
> @@ -2294,6 +2367,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
> 		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
> 	}
> 
> +	/*
> +	 * Make sure the msr_autostore list is up to date before we set the
> +	 * count in the vmcs02.
> +	 */
> +	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
> +
> +	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
> 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
> 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
> 
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 34b5fef603d8..0ab1562287af 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -230,6 +230,10 @@ struct vcpu_vmx {
> 		struct vmx_msrs host;
> 	} msr_autoload;
> 
> +	struct msr_autostore {
> +		struct vmx_msrs guest;
> +	} msr_autostore;
> +
> 	struct {
> 		int vm86_active;
> 		ulong save_rflags;
> --
> 2.24.0.rc1.363.gb1bccd3e3d-goog
>