Re: [PATCH v2 07/10] KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it

From: Sean Christopherson <seanjc@google.com>
To: Ilias Stamatis <ilstam@amazon.com>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	pbonzini@redhat.com, mlevitsk@redhat.com, vkuznets@redhat.com,
	wanpengli@tencent.com, jmattson@google.com, joro@8bytes.org,
	zamsden@gmail.com, mtosatti@redhat.com, dwmw@amazon.co.uk
Subject: Re: [PATCH v2 07/10] KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it
Date: Wed, 19 May 2021 00:05:09 +0000	[thread overview]
Message-ID: <YKRWNaqzo4GVDxHP@google.com> (raw)
In-Reply-To: <20210512150945.4591-8-ilstam@amazon.com>

On Wed, May 12, 2021, Ilias Stamatis wrote:
> The write_l1_tsc_offset() callback has a misleading name. It does not
> set L1's TSC offset, it rather updates the current TSC offset which
> might be different if a nested guest is executing. Additionally, both
> the vmx and svm implementations use the same logic for calculating the
> current TSC before writing it to hardware.

I don't disagree, but the current name as the advantage of clarifying (well,
hinting) that the offset is L1's offset.  That hint is lost in this refactoring.
Maybe rename "u64 offset" to "u64 l1_tsc_offset"?

> This patch renames the function and moves the common logic to the

Use imperative mood instead of "This patch.  From 
Documentation/process/submitting-patches.rst:

  Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour.

> caller. The vmx/svm-specific code now merely sets the given offset to
> the corresponding hardware structure.
> 
> Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
> ---
>  arch/x86/include/asm/kvm-x86-ops.h |  2 +-
>  arch/x86/include/asm/kvm_host.h    |  3 +--
>  arch/x86/kvm/svm/svm.c             | 21 ++++-----------------
>  arch/x86/kvm/vmx/vmx.c             | 23 +++--------------------
>  arch/x86/kvm/x86.c                 | 17 ++++++++++++++++-
>  5 files changed, 25 insertions(+), 41 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> index 2063616fba1c..029c9615378f 100644
> --- a/arch/x86/include/asm/kvm-x86-ops.h
> +++ b/arch/x86/include/asm/kvm-x86-ops.h
> @@ -89,7 +89,7 @@ KVM_X86_OP(load_mmu_pgd)
>  KVM_X86_OP_NULL(has_wbinvd_exit)
>  KVM_X86_OP(get_l2_tsc_offset)
>  KVM_X86_OP(get_l2_tsc_multiplier)
> -KVM_X86_OP(write_l1_tsc_offset)
> +KVM_X86_OP(write_tsc_offset)
>  KVM_X86_OP(get_exit_info)
>  KVM_X86_OP(check_intercept)
>  KVM_X86_OP(handle_exit_irqoff)
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 57a25d8e8b0f..61cf201c001a 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1307,8 +1307,7 @@ struct kvm_x86_ops {
>  
>  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
>  	u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
> -	/* Returns actual tsc_offset set in active VMCS */
> -	u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
> +	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
>  
>  	/*
>  	 * Retrieve somewhat arbitrary exit information.  Intended to be used
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 679b2fc1a3f9..b18f60463073 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1094,26 +1094,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
>  	return kvm_default_tsc_scaling_ratio;
>  }
>  
> -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
>  	struct vcpu_svm *svm = to_svm(vcpu);
> -	u64 g_tsc_offset = 0;
> -
> -	if (is_guest_mode(vcpu)) {
> -		/* Write L1's TSC offset.  */
> -		g_tsc_offset = svm->vmcb->control.tsc_offset -
> -			       svm->vmcb01.ptr->control.tsc_offset;
> -		svm->vmcb01.ptr->control.tsc_offset = offset;
> -	}
> -
> -	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> -				   svm->vmcb->control.tsc_offset - g_tsc_offset,
> -				   offset);
> -
> -	svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
>  
> +	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
> +	svm->vmcb->control.tsc_offset = offset;
>  	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
> -	return svm->vmcb->control.tsc_offset;
>  }
>  
>  /* Evaluate instruction intercepts that depend on guest CPUID features. */
> @@ -4540,7 +4527,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>  
>  	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
>  	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
> -	.write_l1_tsc_offset = svm_write_l1_tsc_offset,
> +	.write_tsc_offset = svm_write_tsc_offset,
>  
>  	.load_mmu_pgd = svm_load_mmu_pgd,
>  
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 575e13bddda8..3c4eb14a1e86 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -1810,26 +1810,9 @@ static u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
>  	return multiplier;
>  }
>  
> -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
> +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
> -	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> -	u64 g_tsc_offset = 0;
> -
> -	/*
> -	 * We're here if L1 chose not to trap WRMSR to TSC. According
> -	 * to the spec, this should set L1's TSC; The offset that L1
> -	 * set for L2 remains unchanged, and still needs to be added
> -	 * to the newly set TSC to get L2's TSC.
> -	 */
> -	if (is_guest_mode(vcpu) &&
> -	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
> -		g_tsc_offset = vmcs12->tsc_offset;
> -
> -	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> -				   vcpu->arch.tsc_offset - g_tsc_offset,
> -				   offset);
> -	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
> -	return offset + g_tsc_offset;
> +	vmcs_write64(TSC_OFFSET, offset);
>  }
>  
>  /*
> @@ -7725,7 +7708,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
>  
>  	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
>  	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
> -	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
> +	.write_tsc_offset = vmx_write_tsc_offset,
>  
>  	.load_mmu_pgd = vmx_load_mmu_pgd,
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1db6cfc2079f..f3ba1be4d5b9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2377,8 +2377,23 @@ EXPORT_SYMBOL_GPL(kvm_set_02_tsc_multiplier);
>  
>  static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
>  {
> +	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
> +				   vcpu->arch.l1_tsc_offset,
> +				   offset);
> +
>  	vcpu->arch.l1_tsc_offset = offset;
> -	vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
> +	vcpu->arch.tsc_offset = offset;
> +
> +	if (is_guest_mode(vcpu)) {

Unnecessary curly braces.

> +		/*
> +		 * We're here if L1 chose not to trap WRMSR to TSC and
> +		 * according to the spec this should set L1's TSC (as opposed
> +		 * to setting L1's offset for L2).
> +		 */

While we're shuffling code, can we improve this comment?  It works for the WRMSR
case, but makes no sense in the context of host TSC adjustments.  It's not at all
clear to me that it's even correct or relevant in those cases.

> +		kvm_set_02_tsc_offset(vcpu);

I really dislike that kvm_set_02_tsc_offset() consumes a bunch of variables
_and_ sets arch.tsc_offset, e.g. it's not at all obvious that moving this call
around will break L2.  Even more bizarre is that arch.tsc_offset is conditionally
consumed.  Oh, and kvm_set_02_tsc_offset() is not idempotent since it can do a
RMW on arch.tsc_offset.

The below static_call() dependency doesn't appear to be easily solved, but we
can at least strongly hint that vcpu->arch.tsc_offset is set.  For everything
else, I think we can clean things up by doing this (with the vendor calls
providing the L2 variables directly):

void kvm_set_l2_tsc_offset(struct kvm_vcpu *vcpu, u64 l2_offset,
			   u64 l2_multiplier)
{
	u64 l1_offset = vcpu->arch.l1_tsc_offset;

	if (l2_multiplier != kvm_default_tsc_scaling_ratio)
		l2_offset += mul_s64_u64_shr((s64)l1_tsc_offset, l2_multiplier,
					     kvm_tsc_scaling_ratio_frac_bits);

	vcpu->arch.tsc_offset = l2_offset;
}
EXPORT_SYMBOL_GPL(kvm_get_l2_tsc_offset);

static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.l1_tsc_offset,
				   offset);

	vcpu->arch.l1_tsc_offset = offset;

	if (is_guest_mode(vcpu))
		kvm_set_l2_tsc_offset(vcpu,
				      static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
				      static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
	else
		vcpu->arch.tsc_offset = offset;

	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
}

An alternative would be to explicitly track L1 and L2, and _never_ track the
current TSC values.  I.e. always compute the correct value on the fly.  I think
the only hot path is the TSC deadline timer, and AFAICT that always runs with
L1's timer.  Speaking of which, at the end of this series, vmx_set_hv_timer()
uses L1's TSC but the current scaling ratio; that can't be right.

> +	}
> +
> +	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
>  }
>  
>  static inline bool kvm_check_tsc_unstable(void)
> -- 
> 2.17.1
>