[v2] KVM: SVM: Delay restoration of host MSR_TSC_AUX until return to userspace
diff mbox series

Message ID 20210422001736.3255735-1-seanjc@google.com
State New, archived
Headers show
Series
  • [v2] KVM: SVM: Delay restoration of host MSR_TSC_AUX until return to userspace
Related show

Commit Message

Sean Christopherson April 22, 2021, 12:17 a.m. UTC
Use KVM's "user return MSRs" framework to defer restoring the host's
MSR_TSC_AUX until the CPU returns to userspace.  Add/improve comments to
clarify why MSR_TSC_AUX is intercepted on both RDMSR and WRMSR, and why
it's safe for KVM to keep the guest's value loaded even if KVM is
scheduled out.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---

v2: Rebased to kvm/queue (ish), commit 0e91d1992235 ("KVM: SVM: Allocate SEV
    command structures on local stack")
 
 arch/x86/kvm/svm/svm.c | 50 ++++++++++++++++++------------------------
 arch/x86/kvm/svm/svm.h |  7 ------
 2 files changed, 21 insertions(+), 36 deletions(-)

Comments

Paolo Bonzini April 22, 2021, 6:45 a.m. UTC | #1
On 22/04/21 02:17, Sean Christopherson wrote:
> Use KVM's "user return MSRs" framework to defer restoring the host's
> MSR_TSC_AUX until the CPU returns to userspace.  Add/improve comments to
> clarify why MSR_TSC_AUX is intercepted on both RDMSR and WRMSR, and why
> it's safe for KVM to keep the guest's value loaded even if KVM is
> scheduled out.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> 
> v2: Rebased to kvm/queue (ish), commit 0e91d1992235 ("KVM: SVM: Allocate SEV
>      command structures on local stack")
>   
>   arch/x86/kvm/svm/svm.c | 50 ++++++++++++++++++------------------------
>   arch/x86/kvm/svm/svm.h |  7 ------
>   2 files changed, 21 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index cd8c333ed2dc..596361449f25 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -213,6 +213,15 @@ struct kvm_ldttss_desc {
>   
>   DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
>   
> +/*
> + * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
> + * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
> + *
> + * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
> + * defer the restoration of TSC_AUX until the CPU returns to userspace.
> + */
> +#define TSC_AUX_URET_SLOT	0
> +
>   static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
>   
>   #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
> @@ -958,6 +967,9 @@ static __init int svm_hardware_setup(void)
>   		kvm_tsc_scaling_ratio_frac_bits = 32;
>   	}
>   
> +	if (boot_cpu_has(X86_FEATURE_RDTSCP))
> +		kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
> +
>   	/* Check for pause filtering support */
>   	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
>   		pause_filter_count = 0;
> @@ -1423,19 +1435,10 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_svm *svm = to_svm(vcpu);
>   	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
> -	unsigned int i;
>   
>   	if (svm->guest_state_loaded)
>   		return;
>   
> -	/*
> -	 * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
> -	 * area (non-sev-es). Save ones that aren't so we can restore them
> -	 * individually later.
> -	 */
> -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> -		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> -
>   	/*
>   	 * Save additional host state that will be restored on VMEXIT (sev-es)
>   	 * or subsequent vmload of host save area.
> @@ -1454,29 +1457,15 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
>   		}
>   	}
>   
> -	/* This assumes that the kernel never uses MSR_TSC_AUX */
>   	if (static_cpu_has(X86_FEATURE_RDTSCP))
> -		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
> +		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
>   
>   	svm->guest_state_loaded = true;
>   }
>   
>   static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
>   {
> -	struct vcpu_svm *svm = to_svm(vcpu);
> -	unsigned int i;
> -
> -	if (!svm->guest_state_loaded)
> -		return;
> -
> -	/*
> -	 * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
> -	 * area (non-sev-es). Restore the ones that weren't.
> -	 */
> -	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
> -		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
> -
> -	svm->guest_state_loaded = false;
> +	to_svm(vcpu)->guest_state_loaded = false;
>   }
>   
>   static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -2893,12 +2882,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
>   			return 1;
>   
>   		/*
> -		 * This is rare, so we update the MSR here instead of using
> -		 * direct_access_msrs.  Doing that would require a rdmsr in
> -		 * svm_vcpu_put.
> +		 * TSC_AUX is usually changed only during boot and never read
> +		 * directly.  Intercept TSC_AUX instead of exposing it to the
> +		 * guest via direct_acess_msrs, and switch it via user return.
>   		 */
>   		svm->tsc_aux = data;
> -		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
> +
> +		preempt_disable();
> +		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
> +		preempt_enable();
>   		break;
>   	case MSR_IA32_DEBUGCTLMSR:
>   		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 454da1c1d9b7..9dce6f290041 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -23,11 +23,6 @@
>   
>   #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
>   
> -static const u32 host_save_user_msrs[] = {
> -	MSR_TSC_AUX,
> -};
> -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
> -
>   #define	IOPM_SIZE PAGE_SIZE * 3
>   #define	MSRPM_SIZE PAGE_SIZE * 2
>   
> @@ -128,8 +123,6 @@ struct vcpu_svm {
>   
>   	u64 next_rip;
>   
> -	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
> -
>   	u64 spec_ctrl;
>   	/*
>   	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
> 

Queued, thanks.

Paolo
Reiji Watanabe April 22, 2021, 7:02 p.m. UTC | #2
@@ -2893,12 +2882,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu,
struct msr_data *msr)
                        return 1;

                /*
-                * This is rare, so we update the MSR here instead of using
-                * direct_access_msrs.  Doing that would require a rdmsr in
-                * svm_vcpu_put.
+                * TSC_AUX is usually changed only during boot and never read
+                * directly.  Intercept TSC_AUX instead of exposing it to the
+                * guest via direct_acess_msrs, and switch it via user return.
                 */

'direct_acess_msrs' should be 'direct_access_msrs'.


                svm->tsc_aux = data;
-               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+               preempt_disable();
+               kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+               preempt_enable();
                break;

One of the callers of svm_set_msr() is kvm_arch_vcpu_ioctl(KVM_SET_MSRS).
Since calling kvm_set_user_return_msr() looks unnecessary for the ioctl
case and makes extra things for the CPU to do when the CPU returns to
userspace for the case, I'm wondering if it might be better to check
svm->guest_state_loaded before calling kvm_set_user_return_msr() here.

The patch looks good to me other than those two minor things.

Thanks,
Reiji
Sean Christopherson April 22, 2021, 8:12 p.m. UTC | #3
+Tom

On Thu, Apr 22, 2021, Reiji Watanabe wrote:
> @@ -2893,12 +2882,15 @@ static int svm_set_msr(struct kvm_vcpu *vcpu,
> struct msr_data *msr)
>                         return 1;
> 
>                 /*
> -                * This is rare, so we update the MSR here instead of using
> -                * direct_access_msrs.  Doing that would require a rdmsr in
> -                * svm_vcpu_put.
> +                * TSC_AUX is usually changed only during boot and never read
> +                * directly.  Intercept TSC_AUX instead of exposing it to the
> +                * guest via direct_acess_msrs, and switch it via user return.
>                  */
> 
> 'direct_acess_msrs' should be 'direct_access_msrs'.
> 
> 
>                 svm->tsc_aux = data;
> -               wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
> +
> +               preempt_disable();
> +               kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
> +               preempt_enable();
>                 break;
> 
> One of the callers of svm_set_msr() is kvm_arch_vcpu_ioctl(KVM_SET_MSRS).
> Since calling kvm_set_user_return_msr() looks unnecessary for the ioctl
> case and makes extra things for the CPU to do when the CPU returns to
> userspace for the case, I'm wondering if it might be better to check
> svm->guest_state_loaded before calling kvm_set_user_return_msr() here.

Ugh.  AMD support for MSR_TSC_AUX is a train wreck.

Looking at VMX again, the reason it calls kvm_set_user_return_msr()
unconditionally is so that it can check the result of the WRMSR and inject #GP
into the guest if the WRMSR failed.

At first blush, that would appear to be unnecessary since host support for
MSR_TSC_AUX was already confirmed.  But on Intel, bits 63:32 of MSR_TSC_AUX are
actually "reserved", so in theory this code should not check guest_state_loaded,
but instead check the result and inject #GP as appropriate.

However, I put "reserved" in quotes because AMD CPUs apparently don't actually
do a reserved check.  Sadly, the APM doesn't say _anything_ about those bits in
the context of MSR access, the RDTSCP entry simply states that RCX contains bits
31:0 of the MSR, zero extended.  But even worse, the RDPID description implies
that it can consume all 64 bits of the MSR:

  RDPID reads the value of TSC_AUX MSR used by the RDTSCP instruction into the
  specified destination register. Normal operand size prefixes do not apply and
  the update is either 32 bit or 64 bit based on the current mode.

Intel has identical behavior for RDTSCP and RDPID, but because Intel CPUs
actually prevent software from writing bits 63:32, they haven't shot themselves
in the proverbial foot.

All that said, the AMD behavior of dropping the bits across the board actually
plays into KVM's favor because we can leverage it to support cross-vendor
migration.  I.e. explicitly drop svm->tsc_aux[63:32] on read (or clear on write),
that way userspace doesn't have to fudge the value itself when migrating an
adventurous guest from AMD to Intel.

Lastly, this flow is also missing a check on guest_cpuid_has().

All in all, I think we want this:

	case MSR_TSC_AUX:
		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
			return 1;

		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;

		/*
		 * TSC_AUX is usually changed only during boot and never read
		 * directly.  Intercept TSC_AUX instead of exposing it to the
		 * guest via direct_access_msrs, and switch it via user return.
		 */
		preempt_disable();
		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
		preempt_enable();
		if (r)
			return 1;

		/*
		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
		 * on the architectural behavior; emulate current hardware as
		 * doing so ensures migrating from AMD to Intel won't explode.
		 */
		svm->tsc_aux = (u32)data;
		break;
Reiji Watanabe April 22, 2021, 10:39 p.m. UTC | #4
> All in all, I think we want this:
>
>         case MSR_TSC_AUX:
>                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
>                         return 1;
>
>                 if (!msr_info->host_initiated &&
>                     !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
>                         return 1;
>
>                 /*
>                  * TSC_AUX is usually changed only during boot and never read
>                  * directly.  Intercept TSC_AUX instead of exposing it to the
>                  * guest via direct_access_msrs, and switch it via user return.
>                  */
>                 preempt_disable();
>                 r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
>                 preempt_enable();
>                 if (r)
>                         return 1;
>
>                 /*
>                  * Bits 63:32 are dropped by AMD CPUs, but are reserved on
>                  * Intel CPUs.  AMD's APM has incomplete and conflicting info
>                  * on the architectural behavior; emulate current hardware as
>                  * doing so ensures migrating from AMD to Intel won't explode.
>                  */
>                 svm->tsc_aux = (u32)data;
>                 break;


Thank you for the explanation.
I understand and the code above looks good to me.
(I would assume we want to check the msr_info->host_initiated and
guest_cpuid_has in svm_get_msr as well)

Thanks,
Reiji
Paolo Bonzini April 23, 2021, 6:13 a.m. UTC | #5
On 22/04/21 22:12, Sean Christopherson wrote:
> 	case MSR_TSC_AUX:
> 		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> 			return 1;
> 
> 		if (!msr_info->host_initiated &&
> 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> 			return 1;
> 
> 		/*
> 		 * TSC_AUX is usually changed only during boot and never read
> 		 * directly.  Intercept TSC_AUX instead of exposing it to the
> 		 * guest via direct_access_msrs, and switch it via user return.
> 		 */
> 		preempt_disable();
> 		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
> 		preempt_enable();
> 		if (r)
> 			return 1;
> 
> 		/*
> 		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
> 		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
> 		 * on the architectural behavior; emulate current hardware as
> 		 * doing so ensures migrating from AMD to Intel won't explode.
> 		 */
> 		svm->tsc_aux = (u32)data;
> 		break;
> 

Ok, squashed in the following:

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 14ff7f0963e9..00e9680969a2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2875,16 +2875,28 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
  		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
  			return 1;
  
+		if (!msr_info->host_initiated &&
+		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+			return 1;
+
  		/*
  		 * TSC_AUX is usually changed only during boot and never read
  		 * directly.  Intercept TSC_AUX instead of exposing it to the
  		 * guest via direct_access_msrs, and switch it via user return.
  		 */
-		svm->tsc_aux = data;
-
  		preempt_disable();
-		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
  		preempt_enable();
+		if (r)
+			return 1;
+
+		/*
+		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
+		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
+		 * on the architectural behavior; emulate current hardware as
+		 * doing so ensures migrating from AMD to Intel won't explode.
+		 */
+		svm->tsc_aux = (u32)data;
  		break;
  	case MSR_IA32_DEBUGCTLMSR:
  		if (!boot_cpu_has(X86_FEATURE_LBRV)) {

Paolo
Sean Christopherson April 23, 2021, 2:12 p.m. UTC | #6
On Fri, Apr 23, 2021, Paolo Bonzini wrote:
> On 22/04/21 22:12, Sean Christopherson wrote:
> > 	case MSR_TSC_AUX:
> > 		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
> > 			return 1;
> > 
> > 		if (!msr_info->host_initiated &&
> > 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> > 			return 1;
> > 
> > 		/*
> > 		 * TSC_AUX is usually changed only during boot and never read
> > 		 * directly.  Intercept TSC_AUX instead of exposing it to the
> > 		 * guest via direct_access_msrs, and switch it via user return.
> > 		 */
> > 		preempt_disable();
> > 		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
> > 		preempt_enable();
> > 		if (r)
> > 			return 1;
> > 
> > 		/*
> > 		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
> > 		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
> > 		 * on the architectural behavior; emulate current hardware as
> > 		 * doing so ensures migrating from AMD to Intel won't explode.
> > 		 */
> > 		svm->tsc_aux = (u32)data;
> > 		break;
> > 
> 
> Ok, squashed in the following:

Too fast!  The below won't compile (s/msr_info/msr and 'r' needs to be defined),
and the get_msr() path needs the guest_cpuid_has() check.  I'll spin a v3. 

> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 14ff7f0963e9..00e9680969a2 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -2875,16 +2875,28 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
>  			return 1;
> +		if (!msr_info->host_initiated &&
> +		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> +			return 1;
> +
>  		/*
>  		 * TSC_AUX is usually changed only during boot and never read
>  		 * directly.  Intercept TSC_AUX instead of exposing it to the
>  		 * guest via direct_access_msrs, and switch it via user return.
>  		 */
> -		svm->tsc_aux = data;
> -
>  		preempt_disable();
> -		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
> +		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
>  		preempt_enable();
> +		if (r)
> +			return 1;
> +
> +		/*
> +		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
> +		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
> +		 * on the architectural behavior; emulate current hardware as
> +		 * doing so ensures migrating from AMD to Intel won't explode.
> +		 */
> +		svm->tsc_aux = (u32)data;
>  		break;
>  	case MSR_IA32_DEBUGCTLMSR:
>  		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
> 
> Paolo
>
Paolo Bonzini April 23, 2021, 2:18 p.m. UTC | #7
On 23/04/21 16:12, Sean Christopherson wrote:
> On Fri, Apr 23, 2021, Paolo Bonzini wrote:
>> On 22/04/21 22:12, Sean Christopherson wrote:
>>> 	case MSR_TSC_AUX:
>>> 		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
>>> 			return 1;
>>>
>>> 		if (!msr_info->host_initiated &&
>>> 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
>>> 			return 1;
>>>
>>> 		/*
>>> 		 * TSC_AUX is usually changed only during boot and never read
>>> 		 * directly.  Intercept TSC_AUX instead of exposing it to the
>>> 		 * guest via direct_access_msrs, and switch it via user return.
>>> 		 */
>>> 		preempt_disable();
>>> 		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
>>> 		preempt_enable();
>>> 		if (r)
>>> 			return 1;
>>>
>>> 		/*
>>> 		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
>>> 		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
>>> 		 * on the architectural behavior; emulate current hardware as
>>> 		 * doing so ensures migrating from AMD to Intel won't explode.
>>> 		 */
>>> 		svm->tsc_aux = (u32)data;
>>> 		break;
>>>
>>
>> Ok, squashed in the following:
> 
> Too fast!  The below won't compile (s/msr_info/msr and 'r' needs to be defined),
> and the get_msr() path needs the guest_cpuid_has() check.

Oops I missed the get_msr().  (I modify my local tree very aggressively, 
often without even compiling, so that I can use "git range-diff 
kvm/next..kvm/queue kvm/next.." as a reminder of things that are pending).

Paolo

   I'll spin a v3.
> 
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 14ff7f0963e9..00e9680969a2 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -2875,16 +2875,28 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
>>   		if (!boot_cpu_has(X86_FEATURE_RDTSCP))
>>   			return 1;
>> +		if (!msr_info->host_initiated &&
>> +		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
>> +			return 1;
>> +
>>   		/*
>>   		 * TSC_AUX is usually changed only during boot and never read
>>   		 * directly.  Intercept TSC_AUX instead of exposing it to the
>>   		 * guest via direct_access_msrs, and switch it via user return.
>>   		 */
>> -		svm->tsc_aux = data;
>> -
>>   		preempt_disable();
>> -		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
>> +		r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
>>   		preempt_enable();
>> +		if (r)
>> +			return 1;
>> +
>> +		/*
>> +		 * Bits 63:32 are dropped by AMD CPUs, but are reserved on
>> +		 * Intel CPUs.  AMD's APM has incomplete and conflicting info
>> +		 * on the architectural behavior; emulate current hardware as
>> +		 * doing so ensures migrating from AMD to Intel won't explode.
>> +		 */
>> +		svm->tsc_aux = (u32)data;
>>   		break;
>>   	case MSR_IA32_DEBUGCTLMSR:
>>   		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
>>
>> Paolo
>>
>

Patch
diff mbox series

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cd8c333ed2dc..596361449f25 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -213,6 +213,15 @@  struct kvm_ldttss_desc {
 
 DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
 
+/*
+ * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
+ * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
+ *
+ * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
+ * defer the restoration of TSC_AUX until the CPU returns to userspace.
+ */
+#define TSC_AUX_URET_SLOT	0
+
 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 
 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -958,6 +967,9 @@  static __init int svm_hardware_setup(void)
 		kvm_tsc_scaling_ratio_frac_bits = 32;
 	}
 
+	if (boot_cpu_has(X86_FEATURE_RDTSCP))
+		kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX);
+
 	/* Check for pause filtering support */
 	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
 		pause_filter_count = 0;
@@ -1423,19 +1435,10 @@  static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
-	unsigned int i;
 
 	if (svm->guest_state_loaded)
 		return;
 
-	/*
-	 * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
-	 * area (non-sev-es). Save ones that aren't so we can restore them
-	 * individually later.
-	 */
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-		rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
 	/*
 	 * Save additional host state that will be restored on VMEXIT (sev-es)
 	 * or subsequent vmload of host save area.
@@ -1454,29 +1457,15 @@  static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 		}
 	}
 
-	/* This assumes that the kernel never uses MSR_TSC_AUX */
 	if (static_cpu_has(X86_FEATURE_RDTSCP))
-		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull);
 
 	svm->guest_state_loaded = true;
 }
 
 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-	unsigned int i;
-
-	if (!svm->guest_state_loaded)
-		return;
-
-	/*
-	 * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
-	 * area (non-sev-es). Restore the ones that weren't.
-	 */
-	for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
-		wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
-
-	svm->guest_state_loaded = false;
+	to_svm(vcpu)->guest_state_loaded = false;
 }
 
 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2893,12 +2882,15 @@  static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 			return 1;
 
 		/*
-		 * This is rare, so we update the MSR here instead of using
-		 * direct_access_msrs.  Doing that would require a rdmsr in
-		 * svm_vcpu_put.
+		 * TSC_AUX is usually changed only during boot and never read
+		 * directly.  Intercept TSC_AUX instead of exposing it to the
+		 * guest via direct_acess_msrs, and switch it via user return.
 		 */
 		svm->tsc_aux = data;
-		wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+		preempt_disable();
+		kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull);
+		preempt_enable();
 		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!boot_cpu_has(X86_FEATURE_LBRV)) {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 454da1c1d9b7..9dce6f290041 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -23,11 +23,6 @@ 
 
 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
 
-static const u32 host_save_user_msrs[] = {
-	MSR_TSC_AUX,
-};
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
 #define	IOPM_SIZE PAGE_SIZE * 3
 #define	MSRPM_SIZE PAGE_SIZE * 2
 
@@ -128,8 +123,6 @@  struct vcpu_svm {
 
 	u64 next_rip;
 
-	u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-
 	u64 spec_ctrl;
 	/*
 	 * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be