From: Jim Mattson <jmattson@google.com>
To: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Cc: "kvm list" <kvm@vger.kernel.org>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Radim Krčmář" <rkrcmar@redhat.com>
Subject: Re: [PATCH 2/2 v2] KVM: nVMX: Rollback MSR-load if VM-entry fails due to VM-entry MSR-loading
Date: Tue, 15 Oct 2019 15:28:46 -0700	[thread overview]
Message-ID: <CALMp9eTZkY_E9AC_c-cdmdqksSZHiw4cMEtORvmxx4x=TNCBfw@mail.gmail.com> (raw)
In-Reply-To: <20191015000446.8099-3-krish.sadhukhan@oracle.com>

On Mon, Oct 14, 2019 at 5:40 PM Krish Sadhukhan
<krish.sadhukhan@oracle.com> wrote:
>
> If VM-entry fails due to VM-entry MSR-loading, the MSRs of the nested guest
> need to be rolled back to their previous values so that the guest is not
> left in an inconsistent state.

This change seems overly simplistic, and it also breaks the existing ABI.

> Suggested-by: Jim Mattson <jmattson@google.com>
> Signed-off-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
> Reviewed-by: Karl Heubaum <karl.heubaum@oracle.com>
> ---
>  arch/x86/kvm/vmx/nested.c | 50 +++++++++++++++++++++++++++++++++++----
>  arch/x86/kvm/vmx/nested.h | 26 +++++++++++++++-----
>  arch/x86/kvm/vmx/vmx.h    |  8 +++++++
>  3 files changed, 74 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index cebdcb105ea8..bd8e7af5c1e5 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -191,7 +191,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
>         return kvm_skip_emulated_instruction(vcpu);
>  }
>
> -static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
> +void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
>  {
>         /* TODO: not to reset guest simply here. */
>         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> @@ -894,11 +894,13 @@ static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
>   * as possible, process all valid entries before failing rather than precheck
>   * for a capacity violation.
>   */
> -static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
> +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count,
> +                              bool save)
>  {
>         u32 i;
>         struct vmx_msr_entry e;
>         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
>
>         for (i = 0; i < count; i++) {
>                 if (unlikely(i >= max_msr_list_size))
> @@ -917,6 +919,16 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>                                 __func__, i, e.index, e.reserved);
>                         goto fail;
>                 }
> +               if (save) {
> +                       vmx->nested.vm_entry_msr_load_backup[i].index = e.index;
> +                       if (kvm_get_msr(vcpu, e.index,
> +                           &(vmx->nested.vm_entry_msr_load_backup[i].data))) {
> +                               pr_debug_ratelimited(
> +                                       "%s cannot read MSR (%u, 0x%x)\n",
> +                                       __func__, i, e.index);
> +                               goto fail;

This breaks the ABI by requiring that all MSRs in the MSR-load list be
readable. Some, like IA32_PRED_CMD, are not.
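One way around it might be something like this (untested sketch; "valid" is a
hypothetical flag I'm adding for illustration, not an existing field of
struct msr_data): skip the backup for MSRs that can't be read and have the
rollback path skip those slots, rather than failing the whole load:

		if (save) {
			vmx->nested.vm_entry_msr_load_backup[i].index = e.index;
			/*
			 * Write-only MSRs (e.g. IA32_PRED_CMD) can't be read
			 * back; mark the slot invalid so rollback skips it
			 * instead of failing the load here.
			 */
			vmx->nested.vm_entry_msr_load_backup[i].valid =
				!kvm_get_msr(vcpu, e.index,
					&vmx->nested.vm_entry_msr_load_backup[i].data);
		}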

> +                       }
> +               }
>                 if (kvm_set_msr(vcpu, e.index, e.value)) {
>                         pr_debug_ratelimited(
>                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
> @@ -926,6 +938,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>         }
>         return 0;
>  fail:
> +       kfree(vmx->nested.vm_entry_msr_load_backup);
>         return i + 1;
>  }
>
> @@ -973,6 +986,26 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>         return 0;
>  }
>
> +int nested_vmx_rollback_msr(struct kvm_vcpu *vcpu, u32 count)
> +{
> +       u32 i;
> +       struct msr_data msr;
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +       for (i = 0; i < count; i++) {

I wonder if this loop should go in the other direction, in case there
are dependencies among the MSR settings.
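i.e. something like (untested):

	/*
	 * Undo the loads in reverse order, in case later entries depend on
	 * earlier ones (e.g. an MSR that gates the behavior of another).
	 */
	for (i = count; i-- > 0; ) {
		u32 index = vmx->nested.vm_entry_msr_load_backup[i].index;
		u64 data = vmx->nested.vm_entry_msr_load_backup[i].data;

		if (kvm_set_msr(vcpu, index, data)) {
			pr_debug_ratelimited(
				"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
				__func__, i, index, data);
			return -EINVAL;
		}
	}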

> +               msr.host_initiated = false;
> +               msr.index = vmx->nested.vm_entry_msr_load_backup[i].index;
> +               msr.data = vmx->nested.vm_entry_msr_load_backup[i].data;
> +               if (kvm_set_msr(vcpu, msr.index, msr.data)) {
> +                       pr_debug_ratelimited(
> +                                       "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
> +                                       __func__, i, msr.index, msr.data);
> +                       return -EINVAL;

This doesn't work with time-related MSRs, like
IA32_TIME_STAMP_COUNTER. Rather than "rolling back" to an earlier
value, you need to be able to negate the effect of the load that
should never have happened. Similarly, I don't think this works with
IA32_TSC_DEADLINE, if the original deadline has passed before you
saved it. I believe that writing a deadline in the past will result in
a spurious interrupt.
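For IA32_TIME_STAMP_COUNTER that probably means recording the adjustment the
load applied rather than a snapshot of the counter, roughly (illustrative
only, not tied to any existing helper):

	u64 old_tsc, cur_tsc, delta;

	/*
	 * At save time: record how far the to-be-loaded value would move the
	 * guest's TSC, not the guest's TSC itself.
	 */
	kvm_get_msr(vcpu, MSR_IA32_TSC, &old_tsc);
	delta = e.value - old_tsc;

	/*
	 * At rollback time: undo that movement from wherever the counter has
	 * advanced to, rather than restoring a stale snapshot.
	 */
	kvm_get_msr(vcpu, MSR_IA32_TSC, &cur_tsc);
	kvm_set_msr(vcpu, MSR_IA32_TSC, cur_tsc - delta);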

> +               }
> +       }
> +       return 0;
> +}
> +
>  static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
>  {
>         unsigned long invalid_mask;
> @@ -3102,9 +3135,18 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
>                 goto vmentry_fail_vmexit_guest_mode;
>
>         if (from_vmentry) {
> +               u32 count = vmcs12->vm_entry_msr_load_count;
> +
> +               /* Save guest MSRs before we load them */
> +               vmx->nested.vm_entry_msr_load_backup =
> +                   kcalloc(count, sizeof(struct msr_data), GFP_KERNEL_ACCOUNT);
> +               if (!vmx->nested.vm_entry_msr_load_backup)
> +                       goto vmentry_fail_vmexit_guest_mode;
> +

Should the backup memory be allocated in advance, so that we don't
have this unarchitected VM-entry failure? If not, should this be
deferred until after the attempted VM-entry to vmcs02, to avoid
introducing yet another priority inversion?
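For the first option, one possibility (sketch only; error unwinding and
teardown elided) would be to size the buffer for the maximum atomic-switch
list once, in enter_vmx_operation(), instead of on every VM-entry:

	/* In enter_vmx_operation(), alongside the other nested allocations: */
	vmx->nested.vm_entry_msr_load_backup =
		kcalloc(nested_vmx_max_atomic_switch_msrs(vcpu),
			sizeof(struct msr_data), GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.vm_entry_msr_load_backup)
		return -ENOMEM;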

>                 exit_qual = nested_vmx_load_msr(vcpu,
>                                                 vmcs12->vm_entry_msr_load_addr,
> -                                               vmcs12->vm_entry_msr_load_count);
> +                                               vmcs12->vm_entry_msr_load_count,
> +                                               true);
>                 if (exit_qual) {
>                         /*
>                          * According to section “VM Entries” in Intel SDM
> @@ -3940,7 +3982,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>                 vmx_update_msr_bitmap(vcpu);
>
>         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
> -                               vmcs12->vm_exit_msr_load_count))
> +                               vmcs12->vm_exit_msr_load_count, false))
>                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
>  }
>
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index bb51ec8cf7da..f951b2b338d2 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -17,6 +17,8 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
>  bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
>  void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>                        u32 exit_intr_info, unsigned long exit_qualification);
> +void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator);
> +int nested_vmx_rollback_msr(struct kvm_vcpu *vcpu, u32 count);
>  void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
>  int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
>  int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
> @@ -66,7 +68,6 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
>  {
>         u32 exit_qual;
>         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> -       struct vmx_msr_entry *addr;
>
>         /*
>          * At this point, the exit interruption info in exit_intr_info
> @@ -85,11 +86,24 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
>
>         exit_qual = vmcs_readl(EXIT_QUALIFICATION);
>
> -       addr = __va(vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR));
> -       if (addr && addr->index == MSR_FS_BASE &&
> -           (exit_reason == (VMX_EXIT_REASONS_FAILED_VMENTRY |
> -                           EXIT_REASON_MSR_LOAD_FAIL))) {
> -               exit_qual = (to_vmx(vcpu))->nested.invalid_msr_load_exit_qual;
> +       if (exit_reason == (VMX_EXIT_REASONS_FAILED_VMENTRY |
> +                           EXIT_REASON_MSR_LOAD_FAIL)) {
> +
> +               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +               struct vmx_msr_entry *addr;
> +
> +               if (nested_vmx_rollback_msr(vcpu,
> +                                           vmcs12->vm_entry_msr_load_count)) {
> +                       nested_vmx_abort(vcpu,
> +                                       VMX_ABORT_SAVE_GUEST_MSR_FAIL);
> +
> +                       kfree(to_vmx(vcpu)->nested.vm_entry_msr_load_backup);
> +               }

Are we leaking the backup memory when the rollback succeeds?
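If so, freeing unconditionally once the rollback has been attempted would
avoid it, e.g.:

		if (nested_vmx_rollback_msr(vcpu,
					    vmcs12->vm_entry_msr_load_count))
			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);

		/*
		 * The backup is dead either way; free it unconditionally and
		 * clear the pointer so a later failure path can't free it
		 * twice.
		 */
		kfree(to_vmx(vcpu)->nested.vm_entry_msr_load_backup);
		to_vmx(vcpu)->nested.vm_entry_msr_load_backup = NULL;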

> +
> +               addr = __va(vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR));
> +               if (addr && addr->index == MSR_FS_BASE)
> +                       exit_qual =
> +                           (to_vmx(vcpu))->nested.invalid_msr_load_exit_qual;
>         }
>
>         nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index ee7f40abd199..9a7c118036be 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -189,6 +189,14 @@ struct nested_vmx {
>          * due to invalid VM-entry MSR-load area in vmcs12.
>          */
>         u32 invalid_msr_load_exit_qual;
> +
> +       /*
> +        * This is used for backing up the MSRs of nested guests when
> +        * those MSRs are loaded from VM-entry MSR-load area on VM-entry.
> +        * If VM-entry fails due to VM-entry MSR-loading, we roll back
> +        * the MSRs to the values saved here.
> +        */
> +       struct msr_data *vm_entry_msr_load_backup;
>  };
>
>  struct vcpu_vmx {
> --
> 2.20.1
>
