KVM Archive on lore.kernel.org
 help / color / Atom feed
From: Paolo Bonzini <pbonzini@redhat.com>
To: linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Cc: Sean Christopherson <sean.j.christopherson@intel.com>,
	vkuznets@redhat.com
Subject: [PATCH 13/43] KVM: nVMX: Sync rarely accessed guest fields only when needed
Date: Thu, 13 Jun 2019 19:02:59 +0200
Message-ID: <1560445409-17363-14-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1560445409-17363-1-git-send-email-pbonzini@redhat.com>

From: Sean Christopherson <sean.j.christopherson@intel.com>

Many guest fields are rarely read (or written) by VMMs, i.e. likely
aren't accessed between runs of a nested VMCS.  Delay pulling rarely
accessed guest fields from vmcs02 until they are VMREAD or until vmcs12
is dirtied.  The latter case is necessary because nested VM-Entry will
consume all manner of fields when vmcs12 is dirty, e.g. for consistency
checks.

Note, an alternative to synchronizing all guest fields on VMREAD would
be to read *only* the field being accessed, but switching VMCS pointers
is expensive and odds are good if one guest field is being accessed then
others will soon follow, or that vmcs12 will be dirtied due to a VMWRITE
(see above).  And the full synchronization results in slightly cleaner
code.

Note, although GUEST_PDPTRs are relevant only for a 32-bit PAE guest,
they are accessed quite frequently for said guests, and a separate patch
is in flight to optimize away GUEST_PDTPR synchronziation for non-PAE
guests.

Skipping rarely accessed guest fields reduces the latency of a nested
VM-Exit by ~200 cycles.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 140 +++++++++++++++++++++++++++++++++++++++-------
 arch/x86/kvm/vmx/vmx.h    |   7 +++
 2 files changed, 127 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a6fe6cfe96f6..6f7b572417a4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3376,20 +3376,57 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
 }
 
-/*
- * Update the guest state fields of vmcs12 to reflect changes that
- * occurred while L2 was running. (The "IA-32e mode guest" bit of the
- * VM-entry controls is also updated, since this is really a guest
- * state bit.)
- */
-static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static bool is_vmcs12_ext_field(unsigned long field)
 {
-	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
-	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+	switch (field) {
+	case GUEST_ES_SELECTOR:
+	case GUEST_CS_SELECTOR:
+	case GUEST_SS_SELECTOR:
+	case GUEST_DS_SELECTOR:
+	case GUEST_FS_SELECTOR:
+	case GUEST_GS_SELECTOR:
+	case GUEST_LDTR_SELECTOR:
+	case GUEST_TR_SELECTOR:
+	case GUEST_ES_LIMIT:
+	case GUEST_CS_LIMIT:
+	case GUEST_SS_LIMIT:
+	case GUEST_DS_LIMIT:
+	case GUEST_FS_LIMIT:
+	case GUEST_GS_LIMIT:
+	case GUEST_LDTR_LIMIT:
+	case GUEST_TR_LIMIT:
+	case GUEST_GDTR_LIMIT:
+	case GUEST_IDTR_LIMIT:
+	case GUEST_ES_AR_BYTES:
+	case GUEST_DS_AR_BYTES:
+	case GUEST_FS_AR_BYTES:
+	case GUEST_GS_AR_BYTES:
+	case GUEST_LDTR_AR_BYTES:
+	case GUEST_TR_AR_BYTES:
+	case GUEST_ES_BASE:
+	case GUEST_CS_BASE:
+	case GUEST_SS_BASE:
+	case GUEST_DS_BASE:
+	case GUEST_FS_BASE:
+	case GUEST_GS_BASE:
+	case GUEST_LDTR_BASE:
+	case GUEST_TR_BASE:
+	case GUEST_GDTR_BASE:
+	case GUEST_IDTR_BASE:
+	case GUEST_PENDING_DBG_EXCEPTIONS:
+	case GUEST_BNDCFGS:
+		return true;
+	default:
+		break;
+	}
 
-	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
-	vmcs12->guest_rip = kvm_rip_read(vcpu);
-	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	return false;
+}
+
+static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
 	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
@@ -3410,8 +3447,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
 	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
 	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
-	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
-	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
 	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
 	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
 	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
@@ -3427,11 +3462,65 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
 	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	if (kvm_mpx_supported())
+		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
+}
+
+static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+
+	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
+		return;
+
+
+	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = &vmx->nested.vmcs02;
+	vmx_vcpu_load(&vmx->vcpu, cpu);
+
+	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx_vcpu_load(&vmx->vcpu, cpu);
+	put_cpu();
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vmx->nested.hv_evmcs)
+		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
+	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+
+	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
+	vmcs12->guest_rip = kvm_rip_read(vcpu);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
 
 	vmcs12->guest_interruptibility_info =
 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	vmcs12->guest_pending_dbg_exceptions =
-		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
 	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
 		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
 	else
@@ -3481,8 +3570,6 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-	if (kvm_mpx_supported())
-		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 }
 
 /*
@@ -4280,6 +4367,8 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 	if (vmx->nested.current_vmptr == -1ull)
 		return;
 
+	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+
 	if (enable_shadow_vmcs) {
 		/* copy to memory all shadowed fields in case
 		   they were modified */
@@ -4397,6 +4486,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 		return nested_vmx_failValid(vcpu,
 			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
+	if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+
 	/* Read the field, zero-extended to a u64 field_value */
 	field_value = vmcs12_read_any(vmcs12, field, offset);
 
@@ -4495,9 +4587,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return nested_vmx_failValid(vcpu,
 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
 
-	if (!is_guest_mode(vcpu))
+	if (!is_guest_mode(vcpu)) {
 		vmcs12 = get_vmcs12(vcpu);
-	else {
+
+		/*
+		 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+		 * vmcs12, else we may crush a field or consume a stale value.
+		 */
+		if (!is_shadow_field_rw(field))
+			copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+	} else {
 		/*
 		 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
@@ -5317,6 +5416,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 	 */
 	if (is_guest_mode(vcpu)) {
 		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
+		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
 	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
 		if (vmx->nested.hv_evmcs)
 			copy_enlightened_to_vmcs12(vmx);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f4448292df0f..ed65999b07a8 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -109,6 +109,7 @@ struct nested_vmx {
 	 * to guest memory during VM exit.
 	 */
 	struct vmcs12 *cached_shadow_vmcs12;
+
 	/*
 	 * Indicates if the shadow vmcs or enlightened vmcs must be updated
 	 * with the data held by struct vmcs12.
@@ -117,6 +118,12 @@ struct nested_vmx {
 	bool dirty_vmcs12;
 
 	/*
+	 * Indicates lazily loaded guest state has not yet been decached from
+	 * vmcs02.
+	 */
+	bool need_sync_vmcs02_to_vmcs12_rare;
+
+	/*
 	 * vmcs02 has been initialized, i.e. state that is constant for
 	 * vmcs02 has been written to the backing VMCS.  Initialization
 	 * is delayed until L1 actually attempts to run a nested VM.
-- 
1.8.3.1



  parent reply index

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-06-13 17:02 [PATCH 00/43] VMX optimizations Paolo Bonzini
2019-06-13 17:02 ` [PATCH 01/43] KVM: VMX: Fix handling of #MC that occurs during VM-Entry Paolo Bonzini
2019-06-13 17:24   ` Jim Mattson
2019-06-13 17:02 ` [PATCH 02/43] kvm: nVMX: small cleanup in handle_exception Paolo Bonzini
2019-06-13 17:02 ` [PATCH 03/43] KVM: VMX: Read cached VM-Exit reason to detect external interrupt Paolo Bonzini
2019-06-13 17:02 ` [PATCH 04/43] KVM: VMX: Store the host kernel's IDT base in a global variable Paolo Bonzini
2019-06-13 17:02 ` [PATCH 05/43] KVM: x86: Move kvm_{before,after}_interrupt() calls to vendor code Paolo Bonzini
2019-06-13 17:02 ` [PATCH 06/43] KVM: VMX: Handle NMIs, #MCs and async #PFs in common irqs-disabled fn Paolo Bonzini
2019-06-13 17:02 ` [PATCH 07/43] KVM: nVMX: Intercept VMWRITEs to read-only shadow VMCS fields Paolo Bonzini
2019-06-13 17:02 ` [PATCH 08/43] KVM: nVMX: Intercept VMWRITEs to GUEST_{CS,SS}_AR_BYTES Paolo Bonzini
2019-06-13 17:02 ` [PATCH 09/43] KVM: nVMX: Track vmcs12 offsets for shadowed VMCS fields Paolo Bonzini
2019-06-13 17:02 ` [PATCH 10/43] KVM: nVMX: Lift sync_vmcs12() out of prepare_vmcs12() Paolo Bonzini
2019-06-13 17:02 ` [PATCH 11/43] KVM: nVMX: Use descriptive names for VMCS sync functions and flags Paolo Bonzini
2019-06-13 17:02 ` [PATCH 12/43] KVM: nVMX: Add helpers to identify shadowed VMCS fields Paolo Bonzini
2019-06-14 16:10   ` Sean Christopherson
2019-06-13 17:02 ` Paolo Bonzini [this message]
2019-06-13 17:03 ` [PATCH 14/43] KVM: nVMX: Rename prepare_vmcs02_*_full to prepare_vmcs02_*_rare Paolo Bonzini
2019-06-13 17:03 ` [PATCH 15/43] KVM: VMX: Always signal #GP on WRMSR to MSR_IA32_CR_PAT with bad value Paolo Bonzini
2019-06-13 17:03 ` [PATCH 16/43] KVM: nVMX: Always sync GUEST_BNDCFGS when it comes from vmcs01 Paolo Bonzini
     [not found]   ` <20190615221602.93C5721851@mail.kernel.org>
2019-06-15 22:40     ` Liran Alon
2019-06-13 17:03 ` [PATCH 17/43] KVM: nVMX: Write ENCLS-exiting bitmap once per vmcs02 Paolo Bonzini
2019-06-13 17:03 ` [PATCH 18/43] KVM: nVMX: Don't rewrite GUEST_PML_INDEX during nested VM-Entry Paolo Bonzini
2019-06-13 17:03 ` [PATCH 19/43] KVM: VMX: simplify vmx_prepare_switch_to_{guest,host} Paolo Bonzini
2019-06-13 17:03 ` [PATCH 20/43] KVM: nVMX: Don't "put" vCPU or host state when switching VMCS Paolo Bonzini
2019-06-13 17:03 ` [PATCH 21/43] KVM: nVMX: Don't reread VMCS-agnostic " Paolo Bonzini
2019-06-14 16:25   ` Sean Christopherson
2019-06-13 17:03 ` [PATCH 22/43] KVM: nVMX: Don't dump VMCS if virtual APIC page can't be mapped Paolo Bonzini
2019-06-17 19:17   ` Radim Krčmář
2019-06-17 20:07     ` Sean Christopherson
2019-06-18  9:43       ` Paolo Bonzini
2019-06-13 17:03 ` [PATCH 23/43] KVM: nVMX: Don't speculatively write virtual-APIC page address Paolo Bonzini
2019-06-13 17:03 ` [PATCH 24/43] KVM: nVMX: Don't speculatively write APIC-access " Paolo Bonzini
2019-06-13 17:03 ` [PATCH 25/43] KVM: nVMX: Update vmcs12 for MSR_IA32_CR_PAT when it's written Paolo Bonzini
2019-06-13 17:03 ` [PATCH 26/43] KVM: nVMX: Update vmcs12 for SYSENTER MSRs when they're written Paolo Bonzini
2019-06-13 17:03 ` [PATCH 27/43] KVM: nVMX: Update vmcs12 for MSR_IA32_DEBUGCTLMSR when it's written Paolo Bonzini
2019-06-13 17:03 ` [PATCH 28/43] KVM: nVMX: Don't update GUEST_BNDCFGS if it's clean in HV eVMCS Paolo Bonzini
2019-06-13 17:03 ` [PATCH 29/43] KVM: x86: introduce is_pae_paging Paolo Bonzini
2019-06-13 17:03 ` [PATCH 30/43] KVM: nVMX: Copy PDPTRs to/from vmcs12 only when necessary Paolo Bonzini
2019-06-13 17:03 ` [PATCH 31/43] KVM: nVMX: Use adjusted pin controls for vmcs02 Paolo Bonzini
2019-06-13 17:03 ` [PATCH 32/43] KVM: VMX: Add builder macros for shadowing controls Paolo Bonzini
2019-06-13 17:03 ` [PATCH 33/43] KVM: VMX: Shadow VMCS pin controls Paolo Bonzini
2019-06-13 17:03 ` [PATCH 34/43] KVM: VMX: Shadow VMCS primary execution controls Paolo Bonzini
2019-06-13 17:03 ` [PATCH 35/43] KVM: VMX: Shadow VMCS secondary " Paolo Bonzini
2019-06-13 17:03 ` [PATCH 36/43] KVM: nVMX: Shadow VMCS controls on a per-VMCS basis Paolo Bonzini
2019-06-13 17:03 ` [PATCH 37/43] KVM: nVMX: Don't reset VMCS controls shadow on VMCS switch Paolo Bonzini
2019-06-13 17:03 ` [PATCH 38/43] KVM: VMX: Explicitly initialize controls shadow at VMCS allocation Paolo Bonzini
2019-06-13 17:03 ` [PATCH 39/43] KVM: nVMX: Preserve last USE_MSR_BITMAPS when preparing vmcs02 Paolo Bonzini
2019-06-13 17:03 ` [PATCH 40/43] KVM: nVMX: Preset *DT exiting in vmcs02 when emulating UMIP Paolo Bonzini
2019-06-13 17:03 ` [PATCH 41/43] KVM: VMX: Drop hv_timer_armed from 'struct loaded_vmcs' Paolo Bonzini
2019-06-13 17:03 ` [PATCH 42/43] KVM: VMX: Leave preemption timer running when it's disabled Paolo Bonzini
2019-06-14 16:34   ` Sean Christopherson
2019-06-13 17:03 ` [PATCH 43/43] KVM: nVMX: shadow pin based execution controls Paolo Bonzini
2019-06-14 16:34   ` Sean Christopherson

Reply instructions:

You may reply publically to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1560445409-17363-14-git-send-email-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=sean.j.christopherson@intel.com \
    --cc=vkuznets@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

KVM Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/kvm/0 kvm/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 kvm kvm/ https://lore.kernel.org/kvm \
		kvm@vger.kernel.org
	public-inbox-index kvm

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.kvm


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git