* [PATCH 0/2] Fix VMX preemption timer migration
@ 2020-05-18 20:15 Makarand Sonare
2020-05-18 20:15 ` [PATCH 1/2] KVM: nVMX: " Makarand Sonare
2020-05-18 20:16 ` [PATCH 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare
0 siblings, 2 replies; 8+ messages in thread
From: Makarand Sonare @ 2020-05-18 20:15 UTC (permalink / raw)
To: kvm, pshier, jmattson; +Cc: Makarand Sonare
Fix VMX preemption timer migration. Add a selftest to ensure that, post-migration,
both the L1 and L2 VMs observe the VMX preemption timer exit close to the original
expiration deadline.
Makarand Sonare (1):
KVM: selftests: VMX preemption timer migration test
Peter Shier (1):
KVM: nVMX: Fix VMX preemption timer migration
Documentation/virt/kvm/api.rst | 4 +
arch/x86/include/uapi/asm/kvm.h | 2 +
arch/x86/kvm/vmx/nested.c | 61 ++++-
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/x86.c | 3 +-
include/uapi/linux/kvm.h | 1 +
tools/arch/x86/include/uapi/asm/kvm.h | 3 +
tools/testing/selftests/kvm/.gitignore | 1 +
tools/testing/selftests/kvm/Makefile | 1 +
.../testing/selftests/kvm/include/kvm_util.h | 2 +
.../selftests/kvm/include/x86_64/processor.h | 11 +-
.../selftests/kvm/include/x86_64/vmx.h | 27 ++
.../kvm/x86_64/vmx_preemption_timer_test.c | 256 ++++++++++++++++++
13 files changed, 362 insertions(+), 11 deletions(-)
create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
--
2.26.2.761.g0e0b3e54be-goog
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/2] KVM: nVMX: Fix VMX preemption timer migration
2020-05-18 20:15 [PATCH 0/2] Fix VMX preemption timer migration Makarand Sonare
@ 2020-05-18 20:15 ` Makarand Sonare
2020-05-19 0:49 ` Vitaly Kuznetsov
2020-05-19 7:18 ` Krish Sadhukhan
2020-05-18 20:16 ` [PATCH 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare
1 sibling, 2 replies; 8+ messages in thread
From: Makarand Sonare @ 2020-05-18 20:15 UTC (permalink / raw)
To: kvm, pshier, jmattson; +Cc: Makarand Sonare
From: Peter Shier <pshier@google.com>
Add a new field, appended to struct kvm_vmx_nested_state_data, to hold the
preemption timer expiration deadline. This is to prevent the first VM-Enter
after migration from incorrectly restarting the timer with the full timer
value instead of the partially decayed timer value.
KVM_SET_NESTED_STATE restarts the timer using the migrated state regardless
of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state")
Signed-off-by: Peter Shier <pshier@google.com>
Signed-off-by: Makarand Sonare <makarandsonare@google.com>
Change-Id: I6446aba5a547afa667f0ef4620b1b76115bf3753
---
Documentation/virt/kvm/api.rst | 4 ++
arch/x86/include/uapi/asm/kvm.h | 2 +
arch/x86/kvm/vmx/nested.c | 61 ++++++++++++++++++++++++---
arch/x86/kvm/vmx/vmx.h | 1 +
arch/x86/kvm/x86.c | 3 +-
include/uapi/linux/kvm.h | 1 +
tools/arch/x86/include/uapi/asm/kvm.h | 2 +
7 files changed, 67 insertions(+), 7 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d871dacb984e9..b410815772970 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4326,6 +4326,9 @@ Errors:
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
#define KVM_STATE_NESTED_EVMCS 0x00000004
+ /* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
+ #define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
+
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
@@ -4346,6 +4349,7 @@ Errors:
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+ __u64 preemption_timer_deadline;
};
This ioctl copies the vcpu's nested virtualization state from the kernel to
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 3f3f780c8c650..20d5832bab215 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -391,6 +391,7 @@ struct kvm_sync_regs {
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
#define KVM_STATE_NESTED_EVMCS 0x00000004
#define KVM_STATE_NESTED_MTF_PENDING 0x00000008
+#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
@@ -400,6 +401,7 @@ struct kvm_sync_regs {
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+ __u64 preemption_timer_deadline;
};
struct kvm_vmx_nested_state_hdr {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 51ebb60e1533a..badb82a39ac04 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2092,9 +2092,9 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
+ u64 preemption_timeout)
{
- u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
@@ -3353,8 +3353,24 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* the timer.
*/
vmx->nested.preemption_timer_expired = false;
- if (nested_cpu_has_preemption_timer(vmcs12))
- vmx_start_preemption_timer(vcpu);
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ u64 timer_value;
+ u64 l1_tsc_value = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ if (from_vmentry) {
+ timer_value = vmcs12->vmx_preemption_timer_value;
+ vmx->nested.preemption_timer_deadline = timer_value +
+ (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
+ } else {
+ if ((l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) >
+ vmx->nested.preemption_timer_deadline)
+ timer_value = 0;
+ else
+ timer_value = vmx->nested.preemption_timer_deadline -
+ (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
+ }
+ vmx_start_preemption_timer(vcpu, timer_value);
+ }
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -3962,9 +3978,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
if (nested_cpu_has_preemption_timer(vmcs12) &&
- vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ !vmx->nested.nested_run_pending) {
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
vmcs12->vmx_preemption_timer_value =
vmx_get_preemption_timer_value(vcpu);
+ }
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -5939,6 +5957,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (vmx->nested.mtf_pending)
kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
+
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ kvm_state.flags |=
+ KVM_STATE_NESTED_PREEMPTION_TIMER;
+ kvm_state.size = offsetof(struct kvm_nested_state, data.vmx) +
+ offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline);
+ }
}
}
@@ -5970,6 +5995,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
+ BUILD_BUG_ON(sizeof(user_vmx_nested_state->preemption_timer_deadline)
+ != sizeof(vmx->nested.preemption_timer_deadline));
+
/*
* Copy over the full allocated size of vmcs12 rather than just the size
@@ -5985,6 +6013,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
return -EFAULT;
}
+ if (kvm_state.flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
+ if (put_user(vmx->nested.preemption_timer_deadline,
+ &user_vmx_nested_state->preemption_timer_deadline))
+ return -EFAULT;
+ }
+
out:
return kvm_state.size;
}
@@ -6056,7 +6090,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (is_smm(vcpu) ?
(kvm_state->flags &
- (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+ (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING |
+ KVM_STATE_NESTED_PREEMPTION_TIMER))
: kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
@@ -6146,6 +6181,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
}
+ if (kvm_state->flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
+
+ if (kvm_state->size <
+ offsetof(struct kvm_nested_state, data.vmx) +
+ offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline))
+ goto error_guest_mode;
+
+ if (get_user(vmx->nested.preemption_timer_deadline,
+ &user_vmx_nested_state->preemption_timer_deadline)) {
+ ret = -EFAULT;
+ goto error_guest_mode;
+ }
+ }
+
if (nested_vmx_check_controls(vcpu, vmcs12) ||
nested_vmx_check_host_state(vcpu, vmcs12) ||
nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 298ddef79d009..db697400755fb 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -169,6 +169,7 @@ struct nested_vmx {
u16 posted_intr_nv;
struct hrtimer preemption_timer;
+ u64 preemption_timer_deadline;
bool preemption_timer_expired;
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 471fccf7f8501..ba9e62ffbb4cd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3418,6 +3418,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_NESTED_STATE_PREEMPTION_TIMER:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -4626,7 +4627,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (kvm_state.flags &
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
- | KVM_STATE_NESTED_EVMCS))
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_PREEMPTION_TIMER))
break;
/* nested_run_pending implies guest_mode. */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ac9eba0289d1b..0868dce12a715 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_PROTECTED 180
#define KVM_CAP_PPC_SECURE_GUEST 181
#define KVM_CAP_HALT_POLL 182
+#define KVM_CAP_NESTED_STATE_PREEMPTION_TIMER 183
#ifdef KVM_CAP_IRQ_ROUTING
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 3f3f780c8c650..60701178b9cc1 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -391,6 +391,8 @@ struct kvm_sync_regs {
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
#define KVM_STATE_NESTED_EVMCS 0x00000004
#define KVM_STATE_NESTED_MTF_PENDING 0x00000008
+/* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
+#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
--
2.26.2.761.g0e0b3e54be-goog
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/2] KVM: selftests: VMX preemption timer migration test
2020-05-18 20:15 [PATCH 0/2] Fix VMX preemption timer migration Makarand Sonare
2020-05-18 20:15 ` [PATCH 1/2] KVM: nVMX: " Makarand Sonare
@ 2020-05-18 20:16 ` Makarand Sonare
2020-05-19 0:56 ` Vitaly Kuznetsov
2020-05-20 2:14 ` Krish Sadhukhan
1 sibling, 2 replies; 8+ messages in thread
From: Makarand Sonare @ 2020-05-18 20:16 UTC (permalink / raw)
To: kvm, pshier, jmattson; +Cc: Makarand Sonare
When a nested VM with a VMX-preemption timer is migrated, verify that the
nested VM and its parent VM observe the VMX-preemption timer exit close to
the original expiration deadline.
Signed-off-by: Makarand Sonare <makarandsonare@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Change-Id: Ia79337c682ee161399525edc34201fad473fc190
---
tools/arch/x86/include/uapi/asm/kvm.h | 1 +
tools/testing/selftests/kvm/.gitignore | 1 +
tools/testing/selftests/kvm/Makefile | 1 +
.../testing/selftests/kvm/include/kvm_util.h | 2 +
.../selftests/kvm/include/x86_64/processor.h | 11 +-
.../selftests/kvm/include/x86_64/vmx.h | 27 ++
.../kvm/x86_64/vmx_preemption_timer_test.c | 256 ++++++++++++++++++
7 files changed, 295 insertions(+), 4 deletions(-)
create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
index 60701178b9cc1..13dca545554dc 100644
--- a/tools/arch/x86/include/uapi/asm/kvm.h
+++ b/tools/arch/x86/include/uapi/asm/kvm.h
@@ -402,6 +402,7 @@ struct kvm_sync_regs {
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+ __u64 preemption_timer_deadline;
};
struct kvm_vmx_nested_state_hdr {
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 222e50104296a..f159718f90c0a 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -10,6 +10,7 @@
/x86_64/set_sregs_test
/x86_64/smm_test
/x86_64/state_test
+/x86_64/vmx_preemption_timer_test
/x86_64/svm_vmcall_test
/x86_64/sync_regs_test
/x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c66f4eec34111..780f0c189a7bc 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -20,6 +20,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
TEST_GEN_PROGS_x86_64 += x86_64/smm_test
TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index e244c6ecfc1d5..919e161dd2893 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -314,6 +314,8 @@ void ucall_uninit(struct kvm_vm *vm);
void ucall(uint64_t cmd, int nargs, ...);
uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
+ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
#define GUEST_DONE() ucall(UCALL_DONE, 0)
#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 7428513a4c687..7cb19eca6c72d 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 *desc)
static inline uint64_t rdtsc(void)
{
uint32_t eax, edx;
-
+ uint32_t tsc_val;
/*
* The lfence is to wait (on Intel CPUs) until all previous
- * instructions have been executed.
+ * instructions have been executed. If software requires RDTSC to be
+ * executed prior to execution of any subsequent instruction, it can
+ * execute LFENCE immediately after RDTSC
*/
- __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
- return ((uint64_t)edx) << 32 | eax;
+ __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
+ tsc_val = ((uint64_t)edx) << 32 | eax;
+ return tsc_val;
}
static inline uint64_t rdtscp(uint32_t *aux)
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 3d27069b9ed9c..ccff3e6e27048 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -575,6 +575,33 @@ struct vmx_pages {
void *eptp;
};
+union vmx_basic {
+ u64 val;
+ struct {
+ u32 revision;
+ u32 size:13,
+ reserved1:3,
+ width:1,
+ dual:1,
+ type:4,
+ insouts:1,
+ ctrl:1,
+ vm_entry_exception_ctrl:1,
+ reserved2:7;
+ };
+};
+
+union vmx_ctrl_msr {
+ u64 val;
+ struct {
+ u32 set, clr;
+ };
+};
+
+union vmx_basic basic;
+union vmx_ctrl_msr ctrl_pin_rev;
+union vmx_ctrl_msr ctrl_exit_rev;
+
struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
bool prepare_for_vmx_operation(struct vmx_pages *vmx);
void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
new file mode 100644
index 0000000000000..10893b11511be
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX-preemption timer test
+ *
+ * Copyright (C) 2020, Google, LLC.
+ *
+ * Test to ensure the VM-Enter after migration doesn't
+ * incorrectly restarts the timer with the full timer
+ * value instead of partially decayed timer value
+ *
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 5
+#define PREEMPTION_TIMER_VALUE 100000000ull
+#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
+
+u32 vmx_pt_rate;
+bool l2_save_restore_done;
+static u64 l2_vmx_pt_start;
+volatile u64 l2_vmx_pt_finish;
+
+void l2_guest_code(void)
+{
+ u64 vmx_pt_delta;
+
+ vmcall();
+ l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+ //
+ // Wait until the 1st threshold has passed
+ //
+ do {
+ l2_vmx_pt_finish = rdtsc();
+ vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
+ vmx_pt_rate;
+ } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
+
+ //
+ // Force L2 through Save and Restore cycle
+ //
+ GUEST_SYNC(1);
+
+ l2_save_restore_done = 1;
+
+ //
+ // Now wait for the preemption timer to fire and
+ // exit to L1
+ //
+ while ((l2_vmx_pt_finish = rdtsc()))
+ ;
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ u64 l1_vmx_pt_start;
+ u64 l1_vmx_pt_finish;
+ u64 l1_tsc_deadline, l2_tsc_deadline;
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
+
+ //
+ // Check for Preemption timer support and turn on PIN control
+ //
+ basic.val = rdmsr(MSR_IA32_VMX_BASIC);
+ ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
+ : MSR_IA32_VMX_PINBASED_CTLS);
+ ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
+ : MSR_IA32_VMX_EXIT_CTLS);
+
+ if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+ !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+ return;
+
+ GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+ vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+ PIN_BASED_VMX_PREEMPTION_TIMER));
+
+ GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
+ PREEMPTION_TIMER_VALUE));
+
+ vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
+
+ l2_save_restore_done = 0;
+
+ l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+ GUEST_ASSERT(!vmresume());
+
+ l1_vmx_pt_finish = rdtsc();
+
+ //
+ // Ensure exit from L2 happens after L2 goes through
+ // save and restore
+ GUEST_ASSERT(l2_save_restore_done);
+
+ //
+ // Ensure the exit from L2 is due to preemption timer expiry
+ //
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
+
+ l1_tsc_deadline = l1_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ l2_tsc_deadline = l2_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ //
+ // Sync with the host and pass the l1|l2 pt_expiry_finish times and
+ // tsc deadlines so that host can verify they are as expected
+ //
+ GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
+ l2_vmx_pt_finish, l2_tsc_deadline);
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+ if (vmx_pages)
+ l1_guest_code(vmx_pages);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /*
+ * AMD currently does not implement any VMX features, so for now we
+ * just early out.
+ */
+ nested_vmx_check_supported();
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ } else {
+ pr_info("will skip nested state checks\n");
+ goto done;
+ }
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+ //
+ // If this stage 2 then we should verify the vmx pt expiry
+ // is as expected
+ //
+ if (stage == 2) {
+
+ pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
+ stage, uc.args[2], uc.args[3]);
+
+ pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
+ stage, uc.args[4], uc.args[5]);
+
+ //
+ // From L1's perspective verify Preemption timer hasn't
+ // expired too early
+ //
+
+ TEST_ASSERT(uc.args[2] >= uc.args[3],
+ "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
+ stage, uc.args[2], uc.args[3]);
+
+ //
+ // From L2's perspective verify Preemption timer hasn't
+ // expired too late
+ //
+ TEST_ASSERT(uc.args[4] < uc.args[5],
+ "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
+ stage, uc.args[4], uc.args[5]);
+ }
+
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
--
2.26.2.761.g0e0b3e54be-goog
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/2] KVM: nVMX: Fix VMX preemption timer migration
2020-05-18 20:15 ` [PATCH 1/2] KVM: nVMX: " Makarand Sonare
@ 2020-05-19 0:49 ` Vitaly Kuznetsov
2020-05-19 7:18 ` Krish Sadhukhan
1 sibling, 0 replies; 8+ messages in thread
From: Vitaly Kuznetsov @ 2020-05-19 0:49 UTC (permalink / raw)
To: Makarand Sonare; +Cc: kvm, pshier, jmattson
Makarand Sonare <makarandsonare@google.com> writes:
> From: Peter Shier <pshier@google.com>
>
> Add new field to hold preemption timer expiration deadline
> appended to struct kvm_vmx_nested_state_data. This is to prevent
> the first VM-Enter after migration from incorrectly restarting the timer
> with the full timer value instead of partially decayed timer value.
> KVM_SET_NESTED_STATE restarts timer using migrated state regardless
> of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>
> Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state")
>
> Signed-off-by: Peter Shier <pshier@google.com>
> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
> Change-Id: I6446aba5a547afa667f0ef4620b1b76115bf3753
> ---
> Documentation/virt/kvm/api.rst | 4 ++
> arch/x86/include/uapi/asm/kvm.h | 2 +
> arch/x86/kvm/vmx/nested.c | 61 ++++++++++++++++++++++++---
> arch/x86/kvm/vmx/vmx.h | 1 +
> arch/x86/kvm/x86.c | 3 +-
> include/uapi/linux/kvm.h | 1 +
> tools/arch/x86/include/uapi/asm/kvm.h | 2 +
> 7 files changed, 67 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index d871dacb984e9..b410815772970 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4326,6 +4326,9 @@ Errors:
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
>
Not your fault but KVM_STATE_NESTED_MTF_PENDING seems to be missing here
> + /* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> + #define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
> +
> #define KVM_STATE_NESTED_FORMAT_VMX 0
> #define KVM_STATE_NESTED_FORMAT_SVM 1
>
> @@ -4346,6 +4349,7 @@ Errors:
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> This ioctl copies the vcpu's nested virtualization state from the kernel to
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 3f3f780c8c650..20d5832bab215 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -391,6 +391,7 @@ struct kvm_sync_regs {
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
> #define KVM_STATE_NESTED_MTF_PENDING 0x00000008
> +#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
and here you don't have the "/* Available with
KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */" comment you put to api.rst. I
think it would be better to keep them in sync.
>
> #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
> #define KVM_STATE_NESTED_SMM_VMXON 0x00000002
> @@ -400,6 +401,7 @@ struct kvm_sync_regs {
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> struct kvm_vmx_nested_state_hdr {
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 51ebb60e1533a..badb82a39ac04 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2092,9 +2092,9 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
> +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
> + u64 preemption_timeout)
> {
> - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
> struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> /*
> @@ -3353,8 +3353,24 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> * the timer.
> */
> vmx->nested.preemption_timer_expired = false;
> - if (nested_cpu_has_preemption_timer(vmcs12))
> - vmx_start_preemption_timer(vcpu);
> + if (nested_cpu_has_preemption_timer(vmcs12)) {
> + u64 timer_value;
> + u64 l1_tsc_value = kvm_read_l1_tsc(vcpu, rdtsc());
> +
> + if (from_vmentry) {
> + timer_value = vmcs12->vmx_preemption_timer_value;
> + vmx->nested.preemption_timer_deadline = timer_value +
> + (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
> + } else {
> + if ((l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) >
> + vmx->nested.preemption_timer_deadline)
> + timer_value = 0;
> + else
> + timer_value = vmx->nested.preemption_timer_deadline -
> + (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
> + }
> + vmx_start_preemption_timer(vcpu, timer_value);
> + }
>
> /*
> * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
> @@ -3962,9 +3978,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
>
> if (nested_cpu_has_preemption_timer(vmcs12) &&
> - vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> + !vmx->nested.nested_run_pending) {
> + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> vmcs12->vmx_preemption_timer_value =
> vmx_get_preemption_timer_value(vcpu);
> + }
>
> /*
> * In some cases (usually, nested EPT), L2 is allowed to change its
> @@ -5939,6 +5957,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
> if (vmx->nested.mtf_pending)
> kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
> +
> + if (nested_cpu_has_preemption_timer(vmcs12)) {
> + kvm_state.flags |=
> + KVM_STATE_NESTED_PREEMPTION_TIMER;
> + kvm_state.size = offsetof(struct kvm_nested_state, data.vmx) +
> + offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline);
Hm, here we seem to drop all previous calculations for the
kvm_state.size (like if shadow vmcs was present or not). I think this
should just be
kvm_state.size += sizeof(user_vmx_nested_state.preemption_timer_deadline);
instead.
> + }
> }
> }
>
> @@ -5970,6 +5995,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
> BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
> BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
> + BUILD_BUG_ON(sizeof(user_vmx_nested_state->preemption_timer_deadline)
> + != sizeof(vmx->nested.preemption_timer_deadline));
> +
>
> /*
> * Copy over the full allocated size of vmcs12 rather than just the size
> @@ -5985,6 +6013,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
> return -EFAULT;
> }
>
> + if (kvm_state.flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> + if (put_user(vmx->nested.preemption_timer_deadline,
> + &user_vmx_nested_state->preemption_timer_deadline))
> + return -EFAULT;
> + }
> +
> out:
> return kvm_state.size;
> }
> @@ -6056,7 +6090,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
> */
> if (is_smm(vcpu) ?
> (kvm_state->flags &
> - (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
> + (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING |
> + KVM_STATE_NESTED_PREEMPTION_TIMER))
> : kvm_state->hdr.vmx.smm.flags)
> return -EINVAL;
>
> @@ -6146,6 +6181,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
> goto error_guest_mode;
> }
>
> + if (kvm_state->flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> +
> + if (kvm_state->size <
> + offsetof(struct kvm_nested_state, data.vmx) +
> + offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline))
This also doesn't seem to be correct. E.g. what if there is no shadow
vmcs data in the saved state?
> + goto error_guest_mode;
> +
> + if (get_user(vmx->nested.preemption_timer_deadline,
> + &user_vmx_nested_state->preemption_timer_deadline)) {
> + ret = -EFAULT;
> + goto error_guest_mode;
> + }
> + }
> +
> if (nested_vmx_check_controls(vcpu, vmcs12) ||
> nested_vmx_check_host_state(vcpu, vmcs12) ||
> nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 298ddef79d009..db697400755fb 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -169,6 +169,7 @@ struct nested_vmx {
> u16 posted_intr_nv;
>
> struct hrtimer preemption_timer;
> + u64 preemption_timer_deadline;
> bool preemption_timer_expired;
>
> /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 471fccf7f8501..ba9e62ffbb4cd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3418,6 +3418,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_MSR_PLATFORM_INFO:
> case KVM_CAP_EXCEPTION_PAYLOAD:
> case KVM_CAP_SET_GUEST_DEBUG:
> + case KVM_CAP_NESTED_STATE_PREEMPTION_TIMER:
> r = 1;
> break;
> case KVM_CAP_SYNC_REGS:
> @@ -4626,7 +4627,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
> if (kvm_state.flags &
> ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
> - | KVM_STATE_NESTED_EVMCS))
> + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_PREEMPTION_TIMER))
> break;
>
> /* nested_run_pending implies guest_mode. */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ac9eba0289d1b..0868dce12a715 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_S390_PROTECTED 180
> #define KVM_CAP_PPC_SECURE_GUEST 181
> #define KVM_CAP_HALT_POLL 182
> +#define KVM_CAP_NESTED_STATE_PREEMPTION_TIMER 183
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 3f3f780c8c650..60701178b9cc1 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -391,6 +391,8 @@ struct kvm_sync_regs {
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
> #define KVM_STATE_NESTED_MTF_PENDING 0x00000008
> +/* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> +#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
>
> #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
> #define KVM_STATE_NESTED_SMM_VMXON 0x00000002
It seems tools/arch/x86/include/uapi/asm/kvm.h is usually updated
separately (not sure it's the right thing to do though..)
--
Vitaly
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] KVM: selftests: VMX preemption timer migration test
2020-05-18 20:16 ` [PATCH 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare
@ 2020-05-19 0:56 ` Vitaly Kuznetsov
2020-05-20 2:14 ` Krish Sadhukhan
1 sibling, 0 replies; 8+ messages in thread
From: Vitaly Kuznetsov @ 2020-05-19 0:56 UTC (permalink / raw)
To: Makarand Sonare; +Cc: Makarand Sonare, kvm, pshier, jmattson
Makarand Sonare <makarandsonare@google.com> writes:
> When a nested VM with a VMX-preemption timer is migrated, verify that the
> nested VM and its parent VM observe the VMX-preemption timer exit close to
> the original expiration deadline.
>
> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
> Reviewed-by: Jim Mattson <jmattson@google.com>
> Change-Id: Ia79337c682ee161399525edc34201fad473fc190
> ---
> tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> tools/testing/selftests/kvm/.gitignore | 1 +
> tools/testing/selftests/kvm/Makefile | 1 +
> .../testing/selftests/kvm/include/kvm_util.h | 2 +
> .../selftests/kvm/include/x86_64/processor.h | 11 +-
> .../selftests/kvm/include/x86_64/vmx.h | 27 ++
> .../kvm/x86_64/vmx_preemption_timer_test.c | 256 ++++++++++++++++++
> 7 files changed, 295 insertions(+), 4 deletions(-)
> create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 60701178b9cc1..13dca545554dc 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -402,6 +402,7 @@ struct kvm_sync_regs {
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> struct kvm_vmx_nested_state_hdr {
> diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> index 222e50104296a..f159718f90c0a 100644
> --- a/tools/testing/selftests/kvm/.gitignore
> +++ b/tools/testing/selftests/kvm/.gitignore
> @@ -10,6 +10,7 @@
> /x86_64/set_sregs_test
> /x86_64/smm_test
> /x86_64/state_test
> +/x86_64/vmx_preemption_timer_test
> /x86_64/svm_vmcall_test
> /x86_64/sync_regs_test
> /x86_64/vmx_close_while_nested_test
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index c66f4eec34111..780f0c189a7bc 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -20,6 +20,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
> TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
> TEST_GEN_PROGS_x86_64 += x86_64/smm_test
> TEST_GEN_PROGS_x86_64 += x86_64/state_test
> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
> TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
> TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
> TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index e244c6ecfc1d5..919e161dd2893 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -314,6 +314,8 @@ void ucall_uninit(struct kvm_vm *vm);
> void ucall(uint64_t cmd, int nargs, ...);
> uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
>
> +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
> + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
> #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
> #define GUEST_DONE() ucall(UCALL_DONE, 0)
> #define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
> diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
> index 7428513a4c687..7cb19eca6c72d 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
> @@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 *desc)
> static inline uint64_t rdtsc(void)
> {
> uint32_t eax, edx;
> -
> + uint32_t tsc_val;
> /*
> * The lfence is to wait (on Intel CPUs) until all previous
> - * instructions have been executed.
> + * instructions have been executed. If software requires RDTSC to be
> + * executed prior to execution of any subsequent instruction, it can
> + * execute LFENCE immediately after RDTSC
> */
> - __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
> - return ((uint64_t)edx) << 32 | eax;
> + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
> + tsc_val = ((uint64_t)edx) << 32 | eax;
> + return tsc_val;
> }
>
> static inline uint64_t rdtscp(uint32_t *aux)
> diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
> index 3d27069b9ed9c..ccff3e6e27048 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
> @@ -575,6 +575,33 @@ struct vmx_pages {
> void *eptp;
> };
>
> +union vmx_basic {
> + u64 val;
> + struct {
> + u32 revision;
> + u32 size:13,
> + reserved1:3,
> + width:1,
> + dual:1,
> + type:4,
> + insouts:1,
> + ctrl:1,
> + vm_entry_exception_ctrl:1,
> + reserved2:7;
> + };
> +};
> +
> +union vmx_ctrl_msr {
> + u64 val;
> + struct {
> + u32 set, clr;
> + };
> +};
> +
> +union vmx_basic basic;
> +union vmx_ctrl_msr ctrl_pin_rev;
> +union vmx_ctrl_msr ctrl_exit_rev;
> +
> struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
> bool prepare_for_vmx_operation(struct vmx_pages *vmx);
> void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
> new file mode 100644
> index 0000000000000..10893b11511be
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
> @@ -0,0 +1,256 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * VMX-preemption timer test
> + *
> + * Copyright (C) 2020, Google, LLC.
> + *
> + * Test to ensure the VM-Enter after migration doesn't
> + * incorrectly restarts the timer with the full timer
> + * value instead of partially decayed timer value
> + *
> + */
> +#define _GNU_SOURCE /* for program_invocation_short_name */
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +
> +#include "test_util.h"
> +
> +#include "kvm_util.h"
> +#include "processor.h"
> +#include "vmx.h"
> +
> +#define VCPU_ID 5
> +#define PREEMPTION_TIMER_VALUE 100000000ull
> +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
> +
> +u32 vmx_pt_rate;
> +bool l2_save_restore_done;
> +static u64 l2_vmx_pt_start;
> +volatile u64 l2_vmx_pt_finish;
> +
> +void l2_guest_code(void)
> +{
> + u64 vmx_pt_delta;
> +
> + vmcall();
> + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
> +
> + //
> + // Wait until the 1st threshold has passed
> + //
> + do {
> + l2_vmx_pt_finish = rdtsc();
> + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
> + vmx_pt_rate;
> + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
> +
> + //
> + // Force L2 through Save and Restore cycle
> + //
> + GUEST_SYNC(1);
> +
> + l2_save_restore_done = 1;
> +
> + //
> + // Now wait for the preemption timer to fire and
> + // exit to L1
> + //
> + while ((l2_vmx_pt_finish = rdtsc()))
> + ;
> +}
> +
> +void l1_guest_code(struct vmx_pages *vmx_pages)
> +{
> +#define L2_GUEST_STACK_SIZE 64
> + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> + u64 l1_vmx_pt_start;
> + u64 l1_vmx_pt_finish;
> + u64 l1_tsc_deadline, l2_tsc_deadline;
> +
> + GUEST_ASSERT(vmx_pages->vmcs_gpa);
> + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> + GUEST_ASSERT(load_vmcs(vmx_pages));
> + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
> +
> + prepare_vmcs(vmx_pages, l2_guest_code,
> + &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> +
> + GUEST_ASSERT(!vmlaunch());
> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
> +
> + //
> + // Check for Preemption timer support and turn on PIN control
> + //
> + basic.val = rdmsr(MSR_IA32_VMX_BASIC);
> + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
> + : MSR_IA32_VMX_PINBASED_CTLS);
> + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
> + : MSR_IA32_VMX_EXIT_CTLS);
> +
> + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
> + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> + return;
> +
> + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
> + vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
> + PIN_BASED_VMX_PREEMPTION_TIMER));
> +
> + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
> + PREEMPTION_TIMER_VALUE));
> +
> + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
> +
> + l2_save_restore_done = 0;
> +
> + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
> +
> + GUEST_ASSERT(!vmresume());
> +
> + l1_vmx_pt_finish = rdtsc();
> +
> + //
> + // Ensure exit from L2 happens after L2 goes through
> + // save and restore
> + GUEST_ASSERT(l2_save_restore_done);
> +
> + //
> + // Ensure the exit from L2 is due to preemption timer expiry
> + //
> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
> +
> + l1_tsc_deadline = l1_vmx_pt_start +
> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
> +
> + l2_tsc_deadline = l2_vmx_pt_start +
> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
> +
> + //
> + // Sync with the host and pass the l1|l2 pt_expiry_finish times and
> + // tsc deadlines so that host can verify they are as expected
> + //
> + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
> + l2_vmx_pt_finish, l2_tsc_deadline);
> +}
> +
> +void guest_code(struct vmx_pages *vmx_pages)
> +{
> + if (vmx_pages)
> + l1_guest_code(vmx_pages);
> +
> + GUEST_DONE();
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + vm_vaddr_t vmx_pages_gva = 0;
> +
> + struct kvm_regs regs1, regs2;
> + struct kvm_vm *vm;
> + struct kvm_run *run;
> + struct kvm_x86_state *state;
> + struct ucall uc;
> + int stage;
> +
> + /*
> + * AMD currently does not implement any VMX features, so for now we
> + * just early out.
> + */
> + nested_vmx_check_supported();
> +
> + /* Create VM */
> + vm = vm_create_default(VCPU_ID, 0, guest_code);
> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
> + run = vcpu_state(vm, VCPU_ID);
> +
> + vcpu_regs_get(vm, VCPU_ID, &regs1);
> +
> + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
You need to check the newly added KVM_CAP_NESTED_STATE_PREEMPTION_TIMER
instead or skip the whole test.
> + vcpu_alloc_vmx(vm, &vmx_pages_gva);
> + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
> + } else {
> + pr_info("will skip nested state checks\n");
Copy/paste from state_test.c detected :-)
> + goto done;
> + }
> +
> + for (stage = 1;; stage++) {
> + _vcpu_run(vm, VCPU_ID);
> + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> + "Stage %d: unexpected exit reason: %u (%s),\n",
> + stage, run->exit_reason,
> + exit_reason_str(run->exit_reason));
> +
> + switch (get_ucall(vm, VCPU_ID, &uc)) {
> + case UCALL_ABORT:
> + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
> + __FILE__, uc.args[1]);
> + /* NOT REACHED */
> + case UCALL_SYNC:
> + break;
> + case UCALL_DONE:
> + goto done;
> + default:
> + TEST_FAIL("Unknown ucall %lu", uc.cmd);
> + }
> +
> + /* UCALL_SYNC is handled here. */
> + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
> + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
> + stage, (ulong)uc.args[1]);
> + //
> + // If this stage 2 then we should verify the vmx pt expiry
> + // is as expected
> + //
Nitpick: coding style requirements for selftests is definitely lower
than for KVM itself but could you please be consistent with comments and
use kernel style
/*
* This is a comment.
*/
comments exclusively (you seem to have a mix)? Thanks!
> + if (stage == 2) {
> +
> + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
> + stage, uc.args[2], uc.args[3]);
> +
> + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
> + stage, uc.args[4], uc.args[5]);
> +
> + //
> + // From L1's perspective verify Preemption timer hasn't
> + // expired too early
> + //
> +
> + TEST_ASSERT(uc.args[2] >= uc.args[3],
> + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
> + stage, uc.args[2], uc.args[3]);
> +
> + //
> + // From L2's perspective verify Preemption timer hasn't
> + // expired too late
> + //
> + TEST_ASSERT(uc.args[4] < uc.args[5],
> + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
> + stage, uc.args[4], uc.args[5]);
> + }
> +
> + state = vcpu_save_state(vm, VCPU_ID);
> + memset(&regs1, 0, sizeof(regs1));
> + vcpu_regs_get(vm, VCPU_ID, &regs1);
> +
> + kvm_vm_release(vm);
> +
> + /* Restore state in a new VM. */
> + kvm_vm_restart(vm, O_RDWR);
> + vm_vcpu_add(vm, VCPU_ID);
> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
> + vcpu_load_state(vm, VCPU_ID, state);
> + run = vcpu_state(vm, VCPU_ID);
> + free(state);
> +
> + memset(&regs2, 0, sizeof(regs2));
> + vcpu_regs_get(vm, VCPU_ID, &regs2);
> + TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
> + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
> + (ulong) regs2.rdi, (ulong) regs2.rsi);
> + }
> +
> +done:
> + kvm_vm_free(vm);
> +}
--
Vitaly
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 1/2] KVM: nVMX: Fix VMX preemption timer migration
2020-05-18 20:15 ` [PATCH 1/2] KVM: nVMX: " Makarand Sonare
2020-05-19 0:49 ` Vitaly Kuznetsov
@ 2020-05-19 7:18 ` Krish Sadhukhan
1 sibling, 0 replies; 8+ messages in thread
From: Krish Sadhukhan @ 2020-05-19 7:18 UTC (permalink / raw)
To: Makarand Sonare, kvm, pshier, jmattson
On 5/18/20 1:15 PM, Makarand Sonare wrote:
> From: Peter Shier <pshier@google.com>
>
> Add new field to hold preemption timer expiration deadline
> appended to struct kvm_vmx_nested_state_data. This is to prevent
> the first VM-Enter after migration from incorrectly restarting the timer
> with the full timer value instead of partially decayed timer value.
> KVM_SET_NESTED_STATE restarts timer using migrated state regardless
> of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>
> Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state")
>
> Signed-off-by: Peter Shier <pshier@google.com>
> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
> Change-Id: I6446aba5a547afa667f0ef4620b1b76115bf3753
> ---
> Documentation/virt/kvm/api.rst | 4 ++
> arch/x86/include/uapi/asm/kvm.h | 2 +
> arch/x86/kvm/vmx/nested.c | 61 ++++++++++++++++++++++++---
> arch/x86/kvm/vmx/vmx.h | 1 +
> arch/x86/kvm/x86.c | 3 +-
> include/uapi/linux/kvm.h | 1 +
> tools/arch/x86/include/uapi/asm/kvm.h | 2 +
> 7 files changed, 67 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index d871dacb984e9..b410815772970 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4326,6 +4326,9 @@ Errors:
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
>
> + /* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> + #define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
> +
> #define KVM_STATE_NESTED_FORMAT_VMX 0
> #define KVM_STATE_NESTED_FORMAT_SVM 1
>
> @@ -4346,6 +4349,7 @@ Errors:
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> This ioctl copies the vcpu's nested virtualization state from the kernel to
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 3f3f780c8c650..20d5832bab215 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -391,6 +391,7 @@ struct kvm_sync_regs {
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
> #define KVM_STATE_NESTED_MTF_PENDING 0x00000008
> +#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
>
> #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
> #define KVM_STATE_NESTED_SMM_VMXON 0x00000002
> @@ -400,6 +401,7 @@ struct kvm_sync_regs {
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> struct kvm_vmx_nested_state_hdr {
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 51ebb60e1533a..badb82a39ac04 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2092,9 +2092,9 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
> return HRTIMER_NORESTART;
> }
>
> -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
> +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
> + u64 preemption_timeout)
> {
> - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
> struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> /*
> @@ -3353,8 +3353,24 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
> * the timer.
> */
> vmx->nested.preemption_timer_expired = false;
> - if (nested_cpu_has_preemption_timer(vmcs12))
> - vmx_start_preemption_timer(vcpu);
> + if (nested_cpu_has_preemption_timer(vmcs12)) {
> + u64 timer_value;
> + u64 l1_tsc_value = kvm_read_l1_tsc(vcpu, rdtsc());
> +
> + if (from_vmentry) {
> + timer_value = vmcs12->vmx_preemption_timer_value;
> + vmx->nested.preemption_timer_deadline = timer_value +
> + (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
> + } else {
> + if ((l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) >
> + vmx->nested.preemption_timer_deadline)
> + timer_value = 0;
> + else
> + timer_value = vmx->nested.preemption_timer_deadline -
> + (l1_tsc_value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
A couple of code optimizations can be done here:
1. The inside if-else can be simplified by initializing 'timer_value' at
the place where you have defined it:
} else {
+ if (!((l1_tsc_value >>
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) >
+ vmx->nested.preemption_timer_deadline))
+ timer_value = vmx->nested.preemption_timer_deadline -
+ (l1_tsc_value >>
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
+ }
2. You can apply the right-shift operation on 'l1_tsc_value' right where
you have defined it:
u64 l1_tsc_value = kvm_read_l1_tsc(vcpu, rdtsc()) >>
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE
> + }
> + vmx_start_preemption_timer(vcpu, timer_value);
> + }
>
> /*
> * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
> @@ -3962,9 +3978,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
> vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
>
> if (nested_cpu_has_preemption_timer(vmcs12) &&
> - vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> + !vmx->nested.nested_run_pending) {
> + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> vmcs12->vmx_preemption_timer_value =
> vmx_get_preemption_timer_value(vcpu);
> + }
>
> /*
> * In some cases (usually, nested EPT), L2 is allowed to change its
> @@ -5939,6 +5957,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
> if (vmx->nested.mtf_pending)
> kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
> +
> + if (nested_cpu_has_preemption_timer(vmcs12)) {
> + kvm_state.flags |=
> + KVM_STATE_NESTED_PREEMPTION_TIMER;
> + kvm_state.size = offsetof(struct kvm_nested_state, data.vmx) +
'size' is reset here instead of accumulating all previous sizes. Is that
what is intended?
> + offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline);
> + }
> }
> }
>
> @@ -5970,6 +5995,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
> BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
> BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
> + BUILD_BUG_ON(sizeof(user_vmx_nested_state->preemption_timer_deadline)
> + != sizeof(vmx->nested.preemption_timer_deadline));
> +
>
> /*
> * Copy over the full allocated size of vmcs12 rather than just the size
> @@ -5985,6 +6013,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
> return -EFAULT;
> }
>
> + if (kvm_state.flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> + if (put_user(vmx->nested.preemption_timer_deadline,
> + &user_vmx_nested_state->preemption_timer_deadline))
> + return -EFAULT;
> + }
> +
> out:
> return kvm_state.size;
> }
> @@ -6056,7 +6090,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
> */
> if (is_smm(vcpu) ?
> (kvm_state->flags &
> - (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
> + (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING |
> + KVM_STATE_NESTED_PREEMPTION_TIMER))
> : kvm_state->hdr.vmx.smm.flags)
> return -EINVAL;
>
> @@ -6146,6 +6181,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
> goto error_guest_mode;
> }
>
> + if (kvm_state->flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> +
> + if (kvm_state->size <
> + offsetof(struct kvm_nested_state, data.vmx) +
> + offsetofend(struct kvm_vmx_nested_state_data, preemption_timer_deadline))
> + goto error_guest_mode;
> +
> + if (get_user(vmx->nested.preemption_timer_deadline,
> + &user_vmx_nested_state->preemption_timer_deadline)) {
> + ret = -EFAULT;
> + goto error_guest_mode;
> + }
> + }
> +
> if (nested_vmx_check_controls(vcpu, vmcs12) ||
> nested_vmx_check_host_state(vcpu, vmcs12) ||
> nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 298ddef79d009..db697400755fb 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -169,6 +169,7 @@ struct nested_vmx {
> u16 posted_intr_nv;
>
> struct hrtimer preemption_timer;
> + u64 preemption_timer_deadline;
> bool preemption_timer_expired;
>
> /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 471fccf7f8501..ba9e62ffbb4cd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3418,6 +3418,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
> case KVM_CAP_MSR_PLATFORM_INFO:
> case KVM_CAP_EXCEPTION_PAYLOAD:
> case KVM_CAP_SET_GUEST_DEBUG:
> + case KVM_CAP_NESTED_STATE_PREEMPTION_TIMER:
> r = 1;
> break;
> case KVM_CAP_SYNC_REGS:
> @@ -4626,7 +4627,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
> if (kvm_state.flags &
> ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
> - | KVM_STATE_NESTED_EVMCS))
> + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_PREEMPTION_TIMER))
> break;
>
> /* nested_run_pending implies guest_mode. */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ac9eba0289d1b..0868dce12a715 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_S390_PROTECTED 180
> #define KVM_CAP_PPC_SECURE_GUEST 181
> #define KVM_CAP_HALT_POLL 182
> +#define KVM_CAP_NESTED_STATE_PREEMPTION_TIMER 183
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 3f3f780c8c650..60701178b9cc1 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -391,6 +391,8 @@ struct kvm_sync_regs {
> #define KVM_STATE_NESTED_RUN_PENDING 0x00000002
> #define KVM_STATE_NESTED_EVMCS 0x00000004
> #define KVM_STATE_NESTED_MTF_PENDING 0x00000008
> +/* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> +#define KVM_STATE_NESTED_PREEMPTION_TIMER 0x00000010
>
> #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
> #define KVM_STATE_NESTED_SMM_VMXON 0x00000002
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] KVM: selftests: VMX preemption timer migration test
2020-05-18 20:16 ` [PATCH 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare
2020-05-19 0:56 ` Vitaly Kuznetsov
@ 2020-05-20 2:14 ` Krish Sadhukhan
2020-05-20 23:18 ` Makarand Sonare
1 sibling, 1 reply; 8+ messages in thread
From: Krish Sadhukhan @ 2020-05-20 2:14 UTC (permalink / raw)
To: Makarand Sonare, kvm, pshier, jmattson
On 5/18/20 1:16 PM, Makarand Sonare wrote:
> When a nested VM with a VMX-preemption timer is migrated, verify that the
> nested VM and its parent VM observe the VMX-preemption timer exit close to
> the original expiration deadline.
>
> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
> Reviewed-by: Jim Mattson <jmattson@google.com>
> Change-Id: Ia79337c682ee161399525edc34201fad473fc190
> ---
> tools/arch/x86/include/uapi/asm/kvm.h | 1 +
> tools/testing/selftests/kvm/.gitignore | 1 +
> tools/testing/selftests/kvm/Makefile | 1 +
> .../testing/selftests/kvm/include/kvm_util.h | 2 +
> .../selftests/kvm/include/x86_64/processor.h | 11 +-
> .../selftests/kvm/include/x86_64/vmx.h | 27 ++
> .../kvm/x86_64/vmx_preemption_timer_test.c | 256 ++++++++++++++++++
> 7 files changed, 295 insertions(+), 4 deletions(-)
> create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 60701178b9cc1..13dca545554dc 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -402,6 +402,7 @@ struct kvm_sync_regs {
> struct kvm_vmx_nested_state_data {
> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> + __u64 preemption_timer_deadline;
> };
>
> struct kvm_vmx_nested_state_hdr {
> diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
> index 222e50104296a..f159718f90c0a 100644
> --- a/tools/testing/selftests/kvm/.gitignore
> +++ b/tools/testing/selftests/kvm/.gitignore
> @@ -10,6 +10,7 @@
> /x86_64/set_sregs_test
> /x86_64/smm_test
> /x86_64/state_test
> +/x86_64/vmx_preemption_timer_test
> /x86_64/svm_vmcall_test
> /x86_64/sync_regs_test
> /x86_64/vmx_close_while_nested_test
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index c66f4eec34111..780f0c189a7bc 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -20,6 +20,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
> TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
> TEST_GEN_PROGS_x86_64 += x86_64/smm_test
> TEST_GEN_PROGS_x86_64 += x86_64/state_test
> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
> TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
> TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
> TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index e244c6ecfc1d5..919e161dd2893 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -314,6 +314,8 @@ void ucall_uninit(struct kvm_vm *vm);
> void ucall(uint64_t cmd, int nargs, ...);
> uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
>
> +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
> + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
> #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
> #define GUEST_DONE() ucall(UCALL_DONE, 0)
> #define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
> diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
> index 7428513a4c687..7cb19eca6c72d 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
> @@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct desc64 *desc)
> static inline uint64_t rdtsc(void)
> {
> uint32_t eax, edx;
> -
> + uint32_t tsc_val;
> /*
> * The lfence is to wait (on Intel CPUs) until all previous
> - * instructions have been executed.
> + * instructions have been executed. If software requires RDTSC to be
> + * executed prior to execution of any subsequent instruction, it can
> + * execute LFENCE immediately after RDTSC
> */
> - __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
> - return ((uint64_t)edx) << 32 | eax;
> + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
> + tsc_val = ((uint64_t)edx) << 32 | eax;
> + return tsc_val;
> }
>
> static inline uint64_t rdtscp(uint32_t *aux)
> diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
> index 3d27069b9ed9c..ccff3e6e27048 100644
> --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
> +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
> @@ -575,6 +575,33 @@ struct vmx_pages {
> void *eptp;
> };
>
> +union vmx_basic {
> + u64 val;
> + struct {
> + u32 revision;
> + u32 size:13,
> + reserved1:3,
> + width:1,
> + dual:1,
> + type:4,
> + insouts:1,
> + ctrl:1,
> + vm_entry_exception_ctrl:1,
> + reserved2:7;
> + };
> +};
> +
> +union vmx_ctrl_msr {
> + u64 val;
> + struct {
> + u32 set, clr;
> + };
> +};
> +
> +union vmx_basic basic;
> +union vmx_ctrl_msr ctrl_pin_rev;
> +union vmx_ctrl_msr ctrl_exit_rev;
> +
> struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
> bool prepare_for_vmx_operation(struct vmx_pages *vmx);
> void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
> diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
> new file mode 100644
> index 0000000000000..10893b11511be
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
> @@ -0,0 +1,256 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * VMX-preemption timer test
> + *
> + * Copyright (C) 2020, Google, LLC.
> + *
> + * Test to ensure the VM-Enter after migration doesn't
> + * incorrectly restarts the timer with the full timer
> + * value instead of partially decayed timer value
> + *
> + */
> +#define _GNU_SOURCE /* for program_invocation_short_name */
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +
> +#include "test_util.h"
> +
> +#include "kvm_util.h"
> +#include "processor.h"
> +#include "vmx.h"
> +
> +#define VCPU_ID 5
> +#define PREEMPTION_TIMER_VALUE 100000000ull
> +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
> +
> +u32 vmx_pt_rate;
> +bool l2_save_restore_done;
> +static u64 l2_vmx_pt_start;
> +volatile u64 l2_vmx_pt_finish;
> +
> +void l2_guest_code(void)
> +{
> + u64 vmx_pt_delta;
> +
> + vmcall();
> + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
> +
> + //
> + // Wait until the 1st threshold has passed
> + //
> + do {
> + l2_vmx_pt_finish = rdtsc();
> + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
> + vmx_pt_rate;
> + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
> +
> + //
> + // Force L2 through Save and Restore cycle
> + //
> + GUEST_SYNC(1);
> +
> + l2_save_restore_done = 1;
> +
> + //
> + // Now wait for the preemption timer to fire and
> + // exit to L1
> + //
> + while ((l2_vmx_pt_finish = rdtsc()))
> + ;
> +}
> +
> +void l1_guest_code(struct vmx_pages *vmx_pages)
> +{
> +#define L2_GUEST_STACK_SIZE 64
> + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
> + u64 l1_vmx_pt_start;
> + u64 l1_vmx_pt_finish;
> + u64 l1_tsc_deadline, l2_tsc_deadline;
> +
> + GUEST_ASSERT(vmx_pages->vmcs_gpa);
> + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
> + GUEST_ASSERT(load_vmcs(vmx_pages));
> + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
> +
> + prepare_vmcs(vmx_pages, l2_guest_code,
> + &l2_guest_stack[L2_GUEST_STACK_SIZE]);
> +
> + GUEST_ASSERT(!vmlaunch());
> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
> + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
Please use vmcs_read(VM_EXIT_INSTRUCTION_LEN) instead of 3.
> +
> + //
> + // Check for Preemption timer support and turn on PIN control
> + //
> + basic.val = rdmsr(MSR_IA32_VMX_BASIC);
> + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
> + : MSR_IA32_VMX_PINBASED_CTLS);
> + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
> + : MSR_IA32_VMX_EXIT_CTLS);
> +
> + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
> + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
> + return;
> +
Why not do this check in the beginning of L1, before VMLAUNCH of L2 ? If
these two controls are not supported, the test is just void.
> + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
> + vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
> + PIN_BASED_VMX_PREEMPTION_TIMER));
> +
> + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
> + PREEMPTION_TIMER_VALUE));
> +
> + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
> +
> + l2_save_restore_done = 0;
> +
> + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
> +
> + GUEST_ASSERT(!vmresume());
> +
> + l1_vmx_pt_finish = rdtsc();
> +
> + //
> + // Ensure exit from L2 happens after L2 goes through
> + // save and restore
> + GUEST_ASSERT(l2_save_restore_done);
> +
> + //
> + // Ensure the exit from L2 is due to preemption timer expiry
> + //
> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
I am wondering if checking just the EXIT reason is enough to determine
that L2 has gone through a successful migration.
> +
> + l1_tsc_deadline = l1_vmx_pt_start +
> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
> +
> + l2_tsc_deadline = l2_vmx_pt_start +
> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
> +
> + //
> + // Sync with the host and pass the l1|l2 pt_expiry_finish times and
> + // tsc deadlines so that host can verify they are as expected
> + //
> + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
> + l2_vmx_pt_finish, l2_tsc_deadline);
> +}
> +
> +void guest_code(struct vmx_pages *vmx_pages)
> +{
> + if (vmx_pages)
> + l1_guest_code(vmx_pages);
> +
> + GUEST_DONE();
> +}
> +
> +int main(int argc, char *argv[])
> +{
> + vm_vaddr_t vmx_pages_gva = 0;
> +
> + struct kvm_regs regs1, regs2;
> + struct kvm_vm *vm;
> + struct kvm_run *run;
> + struct kvm_x86_state *state;
> + struct ucall uc;
> + int stage;
> +
> + /*
> + * AMD currently does not implement any VMX features, so for now we
> + * just early out.
> + */
> + nested_vmx_check_supported();
> +
> + /* Create VM */
> + vm = vm_create_default(VCPU_ID, 0, guest_code);
> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
> + run = vcpu_state(vm, VCPU_ID);
> +
> + vcpu_regs_get(vm, VCPU_ID, &regs1);
> +
> + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
> + vcpu_alloc_vmx(vm, &vmx_pages_gva);
> + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
> + } else {
> + pr_info("will skip nested state checks\n");
This error message looks odd. Shouldn't it say something like,
"nested state capability not available, skipping test"
> + goto done;
> + }
> +
> + for (stage = 1;; stage++) {
> + _vcpu_run(vm, VCPU_ID);
> + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
> + "Stage %d: unexpected exit reason: %u (%s),\n",
> + stage, run->exit_reason,
> + exit_reason_str(run->exit_reason));
> +
> + switch (get_ucall(vm, VCPU_ID, &uc)) {
> + case UCALL_ABORT:
> + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
> + __FILE__, uc.args[1]);
> + /* NOT REACHED */
> + case UCALL_SYNC:
> + break;
> + case UCALL_DONE:
> + goto done;
> + default:
> + TEST_FAIL("Unknown ucall %lu", uc.cmd);
> + }
> +
> + /* UCALL_SYNC is handled here. */
> + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
> + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
> + stage, (ulong)uc.args[1]);
> + //
> + // If this is stage 2 then we should verify the vmx pt expiry
> + // is as expected
> + //
> + if (stage == 2) {
> +
> + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
> + stage, uc.args[2], uc.args[3]);
> +
> + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
> + stage, uc.args[4], uc.args[5]);
> +
> + //
> + // From L1's perspective verify Preemption timer hasn't
> + // expired too early
> + //
> +
> + TEST_ASSERT(uc.args[2] >= uc.args[3],
> + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
> + stage, uc.args[2], uc.args[3]);
> +
> + //
> + // From L2's perspective verify Preemption timer hasn't
> + // expired too late
> + //
> + TEST_ASSERT(uc.args[4] < uc.args[5],
> + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
> + stage, uc.args[4], uc.args[5]);
> + }
> +
For readability, it's better to put a block comment here for the entire
save/restore operation as a whole.
Also, this save/restore block should be placed before the above stage2
block.
> + state = vcpu_save_state(vm, VCPU_ID);
> + memset(&regs1, 0, sizeof(regs1));
> + vcpu_regs_get(vm, VCPU_ID, &regs1);
> +
> + kvm_vm_release(vm);
> +
> + /* Restore state in a new VM. */
> + kvm_vm_restart(vm, O_RDWR);
> + vm_vcpu_add(vm, VCPU_ID);
> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
> + vcpu_load_state(vm, VCPU_ID, state);
> + run = vcpu_state(vm, VCPU_ID);
> + free(state);
> +
> + memset(&regs2, 0, sizeof(regs2));
> + vcpu_regs_get(vm, VCPU_ID, &regs2);
> + TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
> + "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
> + (ulong) regs2.rdi, (ulong) regs2.rsi);
> + }
> +
> +done:
> + kvm_vm_free(vm);
> +}
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] KVM: selftests: VMX preemption timer migration test
2020-05-20 2:14 ` Krish Sadhukhan
@ 2020-05-20 23:18 ` Makarand Sonare
0 siblings, 0 replies; 8+ messages in thread
From: Makarand Sonare @ 2020-05-20 23:18 UTC (permalink / raw)
To: Krish Sadhukhan; +Cc: kvm, pshier, jmattson
> On 5/18/20 1:16 PM, Makarand Sonare wrote:
>> When a nested VM with a VMX-preemption timer is migrated, verify that the
>> nested VM and its parent VM observe the VMX-preemption timer exit close
>> to
>> the original expiration deadline.
>>
>> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
>> Reviewed-by: Jim Mattson <jmattson@google.com>
>> Change-Id: Ia79337c682ee161399525edc34201fad473fc190
>> ---
>> tools/arch/x86/include/uapi/asm/kvm.h | 1 +
>> tools/testing/selftests/kvm/.gitignore | 1 +
>> tools/testing/selftests/kvm/Makefile | 1 +
>> .../testing/selftests/kvm/include/kvm_util.h | 2 +
>> .../selftests/kvm/include/x86_64/processor.h | 11 +-
>> .../selftests/kvm/include/x86_64/vmx.h | 27 ++
>> .../kvm/x86_64/vmx_preemption_timer_test.c | 256 ++++++++++++++++++
>> 7 files changed, 295 insertions(+), 4 deletions(-)
>> create mode 100644
>> tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>>
>> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h
>> b/tools/arch/x86/include/uapi/asm/kvm.h
>> index 60701178b9cc1..13dca545554dc 100644
>> --- a/tools/arch/x86/include/uapi/asm/kvm.h
>> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
>> @@ -402,6 +402,7 @@ struct kvm_sync_regs {
>> struct kvm_vmx_nested_state_data {
>> __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
>> __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
>> + __u64 preemption_timer_deadline;
>> };
>>
>> struct kvm_vmx_nested_state_hdr {
>> diff --git a/tools/testing/selftests/kvm/.gitignore
>> b/tools/testing/selftests/kvm/.gitignore
>> index 222e50104296a..f159718f90c0a 100644
>> --- a/tools/testing/selftests/kvm/.gitignore
>> +++ b/tools/testing/selftests/kvm/.gitignore
>> @@ -10,6 +10,7 @@
>> /x86_64/set_sregs_test
>> /x86_64/smm_test
>> /x86_64/state_test
>> +/x86_64/vmx_preemption_timer_test
>> /x86_64/svm_vmcall_test
>> /x86_64/sync_regs_test
>> /x86_64/vmx_close_while_nested_test
>> diff --git a/tools/testing/selftests/kvm/Makefile
>> b/tools/testing/selftests/kvm/Makefile
>> index c66f4eec34111..780f0c189a7bc 100644
>> --- a/tools/testing/selftests/kvm/Makefile
>> +++ b/tools/testing/selftests/kvm/Makefile
>> @@ -20,6 +20,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
>> TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
>> TEST_GEN_PROGS_x86_64 += x86_64/smm_test
>> TEST_GEN_PROGS_x86_64 += x86_64/state_test
>> +TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
>> TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
>> TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
>> TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
>> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h
>> b/tools/testing/selftests/kvm/include/kvm_util.h
>> index e244c6ecfc1d5..919e161dd2893 100644
>> --- a/tools/testing/selftests/kvm/include/kvm_util.h
>> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
>> @@ -314,6 +314,8 @@ void ucall_uninit(struct kvm_vm *vm);
>> void ucall(uint64_t cmd, int nargs, ...);
>> uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall
>> *uc);
>>
>> +#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
>> + ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
>> #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
>> #define GUEST_DONE() ucall(UCALL_DONE, 0)
>> #define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
>> diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h
>> b/tools/testing/selftests/kvm/include/x86_64/processor.h
>> index 7428513a4c687..7cb19eca6c72d 100644
>> --- a/tools/testing/selftests/kvm/include/x86_64/processor.h
>> +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
>> @@ -79,13 +79,16 @@ static inline uint64_t get_desc64_base(const struct
>> desc64 *desc)
>> static inline uint64_t rdtsc(void)
>> {
>> uint32_t eax, edx;
>> -
>> + uint32_t tsc_val;
>> /*
>> * The lfence is to wait (on Intel CPUs) until all previous
>> - * instructions have been executed.
>> + * instructions have been executed. If software requires RDTSC to be
>> + * executed prior to execution of any subsequent instruction, it can
>> + * execute LFENCE immediately after RDTSC
>> */
>> - __asm__ __volatile__("lfence; rdtsc" : "=a"(eax), "=d"(edx));
>> - return ((uint64_t)edx) << 32 | eax;
>> + __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(eax), "=d"(edx));
>> + tsc_val = ((uint64_t)edx) << 32 | eax;
>> + return tsc_val;
>> }
>>
>> static inline uint64_t rdtscp(uint32_t *aux)
>> diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h
>> b/tools/testing/selftests/kvm/include/x86_64/vmx.h
>> index 3d27069b9ed9c..ccff3e6e27048 100644
>> --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
>> +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
>> @@ -575,6 +575,33 @@ struct vmx_pages {
>> void *eptp;
>> };
>>
>> +union vmx_basic {
>> + u64 val;
>> + struct {
>> + u32 revision;
>> + u32 size:13,
>> + reserved1:3,
>> + width:1,
>> + dual:1,
>> + type:4,
>> + insouts:1,
>> + ctrl:1,
>> + vm_entry_exception_ctrl:1,
>> + reserved2:7;
>> + };
>> +};
>> +
>> +union vmx_ctrl_msr {
>> + u64 val;
>> + struct {
>> + u32 set, clr;
>> + };
>> +};
>> +
>> +union vmx_basic basic;
>> +union vmx_ctrl_msr ctrl_pin_rev;
>> +union vmx_ctrl_msr ctrl_exit_rev;
>> +
>> struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t
>> *p_vmx_gva);
>> bool prepare_for_vmx_operation(struct vmx_pages *vmx);
>> void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void
>> *guest_rsp);
>> diff --git
>> a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>> b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>> new file mode 100644
>> index 0000000000000..10893b11511be
>> --- /dev/null
>> +++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
>> @@ -0,0 +1,256 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * VMX-preemption timer test
>> + *
>> + * Copyright (C) 2020, Google, LLC.
>> + *
>> + * Test to ensure the VM-Enter after migration doesn't
>> + * incorrectly restarts the timer with the full timer
>> + * value instead of partially decayed timer value
>> + *
>> + */
>> +#define _GNU_SOURCE /* for program_invocation_short_name */
>> +#include <fcntl.h>
>> +#include <stdio.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/ioctl.h>
>> +
>> +#include "test_util.h"
>> +
>> +#include "kvm_util.h"
>> +#include "processor.h"
>> +#include "vmx.h"
>> +
>> +#define VCPU_ID 5
>> +#define PREEMPTION_TIMER_VALUE 100000000ull
>> +#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
>> +
>> +u32 vmx_pt_rate;
>> +bool l2_save_restore_done;
>> +static u64 l2_vmx_pt_start;
>> +volatile u64 l2_vmx_pt_finish;
>> +
>> +void l2_guest_code(void)
>> +{
>> + u64 vmx_pt_delta;
>> +
>> + vmcall();
>> + l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
>> +
>> + //
>> + // Wait until the 1st threshold has passed
>> + //
>> + do {
>> + l2_vmx_pt_finish = rdtsc();
>> + vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
>> + vmx_pt_rate;
>> + } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
>> +
>> + //
>> + // Force L2 through Save and Restore cycle
>> + //
>> + GUEST_SYNC(1);
>> +
>> + l2_save_restore_done = 1;
>> +
>> + //
>> + // Now wait for the preemption timer to fire and
>> + // exit to L1
>> + //
>> + while ((l2_vmx_pt_finish = rdtsc()))
>> + ;
>> +}
>> +
>> +void l1_guest_code(struct vmx_pages *vmx_pages)
>> +{
>> +#define L2_GUEST_STACK_SIZE 64
>> + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
>> + u64 l1_vmx_pt_start;
>> + u64 l1_vmx_pt_finish;
>> + u64 l1_tsc_deadline, l2_tsc_deadline;
>> +
>> + GUEST_ASSERT(vmx_pages->vmcs_gpa);
>> + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
>> + GUEST_ASSERT(load_vmcs(vmx_pages));
>> + GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
>> +
>> + prepare_vmcs(vmx_pages, l2_guest_code,
>> + &l2_guest_stack[L2_GUEST_STACK_SIZE]);
>> +
>> + GUEST_ASSERT(!vmlaunch());
>> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
>> + vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
>
>
> Please use vmcs_read(VM_EXIT_INSTRUCTION_LEN) instead of 3.
>
>> +
>> + //
>> + // Check for Preemption timer support and turn on PIN control
>> + //
>> + basic.val = rdmsr(MSR_IA32_VMX_BASIC);
>> + ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
>> + : MSR_IA32_VMX_PINBASED_CTLS);
>> + ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
>> + : MSR_IA32_VMX_EXIT_CTLS);
>> +
>> + if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
>> + !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> + return;
>> +
>
>
> Why not do this check in the beginning of L1, before VMLAUNCH of L2 ? If
> these two controls are not supported, the test is just void.
>
>> + GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
>> + vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
>> + PIN_BASED_VMX_PREEMPTION_TIMER));
>> +
>> + GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
>> + PREEMPTION_TIMER_VALUE));
>> +
>> + vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F;
>> +
>> + l2_save_restore_done = 0;
>> +
>> + l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
>> +
>> + GUEST_ASSERT(!vmresume());
>> +
>> + l1_vmx_pt_finish = rdtsc();
>> +
>> + //
>> + // Ensure exit from L2 happens after L2 goes through
>> + // save and restore
>> + GUEST_ASSERT(l2_save_restore_done);
>> +
>> + //
>> + // Ensure the exit from L2 is due to preemption timer expiry
>> + //
>> + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
>
>
> I am wondering if checking just the EXIT reason is enough to determine
> that L2 has gone through a successful migration.
>
Yes. But I found it hugely helpful to pass the date to L0 for logging
in case of failures.
>> +
>> + l1_tsc_deadline = l1_vmx_pt_start +
>> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
>> +
>> + l2_tsc_deadline = l2_vmx_pt_start +
>> + (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
>> +
>> + //
>> + // Sync with the host and pass the l1|l2 pt_expiry_finish times and
>> + // tsc deadlines so that host can verify they are as expected
>> + //
>> + GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
>> + l2_vmx_pt_finish, l2_tsc_deadline);
>> +}
>> +
>> +void guest_code(struct vmx_pages *vmx_pages)
>> +{
>> + if (vmx_pages)
>> + l1_guest_code(vmx_pages);
>> +
>> + GUEST_DONE();
>> +}
>> +
>> +int main(int argc, char *argv[])
>> +{
>> + vm_vaddr_t vmx_pages_gva = 0;
>> +
>> + struct kvm_regs regs1, regs2;
>> + struct kvm_vm *vm;
>> + struct kvm_run *run;
>> + struct kvm_x86_state *state;
>> + struct ucall uc;
>> + int stage;
>> +
>> + /*
>> + * AMD currently does not implement any VMX features, so for now we
>> + * just early out.
>> + */
>> + nested_vmx_check_supported();
>> +
>> + /* Create VM */
>> + vm = vm_create_default(VCPU_ID, 0, guest_code);
>> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
>> + run = vcpu_state(vm, VCPU_ID);
>> +
>> + vcpu_regs_get(vm, VCPU_ID, &regs1);
>> +
>> + if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
>> + vcpu_alloc_vmx(vm, &vmx_pages_gva);
>> + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
>> + } else {
>> + pr_info("will skip nested state checks\n");
>
>
> This error message looks odd. Shouldn't it say something like,
>
> "nested state capability not available, skipping test"
>
>> + goto done;
>> + }
>> +
>> + for (stage = 1;; stage++) {
>> + _vcpu_run(vm, VCPU_ID);
>> + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
>> + "Stage %d: unexpected exit reason: %u (%s),\n",
>> + stage, run->exit_reason,
>> + exit_reason_str(run->exit_reason));
>> +
>> + switch (get_ucall(vm, VCPU_ID, &uc)) {
>> + case UCALL_ABORT:
>> + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
>> + __FILE__, uc.args[1]);
>> + /* NOT REACHED */
>> + case UCALL_SYNC:
>> + break;
>> + case UCALL_DONE:
>> + goto done;
>> + default:
>> + TEST_FAIL("Unknown ucall %lu", uc.cmd);
>> + }
>> +
>> + /* UCALL_SYNC is handled here. */
>> + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
>> + uc.args[1] == stage, "Stage %d: Unexpected register values vmexit,
>> got %lx",
>> + stage, (ulong)uc.args[1]);
>> + //
>> + // If this is stage 2 then we should verify the vmx pt expiry
>> + // is as expected
>> + //
>> + if (stage == 2) {
>> +
>> + pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
>> + stage, uc.args[2], uc.args[3]);
>> +
>> + pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
>> + stage, uc.args[4], uc.args[5]);
>> +
>> + //
>> + // From L1's perspective verify Preemption timer hasn't
>> + // expired too early
>> + //
>> +
>> + TEST_ASSERT(uc.args[2] >= uc.args[3],
>> + "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
>> + stage, uc.args[2], uc.args[3]);
>> +
>> + //
>> + // From L2's perspective verify Preemption timer hasn't
>> + // expired too late
>> + //
>> + TEST_ASSERT(uc.args[4] < uc.args[5],
>> + "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
>> + stage, uc.args[4], uc.args[5]);
>> + }
>> +
>
>
> For readability, it's better to put a block comment here for the entire
> save/restore operation as a whole.
>
> Also, this save/restore block should be placed before the above stage2
> block.
>
>> + state = vcpu_save_state(vm, VCPU_ID);
>> + memset(&regs1, 0, sizeof(regs1));
>> + vcpu_regs_get(vm, VCPU_ID, &regs1);
>> +
>> + kvm_vm_release(vm);
>> +
>> + /* Restore state in a new VM. */
>> + kvm_vm_restart(vm, O_RDWR);
>> + vm_vcpu_add(vm, VCPU_ID);
>> + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
>> + vcpu_load_state(vm, VCPU_ID, state);
>> + run = vcpu_state(vm, VCPU_ID);
>> + free(state);
>> +
>> + memset(&regs2, 0, sizeof(regs2));
>> + vcpu_regs_get(vm, VCPU_ID, &regs2);
>> + TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
>> + "Unexpected register values after vcpu_load_state; rdi: %lx rsi:
>> %lx",
>> + (ulong) regs2.rdi, (ulong) regs2.rsi);
>> + }
>> +
>> +done:
>> + kvm_vm_free(vm);
>> +}
>
--
Thanks,
Makarand Sonare
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2020-05-20 23:18 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-18 20:15 [PATCH 0/2] Fix VMX preemption timer migration Makarand Sonare
2020-05-18 20:15 ` [PATCH 1/2] KVM: nVMX: " Makarand Sonare
2020-05-19 0:49 ` Vitaly Kuznetsov
2020-05-19 7:18 ` Krish Sadhukhan
2020-05-18 20:16 ` [PATCH 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare
2020-05-19 0:56 ` Vitaly Kuznetsov
2020-05-20 2:14 ` Krish Sadhukhan
2020-05-20 23:18 ` Makarand Sonare
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.