All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] KVM: Reduce TSC scaling race condition mess
@ 2022-02-25 14:53 David Woodhouse
  2022-02-25 14:53 ` [PATCH 1/3] KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl David Woodhouse
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: David Woodhouse @ 2022-02-25 14:53 UTC (permalink / raw)
  To: kvm, Paolo Bonzini, Sean Christopherson
  Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
	linux-kernel, Suleiman Souhlal, Anton Romanov

As Sean says, we punt on a proper fix... but at least now I can create a
bunch of vCPUs with a scaled TSC and their TSC synchronization isn't 
*utterly* hosed, so the test case I posted last night is actually passing.

It would still be nice for the original test case to work, where each
vCPU thread creates its own vCPU, then sets the scale and the current
TSC value. But this resolves most sane use cases.

David Woodhouse (2):
      KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl.
      KVM: x86: Test case for TSC scaling and offset sync

Sean Christopherson (1):
      KVM: x86: Don't snapshot "max" TSC if host TSC is constant

 Documentation/virt/kvm/api.rst                        |  11 +++--
 arch/x86/include/asm/kvm_host.h                       |   2 +
 arch/x86/kvm/x86.c                                    |  54 ++++++++++++++++------
 include/uapi/linux/kvm.h                              |   4 +-
 tools/testing/selftests/kvm/Makefile                  |   1 +
 tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 171 insertions(+), 20 deletions(-)



^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl.
  2022-02-25 14:53 [PATCH 0/3] KVM: Reduce TSC scaling race condition mess David Woodhouse
@ 2022-02-25 14:53 ` David Woodhouse
  2022-02-25 14:53 ` [PATCH 2/3] KVM: x86: Don't snapshot "max" TSC if host TSC is constant David Woodhouse
  2022-02-25 14:53 ` [PATCH 3/3] KVM: x86: Test case for TSC scaling and offset sync David Woodhouse
  2 siblings, 0 replies; 4+ messages in thread
From: David Woodhouse @ 2022-02-25 14:53 UTC (permalink / raw)
  To: kvm, Paolo Bonzini, Sean Christopherson
  Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
	linux-kernel, Suleiman Souhlal, Anton Romanov

From: David Woodhouse <dwmw@amazon.co.uk>

This sets the default TSC frequency for subsequently created vCPUs.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 Documentation/virt/kvm/api.rst  | 11 +++++++----
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 26 +++++++++++++++++++++++++-
 include/uapi/linux/kvm.h        |  4 +++-
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 046b386f6ce3..982fcdf8cfa8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1903,22 +1903,25 @@ the future.
 4.55 KVM_SET_TSC_KHZ
 --------------------
 
-:Capability: KVM_CAP_TSC_CONTROL
+:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
 :Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
 :Parameters: virtual tsc_khz
 :Returns: 0 on success, -1 on error
 
 Specifies the tsc frequency for the virtual machine. The unit of the
 frequency is KHz.
 
+If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
+be used as a vm ioctl to set the initial tsc frequency of subsequently
+created vCPUs.
 
 4.56 KVM_GET_TSC_KHZ
 --------------------
 
-:Capability: KVM_CAP_GET_TSC_KHZ
+:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
 :Architectures: x86
-:Type: vcpu ioctl
+:Type: vcpu ioctl / vm ioctl
 :Parameters: none
 :Returns: virtual tsc-khz on success, negative value on error
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a3385db39d3e..e4696a578f41 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1119,6 +1119,8 @@ struct kvm_arch {
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
 
+	u32 default_tsc_khz;
+
 	seqcount_raw_spinlock_t pvclock_sc;
 	bool use_master_clock;
 	u64 master_kernel_ns;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83accd3e7502..bb3e9916229a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4308,6 +4308,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = boot_cpu_has(X86_FEATURE_XSAVE);
 		break;
 	case KVM_CAP_TSC_CONTROL:
+	case KVM_CAP_VM_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
 	case KVM_CAP_X2APIC_API:
@@ -6483,6 +6484,28 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_CLOCK:
 		r = kvm_vm_ioctl_get_clock(kvm, argp);
 		break;
+	case KVM_SET_TSC_KHZ: {
+		u32 user_tsc_khz;
+
+		r = -EINVAL;
+		user_tsc_khz = (u32)arg;
+
+		if (kvm_has_tsc_control &&
+		    user_tsc_khz >= kvm_max_guest_tsc_khz)
+			goto out;
+
+		if (user_tsc_khz == 0)
+			user_tsc_khz = tsc_khz;
+
+		WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+		r = 0;
+
+		goto out;
+	}
+	case KVM_GET_TSC_KHZ: {
+		r = READ_ONCE(kvm->arch.default_tsc_khz);
+		goto out;
+	}
 	case KVM_MEMORY_ENCRYPT_OP: {
 		r = -ENOTTY;
 		if (!kvm_x86_ops.mem_enc_ioctl)
@@ -11165,7 +11188,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	kvm_xen_init_vcpu(vcpu);
 	kvm_vcpu_mtrr_init(vcpu);
 	vcpu_load(vcpu);
-	kvm_set_tsc_khz(vcpu, max_tsc_khz);
+	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
 	kvm_vcpu_reset(vcpu, false);
 	kvm_init_mmu(vcpu);
 	vcpu_put(vcpu);
@@ -11614,6 +11637,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	pvclock_update_vm_gtod_copy(kvm);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
+	kvm->arch.default_tsc_khz = max_tsc_khz;
 	kvm->arch.guest_can_read_msr_platform_info = true;
 
 #if IS_ENABLED(CONFIG_HYPERV)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 22a1aa98fa9e..01ae8b0e90f8 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_VM_GPA_BITS 207
 #define KVM_CAP_XSAVE2 208
 #define KVM_CAP_SYS_ATTRIBUTES 209
+#define KVM_CAP_VM_TSC_CONTROL 210
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1461,7 +1462,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO	  _IOW(KVMIO,  0xa1, struct kvm_ppc_pvinfo)
-/* Available with KVM_CAP_TSC_CONTROL */
+/* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with
+*  KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
 /* Available with KVM_CAP_PCI_2_3 */
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] KVM: x86: Don't snapshot "max" TSC if host TSC is constant
  2022-02-25 14:53 [PATCH 0/3] KVM: Reduce TSC scaling race condition mess David Woodhouse
  2022-02-25 14:53 ` [PATCH 1/3] KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl David Woodhouse
@ 2022-02-25 14:53 ` David Woodhouse
  2022-02-25 14:53 ` [PATCH 3/3] KVM: x86: Test case for TSC scaling and offset sync David Woodhouse
  2 siblings, 0 replies; 4+ messages in thread
From: David Woodhouse @ 2022-02-25 14:53 UTC (permalink / raw)
  To: kvm, Paolo Bonzini, Sean Christopherson
  Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
	linux-kernel, Suleiman Souhlal, Anton Romanov

From: Sean Christopherson <seanjc@google.com>

Don't snapshot tsc_khz into max_tsc_khz during KVM initialization if the
host TSC is constant, in which case the actual TSC frequency will never
change and thus capturing the "max" TSC during initialization is
unnecessary, KVM can simply use tsc_khz during VM creation.

On CPUs with constant TSC, but not a hardware-specified TSC frequency,
snapshotting max_tsc_khz and using that to set a VM's default TSC
frequency can lead to KVM thinking it needs to manually scale the guest's
TSC if refining the TSC completes after KVM snapshots tsc_khz.  The
actual frequency never changes, only the kernel's calculation of what
that frequency is changes.  On systems without hardware TSC scaling, this
either puts KVM into "always catchup" mode (extremely inefficient), or
prevents creating VMs altogether.

Ideally, KVM would not be able to race with TSC refinement, or would have
a hook into tsc_refine_calibration_work() to get an alert when refinement
is complete.  Avoiding the race altogether isn't practical as refinement
takes a relative eternity; it's deliberately put on a work queue outside
of the normal boot sequence to avoid unnecessarily delaying boot.

Adding a hook is doable, but somewhat gross due to KVM's ability to be
built as a module.  And if the TSC is constant, which is likely the case
for every VMX/SVM-capable CPU produced in the last decade, the race can
be hit if and only if userspace is able to create a VM before TSC
refinement completes; refinement is slow, but not that slow.

For now, punt on a proper fix, as not taking a snapshot can help some
uses cases and not taking a snapshot is arguably correct irrespective of
the race with refinement.

[ dwmw2: Rebase on top of KVM-wide default_tsc_khz to ensure that all
         vCPUs get the same frequency even if we hit the race. ]

Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Anton Romanov <romanton@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/kvm/x86.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bb3e9916229a..04f86cb94069 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8751,22 +8751,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
 
 static void kvm_timer_init(void)
 {
-	max_tsc_khz = tsc_khz;
-
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
-		struct cpufreq_policy *policy;
-		int cpu;
-
-		cpu = get_cpu();
-		policy = cpufreq_cpu_get(cpu);
-		if (policy) {
-			if (policy->cpuinfo.max_freq)
-				max_tsc_khz = policy->cpuinfo.max_freq;
-			cpufreq_cpu_put(policy);
+		max_tsc_khz = tsc_khz;
+
+		if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+			struct cpufreq_policy *policy;
+			int cpu;
+
+			cpu = get_cpu();
+			policy = cpufreq_cpu_get(cpu);
+			if (policy) {
+				if (policy->cpuinfo.max_freq)
+					max_tsc_khz = policy->cpuinfo.max_freq;
+				cpufreq_cpu_put(policy);
+			}
+			put_cpu();
 		}
-		put_cpu();
-#endif
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
 	}
@@ -11637,7 +11637,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	pvclock_update_vm_gtod_copy(kvm);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-	kvm->arch.default_tsc_khz = max_tsc_khz;
+	kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
 	kvm->arch.guest_can_read_msr_platform_info = true;
 
 #if IS_ENABLED(CONFIG_HYPERV)
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] KVM: x86: Test case for TSC scaling and offset sync
  2022-02-25 14:53 [PATCH 0/3] KVM: Reduce TSC scaling race condition mess David Woodhouse
  2022-02-25 14:53 ` [PATCH 1/3] KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl David Woodhouse
  2022-02-25 14:53 ` [PATCH 2/3] KVM: x86: Don't snapshot "max" TSC if host TSC is constant David Woodhouse
@ 2022-02-25 14:53 ` David Woodhouse
  2 siblings, 0 replies; 4+ messages in thread
From: David Woodhouse @ 2022-02-25 14:53 UTC (permalink / raw)
  To: kvm, Paolo Bonzini, Sean Christopherson
  Cc: Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
	linux-kernel, Suleiman Souhlal, Anton Romanov

From: David Woodhouse <dwmw@amazon.co.uk>

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/x86_64/tsc_scaling_sync.c   | 119 ++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 06c3a4602bcc..33b57f8c6251 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -65,6 +65,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
 TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
 TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
 TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
 TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
new file mode 100644
index 000000000000..f0083d8cfe98
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_vmcall_test
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Xen shared_info / pvclock testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+
+#define NR_TEST_VCPUS 20
+
+static struct kvm_vm *vm;
+pthread_spinlock_t create_lock;
+
+#define TEST_TSC_KHZ    2345678UL
+#define TEST_TSC_OFFSET 200000000
+
+uint64_t tsc_sync;
+static void guest_code(void)
+{
+	uint64_t start_tsc, local_tsc, tmp;
+
+	start_tsc = rdtsc();
+	do {
+		tmp = READ_ONCE(tsc_sync);
+		local_tsc = rdtsc();
+		WRITE_ONCE(tsc_sync, local_tsc);
+		if (unlikely(local_tsc < tmp))
+			GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0);
+
+	} while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ);
+
+	GUEST_DONE();
+}
+
+
+static void *run_vcpu(void *_cpu_nr)
+{
+	unsigned long cpu = (unsigned long)_cpu_nr;
+	unsigned long failures = 0;
+	static bool first_cpu_done;
+
+	/* The kernel is fine, but vm_vcpu_add_default() needs locking */
+	pthread_spin_lock(&create_lock);
+
+	vm_vcpu_add_default(vm, cpu, guest_code);
+
+	if (!first_cpu_done) {
+		first_cpu_done = true;
+		vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET);
+	}
+
+	pthread_spin_unlock(&create_lock);
+
+	for (;;) {
+		volatile struct kvm_run *run = vcpu_state(vm, cpu);
+                struct ucall uc;
+
+                vcpu_run(vm, cpu);
+                TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+                            "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+                            run->exit_reason,
+                            exit_reason_str(run->exit_reason));
+
+                switch (get_ucall(vm, cpu, &uc)) {
+                case UCALL_DONE:
+			goto out;
+
+                case UCALL_SYNC:
+			printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]);
+			failures++;
+			break;
+
+                default:
+                        TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+ out:
+	return (void *)failures;
+}
+
+int main(int argc, char *argv[])
+{
+        if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) {
+		print_skip("KVM_CAP_VM_TSC_CONTROL not available");
+		exit(KSFT_SKIP);
+	}
+
+	vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL);
+	vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ);
+
+	pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE);
+	pthread_t cpu_threads[NR_TEST_VCPUS];
+	unsigned long cpu;
+	for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++)
+		pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu);
+
+	unsigned long failures = 0;
+	for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) {
+		void *this_cpu_failures;
+		pthread_join(cpu_threads[cpu], &this_cpu_failures);
+		failures += (unsigned long)this_cpu_failures;
+	}
+
+	TEST_ASSERT(!failures, "TSC sync failed");
+	pthread_spin_destroy(&create_lock);
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-02-25 14:53 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-25 14:53 [PATCH 0/3] KVM: Reduce TSC scaling race condition mess David Woodhouse
2022-02-25 14:53 ` [PATCH 1/3] KVM: x86: Accept KVM_[GS]ET_TSC_KHZ as a VM ioctl David Woodhouse
2022-02-25 14:53 ` [PATCH 2/3] KVM: x86: Don't snapshot "max" TSC if host TSC is constant David Woodhouse
2022-02-25 14:53 ` [PATCH 3/3] KVM: x86: Test case for TSC scaling and offset sync David Woodhouse

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.