From: Hikaru Nishida <hikalium@chromium.org>
To: linux-kernel@vger.kernel.org, dme@dme.org, tglx@linutronix.de,
mlevitsk@redhat.com, linux@roeck-us.net, pbonzini@redhat.com,
vkuznets@redhat.com, maz@kernel.org, will@kernel.org
Cc: suleiman@google.com, senozhatsky@google.com,
kvmarm@lists.cs.columbia.edu,
linux-arm-kernel@lists.infradead.org,
Hikaru Nishida <hikalium@chromium.org>,
Andy Lutomirski <luto@kernel.org>, Arnd Bergmann <arnd@arndb.de>,
Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>,
Geert Uytterhoeven <geert@linux-m68k.org>,
"H. Peter Anvin" <hpa@zytor.com>, Ingo Molnar <mingo@redhat.com>,
Jim Mattson <jmattson@google.com>, Joerg Roedel <joro@8bytes.org>,
John Stultz <john.stultz@linaro.org>,
Juergen Gross <jgross@suse.com>,
Kees Cook <keescook@chromium.org>,
Lai Jiangshan <laijs@linux.alibaba.com>,
Linus Walleij <linus.walleij@linaro.org>,
Peter Zijlstra <peterz@infradead.org>,
Sean Christopherson <seanjc@google.com>,
Stephen Boyd <sboyd@kernel.org>,
Wanpeng Li <wanpengli@tencent.com>,
kvm@vger.kernel.org, x86@kernel.org
Subject: [RFC PATCH v3 5/5] kvm/x86: virtual suspend time injection: Implement guest side
Date: Wed, 20 Oct 2021 21:04:30 +0900 [thread overview]
Message-ID: <20211020210348.RFC.v3.5.I99f7da15fd68fc098709ea4bcf74525e0883ea92@changeid> (raw)
In-Reply-To: <20211020120431.776494-1-hikalium@chromium.org>
Add guest side implementation of KVM virtual suspend time injection.
How it works from guest's view:
- Guest will be paused without going through suspend/resume path in the
guest kernel
- Before resuming the execution of the guest's vcpus, host will adjust
the hardware clock (and kvm_clock) to the time before the suspend.
- By this action, guest's CLOCK_MONOTONIC behaves as expected (stops
during the host's suspension.)
- the guest will receive an IRQ from the guest that notifies about the
suspend which was invisible to the guest. In the handler, the guest
can adjust their CLOCK_BOOTTIME to reflect the suspension.
- Now, CLOCK_BOOTTIME includes the time passed during the host's
suspension.
Signed-off-by: Hikaru Nishida <hikalium@chromium.org>
---
Changes in v3:
- Reused HYPERVISOR_CALLBACK_VECTOR IRQ instead of adding a new one.
- Extracted arch-independent parts.
arch/x86/Kconfig | 13 ++++++++
arch/x86/include/asm/idtentry.h | 2 +-
arch/x86/include/asm/kvmclock.h | 9 ++++++
arch/x86/kernel/kvm.c | 14 ++++++---
arch/x86/kernel/kvmclock.c | 26 ++++++++++++++++
arch/x86/mm/fault.c | 2 +-
include/linux/timekeeper_internal.h | 5 ++++
include/linux/timekeeping.h | 4 +++
kernel/time/timekeeping.c | 46 +++++++++++++++++++++++++++++
9 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9830e7e1060..1d4a529d1577 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -824,6 +824,19 @@ config KVM_GUEST
underlying device model, the host provides the guest with
timing infrastructure such as time of day, and system time
+config KVM_VIRT_SUSPEND_TIMING_GUEST
+ bool "Guest support for virtual suspend time injection"
+ depends on KVM_GUEST
+ default n
+ help
+ This option makes the host's suspension reflected on the guest's clocks.
+ In other words, guest's CLOCK_MONOTONIC will stop and
+ CLOCK_BOOTTIME keeps running during the host's suspension.
+ This feature will only be effective when both guest and host support
+ this feature. For the host side, see KVM_VIRT_SUSPEND_TIMING.
+
+ If unsure, say N.
+
config ARCH_CPUIDLE_HALTPOLL
def_bool n
prompt "Disable host haltpoll when loading haltpoll driver"
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 1345088e9902..5e30f84ea07e 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -686,7 +686,7 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback);
#endif
#ifdef CONFIG_KVM_GUEST
-DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
+DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_hv_callback);
#endif
#undef X86_TRAP_OTHER
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
index 9add14edc24d..2bf1a5c92319 100644
--- a/arch/x86/include/asm/kvmclock.h
+++ b/arch/x86/include/asm/kvmclock.h
@@ -20,4 +20,13 @@ static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
return this_cpu_read(hv_clock_per_cpu);
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+u64 kvm_get_suspend_time(void);
+#else
+static inline u64 kvm_get_suspend_time(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b656456c3a94..3d84ef6d9df2 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -40,6 +40,7 @@
#include <asm/ptrace.h>
#include <asm/reboot.h>
#include <asm/svm.h>
+#include <asm/kvmclock.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -270,7 +271,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return true;
}
-DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
+DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
@@ -286,6 +287,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
}
+ timekeeping_inject_virtual_suspend_time(kvm_get_suspend_time());
+
set_irq_regs(old_regs);
}
@@ -710,10 +713,13 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf)
static_branch_enable(&kvm_async_pf_enabled);
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
- }
+
+ if ((kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) ||
+ kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ asm_sysvec_kvm_hv_callback);
#ifdef CONFIG_SMP
if (pv_tlb_flush_supported()) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 73c74b961d0f..3e16d0ab79f3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,11 +16,15 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/set_memory.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/kvmclock.h>
+#include <asm/desc.h>
+#include <asm/idtentry.h>
static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
@@ -48,6 +52,9 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
static struct pvclock_vsyscall_time_info
hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+static struct kvm_suspend_time suspend_time __bss_decrypted;
+#endif
static struct pvclock_wall_clock wall_clock __bss_decrypted;
static struct pvclock_vsyscall_time_info *hvclock_mem;
DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
@@ -281,6 +288,17 @@ static int kvmclock_setup_percpu(unsigned int cpu)
return p ? 0 : -ENOMEM;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/**
+ * kvm_get_suspend_time - duration of host suspend.
+ * Return: Cumulative duration of host suspend in nanoseconds.
+ */
+u64 kvm_get_suspend_time(void)
+{
+ return suspend_time.suspend_time_ns;
+}
+#endif
+
void __init kvmclock_init(void)
{
u8 flags;
@@ -295,6 +313,14 @@ void __init kvmclock_init(void)
return;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ if (kvm_para_has_feature(KVM_FEATURE_HOST_SUSPEND_TIME)) {
+ /* Register the suspend time structure */
+ wrmsrl(MSR_KVM_HOST_SUSPEND_TIME,
+ slow_virt_to_phys(&suspend_time) | KVM_MSR_ENABLED);
+ }
+#endif
+
if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
kvmclock_setup_percpu, NULL) < 0) {
return;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 84a2c8c4af73..f36f49585d5d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1509,7 +1509,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* memory is swapped out). Note, the corresponding "page ready" event
* which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
- * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
+ * (see arch/x86/kernel/kvm.c: sysvec_kvm_hv_callback()).
*
* We are relying on the interrupted context being sane (valid RSP,
* relevant locks not held, etc.), which is fine as long as the
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 84ff2844df2a..0d5b29122d40 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -68,6 +68,8 @@ struct tk_read_base {
* shifted nano seconds.
* @ntp_error_shift: Shift conversion between clock shifted nano seconds and
* ntp shifted nano seconds.
+ * @kvm_suspend_time: The cumulative duration of suspend injected through KVM
+ * in nano seconds.
* @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING)
* @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING)
* @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING)
@@ -124,6 +126,9 @@ struct timekeeper {
u32 ntp_err_mult;
/* Flag used to avoid updating NTP twice with same second */
u32 skip_second_overflow;
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+ u64 kvm_suspend_time;
+#endif
#ifdef CONFIG_DEBUG_TIMEKEEPING
long last_warning;
/*
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index f7be69c81dab..a2228300c3f9 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -310,4 +310,8 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
extern int update_persistent_clock64(struct timespec64 now);
#endif
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+void timekeeping_inject_virtual_suspend_time(u64 total_duration_ns);
+#endif
+
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e77580d9f8c1..5f474cde0bae 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2133,6 +2133,52 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}
+#ifdef CONFIG_KVM_VIRT_SUSPEND_TIMING_GUEST
+/**
+ * timekeeping_inject_virtual_suspend_time - Inject virtual suspend time
+ * when requested by the kvm host.
+ * @total_duration_ns: Total suspend time to be injected in nanoseconds.
+ */
+void timekeeping_inject_virtual_suspend_time(u64 total_duration_ns)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ if (total_duration_ns > tk->kvm_suspend_time) {
+ /*
+ * Do injection only if the time is not injected yet.
+ * total_duration_ns and tk->kvm_suspend_time values are
+ * cumulative, so the delta between them will be an amount
+ * of adjustments. For example, if the host suspends 2 times
+ * during the guest is running and each suspend is 5 seconds,
+ * total_duration_ns will be 5 seconds at the first injection
+ * and tk->kvm_suspend_time was initialized to zero so the
+ * adjustment injected here will be 5 - 0 = 5 seconds and
+ * tk->kvm_suspend_time will be updated to 5 seconds.
+ * On the second injection after the second resume,
+ * total_duration_ns will be 10 seconds and
+ * tk->kvm_suspend_time will be 5 seconds so 10 - 5 = 5 seconds
+ * of the suspend time will be injected again.
+ */
+ struct timespec64 delta =
+ ns_to_timespec64(total_duration_ns -
+ tk->kvm_suspend_time);
+ tk->kvm_suspend_time = total_duration_ns;
+
+ write_seqcount_begin(&tk_core.seq);
+ timekeeping_forward_now(tk);
+ __timekeeping_inject_sleeptime(tk, &delta);
+ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
+ write_seqcount_end(&tk_core.seq);
+
+ /* signal hrtimers about time change */
+ clock_was_set_delayed();
+ }
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+#endif
+
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
--
2.33.0.1079.g6e70778dc9-goog
next prev parent reply other threads:[~2021-10-20 12:05 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-10-20 12:04 [RFC PATCH v3 0/5] x86/kvm: Virtual suspend time injection support Hikaru Nishida
2021-10-20 12:04 ` [RFC PATCH v3 1/5] timekeeping: Expose tk->offs_boot via ktime_get_offs_boot_ns Hikaru Nishida
2021-10-20 12:04 ` [RFC PATCH v3 2/5] kvm/x86: Include asm/pvclock.h in asm/kvmclock.h Hikaru Nishida
2021-10-20 12:04 ` [RFC PATCH v3 3/5] kvm/x86: virtual suspend time injection: Add common definitions Hikaru Nishida
2021-10-20 12:04 ` [RFC PATCH v3 4/5] kvm/x86: virtual suspend time injection: Implement host side Hikaru Nishida
2021-10-20 12:04 ` Hikaru Nishida [this message]
2021-10-20 13:52 ` [RFC PATCH v3 0/5] x86/kvm: Virtual suspend time injection support Marc Zyngier
2021-11-04 9:10 ` Hikaru Nishida
2021-12-04 17:30 ` Marc Zyngier
2022-04-06 3:48 ` Hikaru Nishida
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20211020210348.RFC.v3.5.I99f7da15fd68fc098709ea4bcf74525e0883ea92@changeid \
--to=hikalium@chromium.org \
--cc=arnd@arndb.de \
--cc=bp@alien8.de \
--cc=dave.hansen@linux.intel.com \
--cc=dme@dme.org \
--cc=geert@linux-m68k.org \
--cc=hpa@zytor.com \
--cc=jgross@suse.com \
--cc=jmattson@google.com \
--cc=john.stultz@linaro.org \
--cc=joro@8bytes.org \
--cc=keescook@chromium.org \
--cc=kvm@vger.kernel.org \
--cc=kvmarm@lists.cs.columbia.edu \
--cc=laijs@linux.alibaba.com \
--cc=linus.walleij@linaro.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux@roeck-us.net \
--cc=luto@kernel.org \
--cc=maz@kernel.org \
--cc=mingo@redhat.com \
--cc=mlevitsk@redhat.com \
--cc=pbonzini@redhat.com \
--cc=peterz@infradead.org \
--cc=sboyd@kernel.org \
--cc=seanjc@google.com \
--cc=senozhatsky@google.com \
--cc=suleiman@google.com \
--cc=tglx@linutronix.de \
--cc=vkuznets@redhat.com \
--cc=wanpengli@tencent.com \
--cc=will@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).