* [PATCH] x86/kvm: Handle async page faults directly through do_page_fault()
From: Andy Lutomirski @ 2020-02-28 18:42 UTC
To: LKML, x86, kvm list, Paolo Bonzini, Radim Krcmar; +Cc: Andy Lutomirski
KVM overloads #PF to indicate two types of not-actually-page-fault
events. Right now, the KVM guest code intercepts them by modifying
the IDT and hooking the #PF vector. This makes the already fragile
fault code even harder to understand, and it also pollutes call
traces with async_page_fault and do_async_page_fault for normal page
faults.
Clean it up by moving the logic into do_page_fault() using a static
branch. This gets rid of the platform trap_init override mechanism
completely.
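For anyone unfamiliar with the pattern: a static branch compiles to a
NOP on the common path and is patched into a real jump only when its
key is enabled, so bare-metal kernels and guests without async PF pay
nothing in do_page_fault(). A minimal sketch of the dispatch shape
(the static-key primitives are the real jump-label API; the example_*
names and the handler are invented for illustration):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_pf_hook_enabled);

/* Hypothetical handler; returns true if it consumed the fault. */
bool example_handle_special_fault(struct pt_regs *regs,
				  unsigned long error_code,
				  unsigned long address);

static inline bool example_pf_hook(struct pt_regs *regs,
				   unsigned long error_code,
				   unsigned long address)
{
	/* Compiles to a NOP until the key is flipped at boot. */
	if (static_branch_unlikely(&example_pf_hook_enabled))
		return example_handle_special_fault(regs, error_code,
						    address);
	return false;
}

static void __init example_feature_init(void)
{
	/* Enabled once, after paravirt feature detection. */
	static_branch_enable(&example_pf_hook_enabled);
}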
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
arch/x86/entry/entry_64.S | 4 ----
arch/x86/include/asm/kvm_para.h | 17 +++++++++++++++-
arch/x86/include/asm/x86_init.h | 2 --
arch/x86/kernel/kvm.c | 36 +++++++++++++++++++--------------
arch/x86/kernel/traps.c | 2 --
arch/x86/kernel/x86_init.c | 1 -
arch/x86/mm/fault.c | 20 ++++++++++++++++++
7 files changed, 57 insertions(+), 25 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f2bb91e87877..be1fc6f2fe85 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1202,10 +1202,6 @@ idtentry xendebug do_debug has_error_code=0
idtentry general_protection do_general_protection has_error_code=1
idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
-#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
-#endif
-
#ifdef CONFIG_X86_MCE
idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 9b4df6eaa11a..4d72f5488584 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,17 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_pf_reason(void);
extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+extern bool do_kvm_handle_async_pf(struct pt_regs *regs, unsigned long error_code, unsigned long address);
+
+DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ if (static_branch_unlikely(&kvm_async_pf_enabled))
+ return do_kvm_handle_async_pf(regs, error_code, address);
+ else
+ return false;
+}
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init kvm_spinlock_init(void);
@@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void)
{
return;
}
+
+static inline bool kvm_handle_async_pf(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ return false;
+}
#endif
#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 96d9cd208610..6807153c0410 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -50,14 +50,12 @@ struct x86_init_resources {
* @pre_vector_init: init code to run before interrupt vectors
* are set up.
* @intr_init: interrupt init code
- * @trap_init: platform specific trap setup
* @intr_mode_select: interrupt delivery mode selection
* @intr_mode_init: interrupt delivery mode setup
*/
struct x86_init_irqs {
void (*pre_vector_init)(void);
void (*intr_init)(void);
- void (*trap_init)(void);
void (*intr_mode_select)(void);
void (*intr_mode_init)(void);
};
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d817f255aed8..93ab0cbd304e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,8 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
+DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+
static int kvmapf = 1;
static int __init parse_no_kvmapf(char *arg)
@@ -242,25 +244,29 @@ u32 kvm_read_and_reset_pf_reason(void)
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+bool
+do_kvm_handle_async_pf(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
+ /*
+ * If we get a page fault right here, the pf_reason seems likely
+ * to be clobbered. Bummer.
+ */
+
switch (kvm_read_and_reset_pf_reason()) {
default:
- do_page_fault(regs, error_code, address);
- break;
+ return false;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
kvm_async_pf_task_wait((u32)address, !user_mode(regs));
- break;
+ return true;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
kvm_async_pf_task_wake((u32)address);
rcu_irq_exit();
- break;
+ return true;
}
}
-NOKPROBE_SYMBOL(do_async_page_fault);
+NOKPROBE_SYMBOL(do_kvm_handle_async_pf);
static void __init paravirt_ops_setup(void)
{
@@ -306,7 +312,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ u64 pa;
+
+ WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
+
+ pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
#ifdef CONFIG_PREEMPTION
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -570,11 +580,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
}
#endif
-static void __init kvm_apf_trap_init(void)
-{
- update_intr_gate(X86_TRAP_PF, async_page_fault);
-}
-
static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
@@ -611,8 +616,6 @@ static void __init kvm_guest_init(void)
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
raw_spin_lock_init(&async_pf_sleepers[i].lock);
- if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
- x86_init.irqs.trap_init = kvm_apf_trap_init;
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
@@ -652,6 +655,9 @@ static void __init kvm_guest_init(void)
* overcommitted.
*/
hardlockup_detector_disable();
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
+ static_branch_enable(&kvm_async_pf_enabled);
}
static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6ef00eb6fbb9..bcb514c801f2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -936,7 +936,5 @@ void __init trap_init(void)
idt_setup_ist_traps();
- x86_init.irqs.trap_init();
-
idt_setup_debugidt_traps();
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 85f1a90c55cd..123f1c1f1788 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = {
.irqs = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
- .trap_init = x86_init_noop,
.intr_mode_select = apic_intr_mode_select,
.intr_mode_init = apic_intr_mode_init
},
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fa4ea09593ab..ba04be080142 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,6 +30,7 @@
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
+#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -1505,6 +1506,25 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
+ /*
+ * KVM has two types of events that are, logically, interrupts, but
+ * are unfortunately delivered using the #PF vector. These events are
+ * "you just accessed valid memory, but the host doesn't have it right
+ * now, so I'll put you to sleep if you continue" and "that memory
+ * you tried to access earlier is available now."
+ *
+ * We are relying on the interrupted context being sane (valid
+ * RSP, relevant locks not held, etc.), which is fine as long as
+ * the interrupted context had IF=1. We are also relying on
+ * the KVM async pf type field and CR2 being read consistently
+ * instead of getting values from real and async page faults
+ * mixed up.
+ *
+ * Fingers crossed.
+ */
+ if (kvm_handle_async_pf(regs, hw_error_code, address))
+ return;
+
trace_page_fault_entries(regs, hw_error_code, address);
if (unlikely(kmmio_fault(regs, address)))
--
2.24.1
* Re: [PATCH] x86/kvm: Handle async page faults directly through do_page_fault()
From: Paolo Bonzini @ 2020-02-28 19:00 UTC
To: Andy Lutomirski, LKML, x86, kvm list, Radim Krcmar
On 28/02/20 19:42, Andy Lutomirski wrote:
> KVM overloads #PF to indicate two types of not-actually-page-fault
> events. Right now, the KVM guest code intercepts them by modifying
> the IDT and hooking the #PF vector. This makes the already fragile
> fault code even harder to understand, and it also pollutes call
> traces with async_page_fault and do_async_page_fault for normal page
> faults.
>
> Clean it up by moving the logic into do_page_fault() using a static
> branch. This gets rid of the platform trap_init override mechanism
> completely.
>
> Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Just one thing:
> @@ -1505,6 +1506,25 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
> unsigned long address)
> {
> prefetchw(&current->mm->mmap_sem);
> + /*
> + * KVM has two types of events that are, logically, interrupts, but
> + * are unfortunately delivered using the #PF vector.
At least the not-present case isn't entirely an interrupt because it
must be delivered precisely. Regarding the page-ready case you're
right, it could be an interrupt. However, generally speaking this is not
a problem. Using something in memory rather than overloading the error
code was the mistake.
> + * These events are
> + * "you just accessed valid memory, but the host doesn't have it right
> + * now, so I'll put you to sleep if you continue" and "that memory
> + * you tried to access earlier is available now."
> + *
> + * We are relying on the interrupted context being sane (valid
> + * RSP, relevant locks not held, etc.), which is fine as long as
> + * the interrupted context had IF=1.
This is not about IF=0/IF=1; the KVM code is careful about taking
spinlocks only with IRQs disabled, and async PF is not delivered if the
interrupted context had IF=0. The problem is that the memory location
is not reentrant if an NMI is delivered in the wrong window, as you hint
below.
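To spell out that window: the reason word lives in ordinary guest
memory and is consumed with a read-and-reset, roughly as below
(abridged from the code of this era; the NMI interleaving described
in the trailing comment is the hypothetical failure case):

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__this_cpu_read(apf_reason.enabled)) {
		reason = __this_cpu_read(apf_reason.reason);
		__this_cpu_write(apf_reason.reason, 0);
	}

	return reason;
}

/*
 * If an NMI arrives after the host writes apf_reason.reason but
 * before the read-and-reset above completes, and the NMI handler
 * itself takes a #PF that goes through this path, the nested fault
 * consumes and clears the slot; the outer fault then reads 0 and is
 * misdiagnosed as an ordinary page fault.
 */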
Paolo
> We are also relying on
> + * the KVM async pf type field and CR2 being read consistently
> + * instead of getting values from real and async page faults
> + * mixed up.
> + *
> + * Fingers crossed.
> + */
> + if (kvm_handle_async_pf(regs, hw_error_code, address))
> + return;
> +
> trace_page_fault_entries(regs, hw_error_code, address);
>
> if (unlikely(kmmio_fault(regs, address)))
>
* Re: [PATCH] x86/kvm: Handle async page faults directly through do_page_fault()
From: Andy Lutomirski @ 2020-02-28 19:04 UTC
To: Paolo Bonzini; +Cc: Andy Lutomirski, LKML, X86 ML, kvm list, Radim Krcmar
On Fri, Feb 28, 2020 at 11:01 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 28/02/20 19:42, Andy Lutomirski wrote:
> > KVM overloads #PF to indicate two types of not-actually-page-fault
> > events. Right now, the KVM guest code intercepts them by modifying
> > the IDT and hooking the #PF vector. This makes the already fragile
> > fault code even harder to understand, and it also pollutes call
> > traces with async_page_fault and do_async_page_fault for normal page
> > faults.
> >
> > Clean it up by moving the logic into do_page_fault() using a static
> > branch. This gets rid of the platform trap_init override mechanism
> > completely.
> >
> > Signed-off-by: Andy Lutomirski <luto@kernel.org>
>
> Acked-by: Paolo Bonzini <pbonzini@redhat.com>
>
> Just one thing:
>
> > @@ -1505,6 +1506,25 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
> > unsigned long address)
> > {
> > prefetchw(&current->mm->mmap_sem);
> > + /*
> > + * KVM has two types of events that are, logically, interrupts, but
> > + * are unfortunately delivered using the #PF vector.
>
> At least the not-present case isn't entirely an interrupt because it
> must be delivered precisely. Regarding the page-ready case you're
> right, it could be an interrupt. However, generally speaking this is not
> a problem. Using something in memory rather than overloading the error
> code was the mistake.
>
> > + * These events are
> > + * "you just accessed valid memory, but the host doesn't have it right
> > + * not, so I'll put you to sleep if you continue" and "that memory
> > + * you tried to access earlier is available now."
> > + *
> > + * We are relying on the interrupted context being sane (valid
> > + * RSP, relevant locks not held, etc.), which is fine as long as
> > + * the interrupted context had IF=1.
>
> This is not about IF=0/IF=1; the KVM code is careful about taking
> spinlocks only with IRQs disabled, and async PF is not delivered if the
> interrupted context had IF=0. The problem is that the memory location
> is not reentrant if an NMI is delivered in the wrong window, as you hint
> below.
If an async PF is delivered with IF=0, then, unless something else
clever happens to make it safe, we are toast. The x86 entry code
cannot handle #PF (or most other entries) at arbitrary places. I'll
improve the comment in v2.
* Re: [PATCH] x86/kvm: Handle async page faults directly through do_page_fault()
From: Paolo Bonzini @ 2020-02-28 19:36 UTC
To: Andy Lutomirski; +Cc: LKML, X86 ML, kvm list, Radim Krcmar
On 28/02/20 20:04, Andy Lutomirski wrote:
>>> + * We are relying on the interrupted context being sane (valid
>>> + * RSP, relevant locks not held, etc.), which is fine as long as
>>> + * the interrupted context had IF=1.
>> This is not about IF=0/IF=1; the KVM code is careful about taking
>> spinlocks only with IRQs disabled, and async PF is not delivered if the
>> interrupted context had IF=0. The problem is that the memory location
>> is not reentrant if an NMI is delivered in the wrong window, as you hint
>> below.
>
> If an async PF is delivered with IF=0, then, unless something else
> clever happens to make it safe, we are toast.
Right, it just cannot happen. kvm_can_do_async_pf is where KVM decides
whether a page fault must be handled synchronously, and it does this:
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
...
/*
* If interrupts are off we cannot even use an artificial
* halt state.
*/
return kvm_x86_ops->interrupt_allowed(vcpu);
}
The same function is called by kvm_arch_can_inject_async_page_present.
Paolo