From: David Matlack
Subject: Re: [RFC PATCH V2 4/4] Utilize the vmx preemption timer for tsc deadline timer
Date: Tue, 24 May 2016 16:11:29 -0700
To: Yunhong Jiang
Cc: kvm list, Marcelo Tosatti, Radim Krčmář, Paolo Bonzini, Wanpeng Li

On Tue, May 24, 2016 at 3:27 PM, Yunhong Jiang wrote:
> From: Yunhong Jiang
>
> Utilize the VMX preemption timer for TSC deadline timer
> virtualization. The VMX preemption timer is armed while the vCPU is
> running, and a VM exit is taken when the virtual TSC deadline timer
> expires.
>
> When the vCPU thread is scheduled out, TSC deadline timer
> virtualization switches back to the current solution, i.e. using an
> hrtimer for it. It switches back to the VMX preemption timer when the
> vCPU thread is scheduled in again.
>
> This solution avoids the complexity of the OS's hrtimer system and the
> cost of host timer-interrupt handling, replacing them with a
> preemption-timer VM exit. It fits NFV-style scenarios well, where a
> vCPU is bound to an isolated pCPU, and similar setups.
>
> However, it may have an impact if the vCPU thread is scheduled in/out
> very frequently, because it then switches to/from the hrtimer emulation
> a lot.

What is the cost of the extra sched-in/out hooks?
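(For anyone following along: in TSC-deadline mode the guest arms the
timer by writing an absolute TSC value to the deadline MSR; that write
is what KVM tracks in apic->lapic_timer.tscdeadline and what the
preemption timer is being used to deliver here. A purely illustrative
guest-side snippet, assuming <asm/msr.h> and an LVT timer entry already
configured for TSC-deadline mode; "delta" is just an example variable:

        /* Arm the lapic timer to fire "delta" TSC cycles from now. */
        static void guest_arm_tsc_deadline_timer(u64 delta)
        {
                wrmsrl(MSR_IA32_TSC_DEADLINE, rdtsc() + delta);
        }

Not something this patch needs to change, just context for the
comments below.)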
>
> Signed-off-by: Yunhong Jiang
> ---
>  arch/x86/kvm/lapic.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/lapic.h | 11 ++++++
>  arch/x86/kvm/trace.h | 22 ++++++++++++
>  arch/x86/kvm/vmx.c   |  8 +++++
>  arch/x86/kvm/x86.c   |  4 +++
>  5 files changed, 139 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index f1cf8a5ede11..93679db7ce0f 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1313,7 +1313,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
>  		__delay(tsc_deadline - guest_tsc);
>  }
>
> -static void start_sw_tscdeadline(struct kvm_lapic *apic)
> +static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire)
>  {
>  	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
>  	u64 ns = 0;
> @@ -1330,7 +1330,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
>
>  	now = apic->lapic_timer.timer.base->get_time();
>  	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> -	if (likely(tscdeadline > guest_tsc)) {
> +	if (no_expire || likely(tscdeadline > guest_tsc)) {
>  		ns = (tscdeadline - guest_tsc) * 1000000ULL;
>  		do_div(ns, this_tsc_khz);
>  		expire = ktime_add_ns(now, ns);
> @@ -1343,6 +1343,85 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
>  	local_irq_restore(flags);
>  }
>
> +void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	u64 tscdeadline, guest_tsc;
> +
> +	if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
> +		return;
> +
> +	tscdeadline = apic->lapic_timer.tscdeadline;
> +	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
> +
> +	if (tscdeadline >= guest_tsc)
> +		kvm_x86_ops->set_hv_timer(vcpu, tscdeadline - guest_tsc);

Does this interact correctly with TSC scaling? IIUC this programs the
VMX preemption timer with a delay in guest cycles, rather than host
cycles.
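Something along these lines (untested sketch) seems to be needed before
the delta is handed to the hardware; note that u64_shl_div_u64() is a
made-up name here for a "(x << shift) / y with a 128-bit intermediate"
helper that would still have to be added or open-coded:

        /*
         * Sketch only: convert a delta in guest TSC cycles into host TSC
         * cycles before programming the preemption timer.
         */
        static int guest_tsc_delta_to_host(struct kvm_vcpu *vcpu, u64 *delta_tsc)
        {
                /* No scaling: guest and host cycles tick at the same rate. */
                if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio)
                        return 0;

                /*
                 * guest_tsc = (host_tsc * ratio) >> frac_bits, hence
                 * host_delta = (guest_delta << frac_bits) / ratio.
                 */
                return u64_shl_div_u64(*delta_tsc, kvm_tsc_scaling_ratio_frac_bits,
                                       vcpu->arch.tsc_scaling_ratio, delta_tsc);
        }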
> +	else
> +		kvm_x86_ops->set_hv_timer(vcpu, 0);
> +
> +	apic->lapic_timer.hv_timer_state = HV_TIMER_ARMED;
> +	trace_kvm_hv_timer_state(vcpu->vcpu_id,
> +			apic->lapic_timer.hv_timer_state);
> +}
> +EXPORT_SYMBOL_GPL(kvm_lapic_arm_hv_timer);
> +
> +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_ARMED);
> +	WARN_ON(swait_active(&vcpu->wq));
> +	kvm_x86_ops->cancel_hv_timer(vcpu);
> +	apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
> +	apic_timer_expired(apic);
> +}
> +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
> +
> +void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_NOT_USED);
> +
> +	if (apic_lvtt_tscdeadline(apic) &&
> +			!atomic_read(&apic->lapic_timer.pending)) {
> +		hrtimer_cancel(&apic->lapic_timer.timer);
> +		/* In case the timer triggered in the small window above */
> +		if (!atomic_read(&apic->lapic_timer.pending)) {
> +			apic->lapic_timer.hv_timer_state =
> +				HV_TIMER_NEEDS_ARMING;
> +			trace_kvm_hv_timer_state(vcpu->vcpu_id,
> +					apic->lapic_timer.hv_timer_state);
> +		}
> +	}
> +}
> +EXPORT_SYMBOL_GPL(switch_to_hv_lapic_timer);
> +
> +void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	/* Possibly the TSC deadline timer is not enabled yet */
> +	if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
> +		return;
> +
> +	if (apic->lapic_timer.hv_timer_state == HV_TIMER_ARMED)
> +		kvm_x86_ops->cancel_hv_timer(vcpu);
> +	apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
> +
> +	if (atomic_read(&apic->lapic_timer.pending))
> +		return;
> +
> +	/*
> +	 * Don't trigger apic_timer_expired() here, to avoid a deadlock:
> +	 * the swake_up() called from apic_timer_expired() would try to
> +	 * take the run queue lock, which is already held because we are
> +	 * in the middle of a context switch.
> +	 */
> +	start_sw_tscdeadline(apic, 1);
> +}
> +EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer);
> +
>  static void start_apic_timer(struct kvm_lapic *apic)
>  {
>  	ktime_t now;
> @@ -1389,7 +1468,19 @@ static void start_apic_timer(struct kvm_lapic *apic)
>  			   ktime_to_ns(ktime_add_ns(now,
>  					apic->lapic_timer.period)));
>  	} else if (apic_lvtt_tscdeadline(apic)) {
> -		start_sw_tscdeadline(apic);
> +		/* lapic timer in tsc deadline mode */
> +		if (kvm_x86_ops->set_hv_timer) {
> +			if (unlikely(!apic->lapic_timer.tscdeadline ||
> +					!apic->vcpu->arch.virtual_tsc_khz))
> +				return;
> +
> +			/* Expired timer will be checked on vcpu_run() */
> +			apic->lapic_timer.hv_timer_state =
> +				HV_TIMER_NEEDS_ARMING;
> +			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
> +				apic->lapic_timer.hv_timer_state);
> +		} else
> +			start_sw_tscdeadline(apic, 0);
>  	}
>  }
>
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 891c6da7d4aa..dc4fd8eea04d 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -12,6 +12,12 @@
>  #define KVM_APIC_SHORT_MASK	0xc0000
>  #define KVM_APIC_DEST_MASK	0x800
>
> +enum {
> +	HV_TIMER_NOT_USED,
> +	HV_TIMER_NEEDS_ARMING,
> +	HV_TIMER_ARMED,
> +};
> +
>  struct kvm_timer {
>  	struct hrtimer timer;
>  	s64 period; /* unit: ns */
> @@ -20,6 +26,7 @@ struct kvm_timer {
>  	u64 tscdeadline;
>  	u64 expired_tscdeadline;
>  	atomic_t pending;	/* accumulated triggered timers */
> +	int hv_timer_state;
>  };
>
>  struct kvm_lapic {
> @@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
>  			struct kvm_vcpu **dest_vcpu);
>  int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
>  			const unsigned long *bitmap, u32 bitmap_size);
> +void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu);
> +void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu);
> +void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu);
> +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
>  #endif
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 8de925031b5c..8e1b5e9c2e78 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -6,6 +6,7 @@
>  #include
>  #include
>  #include
> +#include
>
>  #undef TRACE_SYSTEM
>  #define TRACE_SYSTEM kvm
> @@ -1348,6 +1349,27 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
>  		  __entry->vec)
>  );
>
> +#define kvm_trace_symbol_hv_timer		\
> +	{HV_TIMER_NOT_USED, "no"},		\
> +	{HV_TIMER_NEEDS_ARMING, "need_arming"},	\
> +	{HV_TIMER_ARMED, "armed"}
> +
> +TRACE_EVENT(kvm_hv_timer_state,
> +	TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_state),
> +	TP_ARGS(vcpu_id, hv_timer_state),
> +	TP_STRUCT__entry(
> +		__field(unsigned int, vcpu_id)
> +		__field(unsigned int, hv_timer_state)
> +		),
> +	TP_fast_assign(
> +		__entry->vcpu_id = vcpu_id;
> +		__entry->hv_timer_state = hv_timer_state;
> +		),
> +	TP_printk("vcpu_id %x hv_timer %s\n",
> +		__entry->vcpu_id,
> +		__print_symbolic(__entry->hv_timer_state,
> +			kvm_trace_symbol_hv_timer))
> +	);
>  #endif /* _TRACE_KVM_H */
>
>  #undef TRACE_INCLUDE_PATH
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 2b29afa61715..dc0b8cae02b8 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -7576,6 +7576,11 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>
> +static int handle_preemption_timer(struct kvm_vcpu *vcpu)
> +{
> +	kvm_lapic_expired_hv_timer(vcpu);
> +	return 1;
> +}
>  /*
>   * The exit handlers return 1 if the exit was handled fully and guest execution
>   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
> @@ -7627,6 +7632,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
>  	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
>  	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
> +	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
>  };
>
>  static const int kvm_vmx_max_exit_handlers =
> @@ -8678,6 +8684,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
>  		vmx_set_interrupt_shadow(vcpu, 0);
>
> +	kvm_lapic_arm_hv_timer(vcpu);
> +
>  	if (vmx->guest_pkru_valid)
>  		__write_pkru(vmx->guest_pkru);
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 269d576520ba..f4b50608568a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7724,11 +7724,15 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>
>  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
>  {
> +	if (kvm_x86_ops->set_hv_timer)
> +		switch_to_hv_lapic_timer(vcpu);
>  	kvm_x86_ops->sched_in(vcpu, cpu);
>  }
>
>  void kvm_arch_sched_out(struct kvm_vcpu *vcpu)
>  {
> +	if (kvm_x86_ops->set_hv_timer)
> +		switch_to_sw_lapic_timer(vcpu);
>  }
>
>  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
> --
> 1.8.3.1
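One more thought on the arming side (I realize ->set_hv_timer() itself
lives in an earlier patch of this series): the preemption timer counts
down once every 2^X TSC cycles, where X is bits 4:0 of
MSR_IA32_VMX_MISC, so even after converting the deadline delta to host
cycles it still needs that shift before being written to the VMCS. A
rough, untested sketch; "cpu_preemption_timer_multi" is a made-up name
for a cached copy of that MSR field:

        /*
         * Sketch only: program the VMX preemption timer with a delta
         * already expressed in host TSC cycles.
         */
        static void vmx_program_preemption_timer(u64 host_delta_tsc)
        {
                vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
                             (u32)(host_delta_tsc >> cpu_preemption_timer_multi));
                vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
                              PIN_BASED_VMX_PREEMPTION_TIMER);
        }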