From: Wanpeng Li <wanpengli@tencent.com> Using a moving average based on per-vCPU lapic_timer_advance_ns to tune smoothly, filter out drastic fluctuation which prevents this before, let's assume it is 10000 cycles. Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Signed-off-by: Wanpeng Li <wanpengli@tencent.com> --- arch/x86/kvm/lapic.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e904ff0..181537a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -69,6 +69,7 @@ #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +#define LAPIC_TIMER_ADVANCE_FILTER 10000 static inline int apic_test_vector(int vec, void *bitmap) { @@ -1484,23 +1485,28 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; - u64 ns; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns; + + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER) + /* filter out drastic fluctuations */ + return; /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= ns; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += ns; } + timer_advance_ns = (apic->lapic_timer.timer_advance_ns * + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) / + LAPIC_TIMER_ADVANCE_ADJUST_STEP; + if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) apic->lapic_timer.timer_advance_adjust_done = 
true; if (unlikely(timer_advance_ns > 5000)) { -- 2.7.4
From: Wanpeng Li <wanpengli@tencent.com> Even if for realtime CPUs, cache line bounces, frequency scaling, presence of higher-priority RT tasks, etc can still cause different response. These interferences should be considered and periodically revaluate whether or not the lapic_timer_advance_ns value is the best, do nothing if it is, otherwise recaluate again. Set lapic_timer_advance_ns to the minimal conservative value from all the estimated values. Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, before patch: 1628 4161 4321 3236 ... Testing on Skylake server, cat vcpu*/lapic_timer_advance_ns, after patch: 1553 1499 1509 1489 ... Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, before patch: 4617 3641 4102 4577 ... Testing on Haswell desktop, cat vcpu*/lapic_timer_advance_ns, after patch: 2775 2892 2764 2775 ... Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Signed-off-by: Wanpeng Li <wanpengli@tencent.com> --- arch/x86/kvm/lapic.c | 37 ++++++++++++++++++++++++++++++------- arch/x86/kvm/lapic.h | 2 ++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 181537a..4421583 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,6 +70,7 @@ /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 #define LAPIC_TIMER_ADVANCE_FILTER 10000 +#define LAPIC_TIMER_ADVANCE_RECALC_PERIOD (600 * HZ) static inline int apic_test_vector(int vec, void *bitmap) { @@ -1484,13 +1485,24 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, s64 advance_expire_delta) { - struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; + u32 timer_advance_ns = ktimer->timer_advance_ns, ns; if (abs(advance_expire_delta) > 
LAPIC_TIMER_ADVANCE_FILTER) /* filter out drastic flunctuations */ return; + /* periodic revaluate */ + if (unlikely(ktimer->timer_advance_adjust_done)) { + ktimer->recalc_timer_advance_ns = jiffies + + LAPIC_TIMER_ADVANCE_RECALC_PERIOD; + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_DONE) { + timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + ktimer->timer_advance_adjust_done = false; + } else + return; + } + /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; @@ -1503,17 +1515,24 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns; } - timer_advance_ns = (apic->lapic_timer.timer_advance_ns * + timer_advance_ns = (ktimer->timer_advance_ns * (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) / LAPIC_TIMER_ADVANCE_ADJUST_STEP; if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; + ktimer->timer_advance_adjust_done = true; if (unlikely(timer_advance_ns > 5000)) { timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; + ktimer->timer_advance_adjust_done = false; + } + + ktimer->timer_advance_ns = timer_advance_ns; + + if (ktimer->timer_advance_adjust_done) { + if (ktimer->min_timer_advance_ns > timer_advance_ns) + ktimer->min_timer_advance_ns = timer_advance_ns; + ktimer->timer_advance_ns = ktimer->min_timer_advance_ns; } - apic->lapic_timer.timer_advance_ns = timer_advance_ns; } static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) @@ -1532,7 +1551,8 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + if (unlikely(!apic->lapic_timer.timer_advance_adjust_done) || + time_before(apic->lapic_timer.recalc_timer_advance_ns, jiffies)) adjust_lapic_timer_advance(vcpu, 
apic->lapic_timer.advance_expire_delta); } @@ -2310,9 +2330,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; apic->lapic_timer.timer_advance_adjust_done = false; + apic->lapic_timer.recalc_timer_advance_ns = jiffies; + apic->lapic_timer.min_timer_advance_ns = UINT_MAX; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; apic->lapic_timer.timer_advance_adjust_done = true; + apic->lapic_timer.recalc_timer_advance_ns = MAX_JIFFY_OFFSET; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50053d2..56a05eb 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,6 +31,8 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; u64 expired_tscdeadline; + unsigned long recalc_timer_advance_ns; + u32 min_timer_advance_ns; u32 timer_advance_ns; s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ -- 2.7.4
On 28/08/19 10:19, Wanpeng Li wrote:
> From: Wanpeng Li <wanpengli@tencent.com>
>
> Use a moving average of the per-vCPU lapic_timer_advance_ns value to tune
> smoothly, and filter out the drastic fluctuations that previously
> prevented this; assume any delta above 10000 cycles is such a fluctuation.
>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: Radim Krčmář <rkrcmar@redhat.com>
> Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
> ---
> arch/x86/kvm/lapic.c | 18 ++++++++++++------
> 1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index e904ff0..181537a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -69,6 +69,7 @@
> #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
> /* step-by-step approximation to mitigate fluctuation */
> #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
> +#define LAPIC_TIMER_ADVANCE_FILTER 10000
>
> static inline int apic_test_vector(int vec, void *bitmap)
> {
> @@ -1484,23 +1485,28 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
> s64 advance_expire_delta)
> {
> struct kvm_lapic *apic = vcpu->arch.apic;
> - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
> - u64 ns;
> + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
> +
> + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER)
> + /* filter out drastic fluctuations */
> + return;
>
> /* too early */
> if (advance_expire_delta < 0) {
> ns = -advance_expire_delta * 1000000ULL;
> do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns -= min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns -= ns;
> } else {
> /* too late */
> ns = advance_expire_delta * 1000000ULL;
> do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns += min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns += ns;
> }
>
> + timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
> + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
> + LAPIC_TIMER_ADVANCE_ADJUST_STEP;
> +
> if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> apic->lapic_timer.timer_advance_adjust_done = true;
> if (unlikely(timer_advance_ns > 5000)) {
>
This looks great. But instead of patch 2, why not remove
timer_advance_adjust_done altogether?
Paolo
On 12/09/19 02:34, Wanpeng Li wrote: >>> - timer_advance_ns -= min((u32)ns, >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); >>> + timer_advance_ns -= ns; Looking more closely, this assignment... >>> } else { >>> /* too late */ >>> ns = advance_expire_delta * 1000000ULL; >>> do_div(ns, vcpu->arch.virtual_tsc_khz); >>> - timer_advance_ns += min((u32)ns, >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); >>> + timer_advance_ns += ns; ... and this one are dead code now. However... >>> } >>> >>> + timer_advance_ns = (apic->lapic_timer.timer_advance_ns * >>> + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) / >>> + LAPIC_TIMER_ADVANCE_ADJUST_STEP; ... you should instead remove this new assignment and just make the assignments above just timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP; and timer_advance += ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP; In fact this whole last assignment is buggy, since advance_expire_delta is in TSC units rather than nanoseconds. >>> if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) >>> apic->lapic_timer.timer_advance_adjust_done = true; >>> if (unlikely(timer_advance_ns > 5000)) { >> This looks great. But instead of patch 2, why not remove >> timer_advance_adjust_done altogether? > It can fluctuate w/o stop. Possibly because of the wrong calculation of timer_advance_ns? Paolo
On Thu, 12 Sep 2019 at 20:37, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 12/09/19 02:34, Wanpeng Li wrote: > >>> - timer_advance_ns -= min((u32)ns, > >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); > >>> + timer_advance_ns -= ns; > > Looking more closely, this assignment... > > >>> } else { > >>> /* too late */ > >>> ns = advance_expire_delta * 1000000ULL; > >>> do_div(ns, vcpu->arch.virtual_tsc_khz); > >>> - timer_advance_ns += min((u32)ns, > >>> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); > >>> + timer_advance_ns += ns; > > ... and this one are dead code now. However... > > >>> } > >>> > >>> + timer_advance_ns = (apic->lapic_timer.timer_advance_ns * > >>> + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) / > >>> + LAPIC_TIMER_ADVANCE_ADJUST_STEP; > > ... you should instead remove this new assignment and just make the > assignments above just > > timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP; > > and > > timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP; > > In fact this whole last assignment is buggy, since advance_expire_delta > is in TSC units rather than nanoseconds. > > >>> if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) > >>> apic->lapic_timer.timer_advance_adjust_done = true; > >>> if (unlikely(timer_advance_ns > 5000)) { > >> This looks great. But instead of patch 2, why not remove > >> timer_advance_adjust_done altogether? > > It can fluctuate w/o stop. > > Possibly because of the wrong calculation of timer_advance_ns? Hi Paolo, Something like below? It still fluctuate when running complex guest os like linux. Removing timer_advance_adjust_done will hinder introduce patch v3 2/2 since there is no adjust done flag in each round evaluation. I have two questions here, best-effort tune always as below or periodically revaluate to get conservative value and get best-effort value in each round evaluation as patch v3 2/2, which one do you prefer? 
The former one can waste time waiting sometimes and the latter one cannot get the best latency. In addition, can the adaptive tune algorithm be used in all the scenarios, including RT/over-subscription? ---8<--- diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 685d17c..895735b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -69,6 +69,7 @@ #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +#define LAPIC_TIMER_ADVANCE_FILTER 5000 static inline int apic_test_vector(int vec, void *bitmap) { @@ -1479,29 +1480,28 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; - u64 ns; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns; + + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER || + abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) { + /* filter out random fluctuations */ + return; + } /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { + if (unlikely(timer_advance_ns > 5000)) timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; - } apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ 
-1521,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + if (lapic_timer_advance_ns == -1) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } @@ -2298,10 +2298,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; - apic->lapic_timer.timer_advance_adjust_done = true; } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50053d2..2aad7e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -35,7 +35,6 @@ struct kvm_timer { s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; - bool timer_advance_adjust_done; }; struct kvm_lapic { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93b0bd4..4f65ef1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -141,7 +141,7 @@ * advancement entirely. Any other value is used as-is and disables adaptive * tuning, i.e. allows priveleged userspace to set an exact advancement time. */ -static int __read_mostly lapic_timer_advance_ns = -1; +int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6594020..2c6ba86 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -301,6 +301,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, extern bool enable_vmware_backdoor; +extern int lapic_timer_advance_ns; extern int pi_inject_timer; extern struct static_key kvm_no_apic_vcpu;
On 16/09/19 06:02, Wanpeng Li wrote: > Hi Paolo, > > Something like below? It still fluctuate when running complex guest os > like linux. Removing timer_advance_adjust_done will hinder introduce > patch v3 2/2 since there is no adjust done flag in each round > evaluation. That's not important, since the adjustment would be continuous. How much fluctuation can you see? > I have two questions here, best-effort tune always as > below or periodically revaluate to get conservative value and get > best-effort value in each round evaluation as patch v3 2/2, which one > do you prefer? The former one can wast time to wait sometimes and the > later one can not get the best latency. In addition, can the adaptive > tune algorithm be using in all the scenarios contain > RT/over-subscribe? I prefer the former, like the patch below, mostly because of the extra complexity of the periodic reevaluation. Paolo > > ---8<--- > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 685d17c..895735b 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -69,6 +69,7 @@ > #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 > /* step-by-step approximation to mitigate fluctuation */ > #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 > +#define LAPIC_TIMER_ADVANCE_FILTER 5000 > > static inline int apic_test_vector(int vec, void *bitmap) > { > @@ -1479,29 +1480,28 @@ static inline void > adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, > s64 advance_expire_delta) > { > struct kvm_lapic *apic = vcpu->arch.apic; > - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; > - u64 ns; > + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns; > + > + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER || > + abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) { > + /* filter out random fluctuations */ > + return; > + } > > /* too early */ > if (advance_expire_delta < 0) { > ns = -advance_expire_delta * 1000000ULL; > do_div(ns, vcpu->arch.virtual_tsc_khz); > - 
timer_advance_ns -= min((u32)ns, > - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); > + timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; > } else { > /* too late */ > ns = advance_expire_delta * 1000000ULL; > do_div(ns, vcpu->arch.virtual_tsc_khz); > - timer_advance_ns += min((u32)ns, > - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); > + timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; > } > > - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) > - apic->lapic_timer.timer_advance_adjust_done = true; > - if (unlikely(timer_advance_ns > 5000)) { > + if (unlikely(timer_advance_ns > 5000)) > timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; > - apic->lapic_timer.timer_advance_adjust_done = false; > - } > apic->lapic_timer.timer_advance_ns = timer_advance_ns; > } > > @@ -1521,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) > if (guest_tsc < tsc_deadline) > __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); > > - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) > + if (lapic_timer_advance_ns == -1) > adjust_lapic_timer_advance(vcpu, > apic->lapic_timer.advance_expire_delta); > } > > @@ -2298,10 +2298,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int > timer_advance_ns) > apic->lapic_timer.timer.function = apic_timer_fn; > if (timer_advance_ns == -1) { > apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; > - apic->lapic_timer.timer_advance_adjust_done = false; > } else { > apic->lapic_timer.timer_advance_ns = timer_advance_ns; > - apic->lapic_timer.timer_advance_adjust_done = true; > } > > > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 50053d2..2aad7e2 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -35,7 +35,6 @@ struct kvm_timer { > s64 advance_expire_delta; > atomic_t pending; /* accumulated triggered timers */ > bool hv_timer_in_use; > - bool timer_advance_adjust_done; > }; > > struct kvm_lapic { > diff --git 
a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 93b0bd4..4f65ef1 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -141,7 +141,7 @@ > * advancement entirely. Any other value is used as-is and disables adaptive > * tuning, i.e. allows priveleged userspace to set an exact advancement time. > */ > -static int __read_mostly lapic_timer_advance_ns = -1; > +int __read_mostly lapic_timer_advance_ns = -1; > module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); > > static bool __read_mostly vector_hashing = true; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index 6594020..2c6ba86 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -301,6 +301,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, > unsigned long cr2, > > extern bool enable_vmware_backdoor; > > +extern int lapic_timer_advance_ns; > extern int pi_inject_timer; > > extern struct static_key kvm_no_apic_vcpu; >
On Mon, 16 Sep 2019 at 15:49, Paolo Bonzini <pbonzini@redhat.com> wrote: > > On 16/09/19 06:02, Wanpeng Li wrote: > > Hi Paolo, > > > > Something like below? It still fluctuate when running complex guest os > > like linux. Removing timer_advance_adjust_done will hinder introduce > > patch v3 2/2 since there is no adjust done flag in each round > > evaluation. > > That's not important, since the adjustment would be continuous. > > How much fluctuation can you see? After I add a trace_printk to observe more closely, the adjustment is continuous as expected. > > > I have two questions here, best-effort tune always as > > below or periodically revaluate to get conservative value and get > > best-effort value in each round evaluation as patch v3 2/2, which one > > do you prefer? The former one can wast time to wait sometimes and the > > later one can not get the best latency. In addition, can the adaptive > > tune algorithm be using in all the scenarios contain > > RT/over-subscribe? > > I prefer the former, like the patch below, mostly because of the extra > complexity of the periodic reevaluation. How about question two? Wanpeng