Re: [PATCH v11 3/4] softmmu/dirtylimit: implement virtual CPU throttle

From: Hyman Huang <huangy81@chinatelecom.cn>
To: Peter Xu <peterx@redhat.com>
Cc: "Eduardo Habkost" <eduardo@habkost.net>,
	"David Hildenbrand" <david@redhat.com>,
	"Juan Quintela" <quintela@redhat.com>,
	"Richard Henderson" <richard.henderson@linaro.org>,
	qemu-devel <qemu-devel@nongnu.org>,
	"Markus ArmBruster" <armbru@redhat.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Philippe Mathieu-Daudé" <philmd@redhat.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>
Subject: Re: [PATCH v11 3/4] softmmu/dirtylimit: implement virtual CPU throttle
Date: Mon, 24 Jan 2022 12:20:55 +0800	[thread overview]
Message-ID: <7f0570c7-2fc3-fd76-d34e-4da43cd331ae@chinatelecom.cn> (raw)
In-Reply-To: <YeUbhC7MG32K9pxu@xz-m1.local>


在 2022/1/17 15:32, Peter Xu 写道:
> On Wed, Jan 05, 2022 at 01:14:08AM +0800, huangy81@chinatelecom.cn wrote:
>>   ##
>> +# @DirtyLimitInfo:
>> +#
>> +# Dirty page rate limit information of virtual CPU.
>> +#
>> +# @cpu-index: index of virtual CPU.
>> +#
>> +# @limit-rate: upper limit of dirty page rate for virtual CPU.
>> +#
>> +# @current-rate: current dirty page rate for virtual CPU.
> 
> Please consider spell out the unit too for all these dirty rate fields (MB/s).
> 
>> +#
>> +# Since: 7.0
>> +#
>> +##
>> +{ 'struct': 'DirtyLimitInfo',
>> +  'data': { 'cpu-index': 'int',
>> +            'limit-rate': 'int64',
>> +            'current-rate': 'int64' } }
>> +
>> +##
>>   # @snapshot-save:
>>   #
>>   # Save a VM snapshot
>> diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
>> index a10ac6f..c9f5745 100644
>> --- a/softmmu/dirtylimit.c
>> +++ b/softmmu/dirtylimit.c
>> @@ -18,6 +18,26 @@
>>   #include "sysemu/dirtylimit.h"
>>   #include "exec/memory.h"
>>   #include "hw/boards.h"
>> +#include "sysemu/kvm.h"
>> +#include "trace.h"
>> +
>> +/*
>> + * Dirtylimit stop working if dirty page rate error
>> + * value less than DIRTYLIMIT_TOLERANCE_RANGE
>> + */
>> +#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
>> +/*
>> + * Plus or minus vcpu sleep time linearly if dirty
>> + * page rate error value percentage over
>> + * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
>> + * Otherwise, plus or minus a fixed vcpu sleep time.
>> + */
>> +#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
>> +/*
>> + * Max vcpu sleep time percentage during a cycle
>> + * composed of dirty ring full and sleep time.
>> + */
>> +#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
> 
> (Thanks for the enriched comments)
> 
>> +static inline void dirtylimit_vcpu_set_quota(int cpu_index,
>> +                                             uint64_t quota,
>> +                                             bool on)
>> +{
>> +    dirtylimit_state->states[cpu_index].quota = quota;
> 
> To be clear, we could move this line into the "(on)" if condition, then in !on
> case we reset it.
> 
>> +    if (on) {
>> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
>> +            dirtylimit_state->limited_nvcpu++;
>> +        }
>> +    } else {
>> +        if (dirtylimit_state->states[cpu_index].enabled) {
>> +            dirtylimit_state->limited_nvcpu--;
>> +        }
>> +    }
>> +
>> +    dirtylimit_state->states[cpu_index].enabled = on;
>> +}
>> +
>> +static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
>> +{
>> +    static uint64_t max_dirtyrate;
>> +    uint32_t dirty_ring_size = kvm_dirty_ring_size();
>> +    uint64_t dirty_ring_size_meory_MB =
>> +        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
>> +
>> +    if (max_dirtyrate < dirtyrate) {
>> +        max_dirtyrate = dirtyrate;
>> +    }
>> +
>> +    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
>> +}
>> +
>> +static inline bool dirtylimit_done(uint64_t quota,
>> +                                   uint64_t current)
>> +{
>> +    uint64_t min, max;
>> +
>> +    min = MIN(quota, current);
>> +    max = MAX(quota, current);
>> +
>> +    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
>> +}
>> +
>> +static inline bool
>> +dirtylimit_need_linear_adjustment(uint64_t quota,
>> +                                  uint64_t current)
>> +{
>> +    uint64_t min, max, pct;
>> +
>> +    min = MIN(quota, current);
>> +    max = MAX(quota, current);
>> +
>> +    pct = (max - min) * 100 / max;
>> +
>> +    return pct > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
>> +}
>> +
>> +static void dirtylimit_set_throttle(CPUState *cpu,
>> +                                    uint64_t quota,
>> +                                    uint64_t current)
>> +{
>> +    int64_t ring_full_time_us = 0;
>> +    uint64_t sleep_pct = 0;
>> +    uint64_t throttle_us = 0;
>> +
>> +    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
>> +
>> +    if (dirtylimit_need_linear_adjustment(quota, current)) {
>> +        if (quota < current) {
>> +            sleep_pct = (current - quota) * 100 / current;
>> +            throttle_us =
>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>> +            cpu->throttle_us_per_full += throttle_us;
>> +        } else {
>> +            sleep_pct = (quota - current) * 100 / quota;
>> +            throttle_us =
>> +                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
>> +            cpu->throttle_us_per_full -= throttle_us;
>> +        }
>> +
>> +        trace_dirtylimit_throttle_pct(cpu->cpu_index,
>> +                                      sleep_pct,
>> +                                      throttle_us);
>> +    } else {
>> +        if (quota < current) {
>> +            cpu->throttle_us_per_full += ring_full_time_us / 10;
>> +        } else {
>> +            cpu->throttle_us_per_full -= ring_full_time_us / 10;
>> +        }
>> +    }
>> +
>> +    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
>> +        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
>> +
>> +    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
>> +}
> 
> This algorithm seems works even worse than the previous version, could you have
> a look on what's wrong?
> 
> See how it fluctuates when I set a throttle of 300MB/s:
> 
> (QMP) set-vcpu-dirty-limit dirty-rate=300
> 
> Dirty rate: 17622 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 17617 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 17611 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 13023 (MB/s), duration: 1153 (ms), load: 100.00%
> Dirty rate: 923 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 2853 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 1963 (MB/s), duration: 1040 (ms), load: 100.00%
> Dirty rate: 180 (MB/s), duration: 1006 (ms), load: 100.00%
> Dirty rate: 182 (MB/s), duration: 1007 (ms), load: 100.00%
> Dirty rate: 177 (MB/s), duration: 1005 (ms), load: 100.00%
> Dirty rate: 181 (MB/s), duration: 1007 (ms), load: 100.00%
> Dirty rate: 179 (MB/s), duration: 1006 (ms), load: 100.00%
> Dirty rate: 168 (MB/s), duration: 1005 (ms), load: 100.00%
> Dirty rate: 169 (MB/s), duration: 1006 (ms), load: 100.00%
> Dirty rate: 2717 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 2851 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 1773 (MB/s), duration: 1021 (ms), load: 100.00%
> Dirty rate: 177 (MB/s), duration: 1006 (ms), load: 100.00%
> Dirty rate: 179 (MB/s), duration: 1006 (ms), load: 100.00%
> Dirty rate: 175 (MB/s), duration: 1005 (ms), load: 100.00%
> Dirty rate: 1973 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 2878 (MB/s), duration: 1000 (ms), load: 100.00%
> Dirty rate: 1690 (MB/s), duration: 1022 (ms), load: 100.00%
> Dirty rate: 174 (MB/s), duration: 1005 (ms), load: 100.00%
> Dirty rate: 184 (MB/s), duration: 1006 (ms), load: 100.00%
> 
> This is the tool I'm using:
> 
> https://github.com/xzpeter/mig_mon#memory-dirty
> 
> Again, I won't ask for a good algorithm as the 1st version, but then I think
> it's nicer we have the simplest algorithm merged first, which should be very
> easy to verify.
> 
>> +
>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>> +{
>> +    uint64_t quota = 0;
>> +    uint64_t current = 0;
>> +    int cpu_index = cpu->cpu_index;
>> +
>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>> +    current = vcpu_dirty_rate_get(cpu_index);
>> +
>> +    if (current == 0 &&
>> +        dirtylimit_vcpu_get_state(cpu_index)->unmatched_cnt == 0) {
>> +        cpu->throttle_us_per_full = 0;
>> +        goto end;
>> +    } else if (++dirtylimit_vcpu_get_state(cpu_index)->unmatched_cnt
>> +               < 2) {
>> +        goto end;
>> +    } else if (dirtylimit_done(quota, current)) {
>> +        goto end;
>> +    } else {
>> +        dirtylimit_vcpu_get_state(cpu_index)->unmatched_cnt = 0;
>> +        dirtylimit_set_throttle(cpu, quota, current);
>> +    }
>> +end:
>> +    trace_dirtylimit_adjust_throttle(cpu_index,
>> +                                     quota, current,
>> +                                     cpu->throttle_us_per_full);
>> +    return;
>> +}
>> +
>> +static void *dirtylimit_thread(void *opaque)
>> +{
>> +    CPUState *cpu;
>> +
>> +    rcu_register_thread();
>> +
>> +    while (!qatomic_read(&dirtylimit_quit)) {
>> +        sleep(DIRTYLIMIT_CALC_TIME_MS / 1000);
> 
> Sorry to have not mentioned this: I think we probably don't even need this
> dirtylimit thread.
> 
> It'll be hard to make the "sleep" right here.. you could read two identical
> values from the dirty calc thread because the 1sec sleep is not accurate, so
> even after this sleep() the calc thread may not have provided the latest number
> yet.
> 
> It'll be much cleaner (and most importantly, accurate..) to me if we could make
> this a hook function being passed over to the vcpu_dirty_rate_stat_thread()
> thread, then after each vcpu_dirty_rate_stat_collect() we call the hook.
Ok, i remove the dirtylimit_thread and implemtment throttle in bottom
half instead, indeed, it become more accurate. What do you think of it?
> 
>> +
>> +        dirtylimit_state_lock();
>> +
>> +        if (!dirtylimit_in_service()) {
>> +            dirtylimit_state_unlock();
>> +            break;
>> +        }
>> +
>> +        CPU_FOREACH(cpu) {
>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>> +                continue;
>> +            }
>> +            dirtylimit_adjust_throttle(cpu);
>> +        }
>> +        dirtylimit_state_unlock();
>> +    }
>> +
>> +    rcu_unregister_thread();
>> +
>> +    return NULL;
>> +}
>> +
>> +static void dirtylimit_thread_start(void)
>> +{
>> +    qatomic_set(&dirtylimit_quit, 0);
>> +    qemu_thread_create(&dirtylimit_thr,
>> +                       "dirtylimit",
>> +                       dirtylimit_thread,
>> +                       NULL,
>> +                       QEMU_THREAD_JOINABLE);
>> +}
>> +
>> +static void dirtylimit_thread_stop(void)
>> +{
>> +    qatomic_set(&dirtylimit_quit, 1);
>> +    qemu_mutex_unlock_iothread();
>> +    qemu_thread_join(&dirtylimit_thr);
>> +    qemu_mutex_lock_iothread();
>> +}
>> +
>> +void dirtylimit_set_vcpu(int cpu_index,
>> +                         uint64_t quota,
>> +                         bool enable)
>> +{
>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>> +
>> +    if (enable) {
>> +        if (dirtylimit_in_service()) {
>> +            /* only set the vcpu dirty page rate limit */
>> +            dirtylimit_vcpu_set_quota(cpu_index, quota, true);
>> +            return;
>> +        }
>> +
>> +        /* initialize state when set dirtylimit first time */
>> +        dirtylimit_state_lock();
>> +        dirtylimit_state_initialize();
>> +        dirtylimit_vcpu_set_quota(cpu_index, quota, true);
>> +        dirtylimit_state_unlock();
>> +
>> +        dirtylimit_thread_start();
> 
> Can we move dirtylimit global initializations out of dirtylimit_set_vcpu() too?
> We should always keep init/cleanup of dirty_rate_calc and dirtylimit together,
> because they should, imho.  We never enable one of them.
> 
> I commented similarly in previous version on this.
> 
>> +    } else {
>> +        if (!dirtylimit_in_service()) {
>> +            return;
>> +        }
>> +
>> +        dirtylimit_state_lock();
>> +        /* dirty page rate limit is not enabled */
>> +        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
>> +            dirtylimit_state_unlock();
>> +            return;
>> +        }
>> +
>> +        /* switch off vcpu dirty page rate limit */
>> +        dirtylimit_vcpu_set_quota(cpu_index, 0, false);
>> +        dirtylimit_state_unlock();
>> +
>> +        if (!dirtylimit_state->limited_nvcpu) {
>> +            dirtylimit_thread_stop();
>> +
>> +            dirtylimit_state_lock();
>> +            dirtylimit_state_finalize();
>> +            dirtylimit_state_unlock();
> 
> We don't need such a fine control of locking, IMHO.. it can be a very big lock
> just to serialize things..
> 
> IMHO it could be as simple as:
> 
> void dirtylimit_set_vcpu(int cpu_index,
>                           uint64_t quota,
>                           bool enable)
> {
>      dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
>      trace_dirtylimit_set_vcpu(cpu_index, quota);
> }
> 
> void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
>                                uint64_t cpu_index,
>                                uint64_t dirty_rate,
>                                Error **errp)
> {
>      if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
>          error_setg(errp, "dirty page limit feature requires KVM with"
>                     " accelerator property 'dirty-ring-size' set'");
>          return;
>      }
> 
>      if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
>          error_setg(errp, "incorrect cpu index specified");
>          return;
>      }
> 
>      dirtylimit_state_lock();
> 
>      if (!dirtylimit_in_service()) {
>          /* TODO: we could have one helper to initialize all of them */
>          vcpu_dirty_rate_stat_initialize();
>          vcpu_dirty_rate_stat_start();
>          dirtylimit_state_initialize();
>          dirtylimit_vcpu_set_quota(cpu_index, quota, true);
>      }
> 
>      if (has_cpu_index) {
>          dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
>      } else {
>          dirtylimit_set_all(dirty_rate, true);
>      }
> 
>      dirtylimit_state_unlock();
> }
> 
> I didn't write the cleanup path, but it's the same: we should only cleanup all
> the global structs in cancel-vcpu-dirty-limit when we found there's zero vcpus
> in track, and it should be done once there.
> 
> Thanks,
> 

-- 
Best regard

Hyman Huang(黄勇)