From: Shivam Kumar <shivam.kumar1@nutanix.com>
To: pbonzini@redhat.com, seanjc@google.com, maz@kernel.org,
	james.morse@arm.com, borntraeger@linux.ibm.com, david@redhat.com
Cc: kvm@vger.kernel.org, Shaju Abraham <shaju.abraham@nutanix.com>,
	Manish Mishra <manish.mishra@nutanix.com>,
	Anurag Madnawat <anurag.madnawat@nutanix.com>
Subject: Re: [PATCH v6 1/5] KVM: Implement dirty quota-based throttling of vcpus
Date: Mon, 10 Oct 2022 11:11:03 +0530	[thread overview]
Message-ID: <21fce8d9-489f-0d7e-b1a6-5598f92453fe@nutanix.com> (raw)
In-Reply-To: <20220915101049.187325-2-shivam.kumar1@nutanix.com>



On 15/09/22 3:40 pm, Shivam Kumar wrote:
> Define variables to track and throttle memory dirtying for every vcpu.
> 
> dirty_count:    Number of pages the vcpu has dirtied since its creation,
>                  while dirty logging is enabled.
> dirty_quota:    Number of pages the vcpu is allowed to dirty. To dirty
>                  more, it needs to request more quota by exiting to
>                  userspace.
> 
> Implement the flow for throttling based on dirty quota.
> 
> i) Increment dirty_count for the vcpu whenever it dirties a page.
> ii) Exit to userspace whenever the dirty quota is exhausted (i.e. dirty
> count equals/exceeds dirty quota) to request more dirty quota.
> 
> Suggested-by: Shaju Abraham <shaju.abraham@nutanix.com>
> Suggested-by: Manish Mishra <manish.mishra@nutanix.com>
> Co-developed-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Anurag Madnawat <anurag.madnawat@nutanix.com>
> Signed-off-by: Shivam Kumar <shivam.kumar1@nutanix.com>
> ---
>   Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
>   include/linux/kvm_host.h       | 20 ++++++++++++++++++-
>   include/linux/kvm_types.h      |  1 +
>   include/uapi/linux/kvm.h       | 12 ++++++++++++
>   virt/kvm/kvm_main.c            | 26 ++++++++++++++++++++++---
>   5 files changed, 90 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index abd7c32126ce..97030a6a35b4 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6614,6 +6614,26 @@ array field represents return values. The userspace should update the return
>   values of SBI call before resuming the VCPU. For more details on RISC-V SBI
>   spec refer, https://github.com/riscv/riscv-sbi-doc.
>   
> +::
> +
> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
> +		struct {
> +			__u64 count;
> +			__u64 quota;
> +		} dirty_quota_exit;
> +
> +If the exit reason is KVM_EXIT_DIRTY_QUOTA_EXHAUSTED, it indicates that the VCPU
> +has exhausted its dirty quota. The 'dirty_quota_exit' member of the kvm_run
> +structure makes the following information available to userspace:
> +    count: the current count of pages dirtied by the VCPU, which can be
> +    skewed depending on the size of the pages accessed by the vCPU.
> +    quota: the observed dirty quota just before the exit to userspace.
> +
> +Userspace can design a strategy to distribute the VM's overall dirtying budget
> +among its vCPUs. Based on that strategy and the current state of dirty quota
> +throttling, userspace can decide to either update (increase) the quota or put
> +the VCPU to sleep for some time.
> +
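
For reviewers, a minimal sketch of how userspace could consume this exit;
the top-up size and the sleep below are illustrative policy choices, not
part of this series:

	#include <linux/kvm.h>
	#include <stdint.h>
	#include <unistd.h>

	#define QUOTA_INCREMENT	512	/* example top-up, in pages */

	/*
	 * Called when KVM_RUN returns with exit_reason ==
	 * KVM_EXIT_DIRTY_QUOTA_EXHAUSTED on this vCPU's kvm_run.
	 */
	static void handle_dirty_quota_exhausted(struct kvm_run *run)
	{
		/* Optionally throttle the vCPU before granting more quota. */
		usleep(1000);

		/* Grant more quota on top of what has already been dirtied. */
		run->dirty_quota = run->dirty_quota_exit.count + QUOTA_INCREMENT;
	}
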
>   ::
>   
>       /* KVM_EXIT_NOTIFY */
> @@ -6668,6 +6688,21 @@ values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set.
>   
>   ::
>   
> +	/*
> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the quota
> +	 * is reached/exceeded.
> +	 */
> +	__u64 dirty_quota;
> +
> +Please note that enforcing the quota is best effort, as the guest may dirty
> +multiple pages before KVM can recheck the quota.  However, unless KVM is using
> +a hardware-based dirty ring buffer, e.g. Intel's Page Modification Logging,
> +KVM will detect quota exhaustion within a handful of dirtied pages.  If a
> +hardware ring buffer is used, the overrun is bounded by the size of the buffer
> +(512 entries for PML).
> +
> +::
>     };
>   
>   
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f4519d3689e1..9acb28635d94 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -151,12 +151,13 @@ static inline bool is_error_page(struct page *page)
>   #define KVM_REQUEST_NO_ACTION      BIT(10)
>   /*
>    * Architecture-independent vcpu->requests bit members
> - * Bits 4-7 are reserved for more arch-independent bits.
> + * Bits 5-7 are reserved for more arch-independent bits.
>    */
>   #define KVM_REQ_TLB_FLUSH         (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>   #define KVM_REQ_VM_DEAD           (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>   #define KVM_REQ_UNBLOCK           2
>   #define KVM_REQ_UNHALT            3
> +#define KVM_REQ_DIRTY_QUOTA_EXIT  4
>   #define KVM_REQUEST_ARCH_BASE     8
>   
>   /*
> @@ -380,6 +381,8 @@ struct kvm_vcpu {
>   	 */
>   	struct kvm_memory_slot *last_used_slot;
>   	u64 last_used_slot_gen;
> +
> +	u64 dirty_quota;
>   };
>   
>   /*
> @@ -542,6 +545,21 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
>   	return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
>   }
>   
> +static inline int kvm_vcpu_check_dirty_quota(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_run *run = vcpu->run;
> +	u64 dirty_quota = READ_ONCE(run->dirty_quota);
> +	u64 pages_dirtied = vcpu->stat.generic.pages_dirtied;
> +
> +	if (!dirty_quota || (pages_dirtied < dirty_quota))
> +		return 1;
> +
> +	run->exit_reason = KVM_EXIT_DIRTY_QUOTA_EXHAUSTED;
> +	run->dirty_quota_exit.count = pages_dirtied;
> +	run->dirty_quota_exit.quota = dirty_quota;
> +	return 0;
> +}
> +
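
This helper is meant to be called from the arch vcpu run path when
KVM_REQ_DIRTY_QUOTA_EXIT is pending; the per-arch wiring comes in the later
patches of this series. A rough, hypothetical sketch of such a call site
(not code from this patch):

	static int handle_dirty_quota_request(struct kvm_vcpu *vcpu)
	{
		if (kvm_check_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu)) {
			/*
			 * Returns 0 after filling vcpu->run with the exit
			 * reason, count and quota, i.e. the vCPU must exit
			 * to userspace to request more quota.
			 */
			if (!kvm_vcpu_check_dirty_quota(vcpu))
				return 0;	/* exit to userspace */
		}
		return 1;			/* keep running the guest */
	}
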
>   /*
>    * Some of the bitops functions do not support too long bitmaps.
>    * This number must be determined not to exceed such limits.
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 3ca3db020e0e..263a588f3cd3 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -118,6 +118,7 @@ struct kvm_vcpu_stat_generic {
>   	u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT];
>   	u64 halt_wait_hist[HALT_POLL_HIST_COUNT];
>   	u64 blocking;
> +	u64 pages_dirtied;
I am reworking the QEMU patches and I am not sure how to access the
pages_dirtied info from the userspace side when migration starts, i.e.
without a dirty quota exit.

I need this info to initialise the dirty quota. This is what I am looking
to do on the userspace side at the start of dirty quota migration:
	dirty_quota = pages_dirtied + some initial quota
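
For concreteness, a rough sketch of the intended setup; the accessor below
is hypothetical and is exactly the missing piece:

	/* Hypothetical QEMU-side setup at the start of dirty quota migration. */
	extern uint64_t read_vcpu_pages_dirtied(void);	/* TBD: the open question */

	static void dirty_quota_migration_start(struct kvm_run *run,
						uint64_t initial_quota)
	{
		/* The quota is lifetime-based, so it must start above pages_dirtied. */
		run->dirty_quota = read_vcpu_pages_dirtied() + initial_quota;
	}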

Hoping you could help, Sean. Thanks in advance.
>   #define KVM_STATS_NAME_SIZE	48
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index eed0315a77a6..4c4a65b0f0a5 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -272,6 +272,7 @@ struct kvm_xen_exit {
>   #define KVM_EXIT_RISCV_SBI        35
>   #define KVM_EXIT_RISCV_CSR        36
>   #define KVM_EXIT_NOTIFY           37
> +#define KVM_EXIT_DIRTY_QUOTA_EXHAUSTED 38
>   
>   /* For KVM_EXIT_INTERNAL_ERROR */
>   /* Emulate instruction failed. */
> @@ -510,6 +511,11 @@ struct kvm_run {
>   #define KVM_NOTIFY_CONTEXT_INVALID	(1 << 0)
>   			__u32 flags;
>   		} notify;
> +		/* KVM_EXIT_DIRTY_QUOTA_EXHAUSTED */
> +		struct {
> +			__u64 count;
> +			__u64 quota;
> +		} dirty_quota_exit;
>   		/* Fix the size of the union. */
>   		char padding[256];
>   	};
> @@ -531,6 +537,12 @@ struct kvm_run {
>   		struct kvm_sync_regs regs;
>   		char padding[SYNC_REGS_SIZE_BYTES];
>   	} s;
> +	/*
> +	 * Number of pages the vCPU is allowed to have dirtied over its entire
> +	 * lifetime.  KVM_RUN exits with KVM_EXIT_DIRTY_QUOTA_EXHAUSTED if the
> +	 * quota is reached/exceeded.
> +	 */
> +	__u64 dirty_quota;
>   };
>   
>   /* for KVM_REGISTER_COALESCED_MMIO / KVM_UNREGISTER_COALESCED_MMIO */
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 584a5bab3af3..f315af50037d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3298,18 +3298,36 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
>   }
>   EXPORT_SYMBOL_GPL(kvm_clear_guest);
>   
> +static void kvm_vcpu_is_dirty_quota_exhausted(struct kvm_vcpu *vcpu)
> +{
> +	u64 dirty_quota = READ_ONCE(vcpu->run->dirty_quota);
> +
> +	if (!dirty_quota || (vcpu->stat.generic.pages_dirtied < dirty_quota))
> +		return;
> +
> +	/*
> +	 * Snapshot the quota to report it to userspace.  The dirty count will be
> +	 * captured when the request is processed.
> +	 */
> +	vcpu->dirty_quota = dirty_quota;
> +	kvm_make_request(KVM_REQ_DIRTY_QUOTA_EXIT, vcpu);
> +}
> +
>   void mark_page_dirty_in_slot(struct kvm *kvm,
>   			     const struct kvm_memory_slot *memslot,
>   		 	     gfn_t gfn)
>   {
>   	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
>   
> -#ifdef CONFIG_HAVE_KVM_DIRTY_RING
>   	if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
>   		return;
> -#endif
>   
> -	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
> +	if (!memslot)
> +		return;
> +
> +	WARN_ON_ONCE(!vcpu->stat.generic.pages_dirtied++);
> +
> +	if (kvm_slot_dirty_track_enabled(memslot)) {
>   		unsigned long rel_gfn = gfn - memslot->base_gfn;
>   		u32 slot = (memslot->as_id << 16) | memslot->id;
>   
> @@ -3318,6 +3336,8 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
>   					    slot, rel_gfn);
>   		else
>   			set_bit_le(rel_gfn, memslot->dirty_bitmap);
> +
> +		kvm_vcpu_is_dirty_quota_exhausted(vcpu);
>   	}
>   }
>   EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
