* [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages
@ 2011-12-11 22:22 Takuya Yoshikawa
  2011-12-11 22:24 ` [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion Takuya Yoshikawa
                   ` (3 more replies)
  0 siblings, 4 replies; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-11 22:22 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, yoshikawa.takuya

This patch set fixes mmu_shrink() as I said last week.

Though I did not change the tuning parameters, we can do that in the future
on top of this series: I think the batch size, 128, may be too large.

	Takuya


* [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion
  2011-12-11 22:22 [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
@ 2011-12-11 22:24 ` Takuya Yoshikawa
  2011-12-12  3:16   ` Xiao Guangrong
  2011-12-11 22:24 ` [PATCH 2/4] KVM: MMU: Make common preparation code for zapping sp into a function Takuya Yoshikawa
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-11 22:24 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, yoshikawa.takuya

From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>

Make it clear that this is not related to virtual memory.

Also remove the vm_ prefix from the corresponding member of struct kvm to
avoid the kvm->vm_ redundancy.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 Documentation/virtual/kvm/locking.txt |    2 +-
 arch/x86/include/asm/kvm_host.h       |    2 +-
 arch/x86/kvm/mmu.c                    |    4 ++--
 arch/x86/kvm/x86.c                    |    4 ++--
 include/linux/kvm_host.h              |    2 +-
 virt/kvm/kvm_main.c                   |   12 ++++++------
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..1a851be 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -12,7 +12,7 @@ KVM Lock Overview
 Name:		kvm_lock
 Type:		raw_spinlock
 Arch:		any
-Protects:	- vm_list
+Protects:	- kvm_list
 		- hardware virtualization enable/disable
 Comment:	'raw' because hardware enabling/disabling must be atomic /wrt
 		migration.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 020413a..186b2b0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -105,7 +105,7 @@
 #define ASYNC_PF_PER_VCPU 64
 
 extern raw_spinlock_t kvm_lock;
-extern struct list_head vm_list;
+extern struct list_head kvm_list;
 
 struct kvm_vcpu;
 struct kvm;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a2a9b4..590f76b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3911,7 +3911,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	raw_spin_lock(&kvm_lock);
 
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	list_for_each_entry(kvm, &kvm_list, list) {
 		int idx;
 		LIST_HEAD(invalid_list);
 
@@ -3930,7 +3930,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		srcu_read_unlock(&kvm->srcu, idx);
 	}
 	if (kvm_freed)
-		list_move_tail(&kvm_freed->vm_list, &vm_list);
+		list_move_tail(&kvm_freed->list, &kvm_list);
 
 	raw_spin_unlock(&kvm_lock);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eeeaf2e..96f118b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4566,7 +4566,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
 
 	raw_spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	list_for_each_entry(kvm, &kvm_list, list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
@@ -5857,7 +5857,7 @@ int kvm_arch_hardware_enable(void *garbage)
 	int i;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
+	list_for_each_entry(kvm, &kvm_list, list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			if (vcpu->cpu == smp_processor_id())
 				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c5c303..054b52e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,7 +256,7 @@ struct kvm {
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	atomic_t online_vcpus;
 	int last_boosted_vcpu;
-	struct list_head vm_list;
+	struct list_head list; /* the list of kvm instances */
 	struct mutex lock;
 	struct kvm_io_bus *buses[KVM_NR_BUSES];
 #ifdef CONFIG_HAVE_KVM_EVENTFD
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d8bac07..03ae960 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -70,8 +70,8 @@ MODULE_LICENSE("GPL");
  * 		kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_RAW_SPINLOCK(kvm_lock);
-LIST_HEAD(vm_list);
+DEFINE_RAW_SPINLOCK(kvm_lock);	/* protect kvm_list */
+LIST_HEAD(kvm_list);		/* the list of kvm instances */
 
 static cpumask_var_t cpus_hardware_enabled;
 static int kvm_usage_count = 0;
@@ -498,7 +498,7 @@ static struct kvm *kvm_create_vm(void)
 		goto out_err;
 
 	raw_spin_lock(&kvm_lock);
-	list_add(&kvm->vm_list, &vm_list);
+	list_add(&kvm->list, &kvm_list);
 	raw_spin_unlock(&kvm_lock);
 
 	return kvm;
@@ -573,7 +573,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 
 	kvm_arch_sync_events(kvm);
 	raw_spin_lock(&kvm_lock);
-	list_del(&kvm->vm_list);
+	list_del(&kvm->list);
 	raw_spin_unlock(&kvm_lock);
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++)
@@ -2626,7 +2626,7 @@ static int vm_stat_get(void *_offset, u64 *val)
 
 	*val = 0;
 	raw_spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
+	list_for_each_entry(kvm, &kvm_list, list)
 		*val += *(u32 *)((void *)kvm + offset);
 	raw_spin_unlock(&kvm_lock);
 	return 0;
@@ -2643,7 +2643,7 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 
 	*val = 0;
 	raw_spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
+	list_for_each_entry(kvm, &kvm_list, list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			*val += *(u32 *)((void *)vcpu + offset);
 
-- 
1.7.5.4



* [PATCH 2/4] KVM: MMU: Make common preparation code for zapping sp into a function
  2011-12-11 22:22 [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
  2011-12-11 22:24 ` [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion Takuya Yoshikawa
@ 2011-12-11 22:24 ` Takuya Yoshikawa
  2011-12-11 22:25 ` [PATCH 3/4] KVM: MMU: Make preparation for zapping some sp into a separate function Takuya Yoshikawa
  2011-12-11 22:26 ` [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
  3 siblings, 0 replies; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-11 22:24 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, yoshikawa.takuya

From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>

Use list_entry() instead of container_of() to take a shadow page from
the active_mmu_pages list.

Note: the return value of pre_zap_one_sp() will be used later.
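
For reference, list_entry() is simply a wrapper around container_of()
(as defined in include/linux/list.h), so this change only names the intent:

	#define list_entry(ptr, type, member) \
		container_of(ptr, type, member)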

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 arch/x86/kvm/mmu.c |   45 +++++++++++++++++++++++----------------------
 1 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 590f76b..b1e8270 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1930,6 +1930,26 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
+/**
+ * pre_zap_one_sp - make one shadow page ready for being freed
+ * @kvm: the kvm instance
+ * @invalid_list: the list to which we add shadow pages ready for being freed
+ *
+ * Take one shadow page from the tail of the active_mmu_pages list and make it
+ * ready for being freed, then put it into the @invalid_list.  Other pages,
+ * unsync children, may also be put into the @invalid_list.
+ *
+ * Return the number of shadow pages added to the @invalid_list this way.
+ */
+static int pre_zap_one_sp(struct kvm *kvm, struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	sp = list_entry(kvm->arch.active_mmu_pages.prev,
+			struct kvm_mmu_page, link);
+	return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+}
+
 static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *sp;
@@ -1999,11 +2019,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
 		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
 			!list_empty(&kvm->arch.active_mmu_pages)) {
-			struct kvm_mmu_page *page;
-
-			page = container_of(kvm->arch.active_mmu_pages.prev,
-					    struct kvm_mmu_page, link);
-			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
+			pre_zap_one_sp(kvm, &invalid_list);
 		}
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
@@ -3719,11 +3735,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 
 	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-		struct kvm_mmu_page *sp;
-
-		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-				  struct kvm_mmu_page, link);
-		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+		pre_zap_one_sp(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3890,16 +3902,6 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-						struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *page;
-
-	page = container_of(kvm->arch.active_mmu_pages.prev,
-			    struct kvm_mmu_page, link);
-	kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
-}
-
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
@@ -3919,8 +3921,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		spin_lock(&kvm->mmu_lock);
 		if (!kvm_freed && nr_to_scan > 0 &&
 		    kvm->arch.n_used_mmu_pages > 0) {
-			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-							    &invalid_list);
+			pre_zap_one_sp(kvm, &invalid_list);
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
-- 
1.7.5.4



* [PATCH 3/4] KVM: MMU: Make preparation for zapping some sp into a separate function
  2011-12-11 22:22 [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
  2011-12-11 22:24 ` [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion Takuya Yoshikawa
  2011-12-11 22:24 ` [PATCH 2/4] KVM: MMU: Make common preparation code for zapping sp into a function Takuya Yoshikawa
@ 2011-12-11 22:25 ` Takuya Yoshikawa
  2011-12-12  1:19   ` Takuya Yoshikawa
  2011-12-11 22:26 ` [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
  3 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-11 22:25 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, yoshikawa.takuya

From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>

This will be used for mmu_shrink() in the following patch.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 arch/x86/kvm/mmu.c |   36 ++++++++++++++++++++++++++----------
 1 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1e8270..fcd0dd1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2003,6 +2003,28 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 }
 
+/**
+ * pre_zap_some_sp - make some shadow pages ready for being freed
+ * @kvm: the kvm instance
+ * @invalid_list: the list to which we add shadow pages ready for being freed
+ * @nr_to_zap: how many shadow pages we want to zap
+ *
+ * Try to make @nr_to_zap shadow pages ready for being freed, then put them
+ * into the @invalid_list.
+ *
+ * Return the number of shadow pages actually added to the @invalid_list.
+ */
+static int pre_zap_some_sp(struct kvm *kvm, struct list_head *invalid_list,
+			   int nr_to_zap)
+{
+	int nr_before = kvm->arch.n_used_mmu_pages;
+
+	while (nr_to_zap > 0 && !list_empty(&kvm->arch.active_mmu_pages))
+		nr_to_zap -= pre_zap_one_sp(kvm, invalid_list);
+
+	return nr_before - kvm->arch.n_used_mmu_pages;
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
@@ -2010,17 +2032,11 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
 	LIST_HEAD(invalid_list);
-	/*
-	 * If we set the number of mmu pages to be smaller be than the
-	 * number of actived pages , we must to free some mmu pages before we
-	 * change the value
-	 */
+	int nr_to_zap = kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages;
 
-	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
-		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
-			!list_empty(&kvm->arch.active_mmu_pages)) {
-			pre_zap_one_sp(kvm, &invalid_list);
-		}
+	if (nr_to_zap > 0) {
+		/* free some shadow pages to make the number fit the goal */
+		pre_zap_some_sp(kvm, &invalid_list, nr_to_zap);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
-- 
1.7.5.4



* [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-11 22:22 [PATCH 0/4] KVM: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
                   ` (2 preceding siblings ...)
  2011-12-11 22:25 ` [PATCH 3/4] KVM: MMU: Make preparation for zapping some sp into a separate function Takuya Yoshikawa
@ 2011-12-11 22:26 ` Takuya Yoshikawa
  2011-12-16 11:06   ` Marcelo Tosatti
  3 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-11 22:26 UTC (permalink / raw)
  To: avi, mtosatti; +Cc: kvm, yoshikawa.takuya

From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>

Currently, mmu_shrink() tries to free a shadow page from one kvm and
does not use nr_to_scan correctly.

This patch fixes this by making it try to free some shadow pages from
each kvm.  The number of shadow pages each kvm frees becomes
proportional to the number of shadow pages it is using.

Note: an easy way to see how this code works is to do
  echo 3 > /proc/sys/vm/drop_caches
while some virtual machines are running.  Shadow pages will then be
zapped as expected.
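
As a rough illustration of the proportional split (hypothetical numbers):
with sc->nr_to_scan = 128 and two VMs using 300 and 100 shadow pages,
nr_total is 400, so the scan asks them to zap

	nr_to_zap = 128 * 300 / 400 = 96	(first VM)
	nr_to_zap = 128 * 100 / 400 = 32	(second VM)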

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 arch/x86/kvm/mmu.c |   23 ++++++++++++++---------
 1 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fcd0dd1..c6c61dd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3921,7 +3921,7 @@ restart:
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
-	struct kvm *kvm_freed = NULL;
+	int nr_to_zap, nr_total;
 	int nr_to_scan = sc->nr_to_scan;
 
 	if (nr_to_scan == 0)
@@ -3929,25 +3929,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	raw_spin_lock(&kvm_lock);
 
+	nr_total = percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+
 	list_for_each_entry(kvm, &kvm_list, list) {
 		int idx;
 		LIST_HEAD(invalid_list);
 
+		if (nr_to_scan <= 0) {
+			/* next time from this kvm */
+			list_move_tail(&kvm_list, &kvm->list);
+			break;
+		}
+
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		if (!kvm_freed && nr_to_scan > 0 &&
-		    kvm->arch.n_used_mmu_pages > 0) {
-			pre_zap_one_sp(kvm, &invalid_list);
-			kvm_freed = kvm;
-		}
-		nr_to_scan--;
 
+		/* proportional to how many shadow pages this kvm is using */
+		nr_to_zap = sc->nr_to_scan * kvm->arch.n_used_mmu_pages;
+		nr_to_zap /= nr_total;
+		nr_to_scan -= pre_zap_some_sp(kvm, &invalid_list, nr_to_zap);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 	}
-	if (kvm_freed)
-		list_move_tail(&kvm_freed->list, &kvm_list);
 
 	raw_spin_unlock(&kvm_lock);
 
-- 
1.7.5.4



* Re: [PATCH 3/4] KVM: MMU: Make preparation for zapping some sp into a separate function
  2011-12-11 22:25 ` [PATCH 3/4] KVM: MMU: Make preparation for zapping some sp into a separate function Takuya Yoshikawa
@ 2011-12-12  1:19   ` Takuya Yoshikawa
  0 siblings, 0 replies; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-12  1:19 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: avi, mtosatti, kvm

Takuya Yoshikawa <takuya.yoshikawa@gmail.com> wrote:

> @@ -2010,17 +2032,11 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
>  {
>  	LIST_HEAD(invalid_list);
> -	/*
> -	 * If we set the number of mmu pages to be smaller be than the
> -	 * number of actived pages , we must to free some mmu pages before we
> -	 * change the value
> -	 */
> +	int nr_to_zap = kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages;

Sorry, should have been:
	int nr_to_zap = kvm->arch.n_used_mmu_pages - goal_nr_mmu_pages;

I will fix this after getting some comments.

	Takuya

>  
> -	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
> -		while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
> -			!list_empty(&kvm->arch.active_mmu_pages)) {
> -			pre_zap_one_sp(kvm, &invalid_list);
> -		}
> +	if (nr_to_zap > 0) {
> +		/* free some shadow pages to make the number fit the goal */
> +		pre_zap_some_sp(kvm, &invalid_list, nr_to_zap);
>  		kvm_mmu_commit_zap_page(kvm, &invalid_list);
>  		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
>  	}
> -- 
> 1.7.5.4
> 


-- 
Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>


* Re: [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion
  2011-12-11 22:24 ` [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion Takuya Yoshikawa
@ 2011-12-12  3:16   ` Xiao Guangrong
  2011-12-12  4:04     ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Xiao Guangrong @ 2011-12-12  3:16 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: avi, mtosatti, kvm, yoshikawa.takuya

On 12/12/2011 06:24 AM, Takuya Yoshikawa wrote:

> From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
> 
> Make it clear that this is not related to virtual memory.
> 


'vm' means 'virtual machine'...

> Also remove the vm_ prefix from the corresponding member of struct kvm to
> avoid the kvm->vm_ redundancy.
> 
> Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>




* Re: [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion
  2011-12-12  3:16   ` Xiao Guangrong
@ 2011-12-12  4:04     ` Takuya Yoshikawa
  2011-12-12  4:51       ` Xiao Guangrong
  0 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-12  4:04 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Takuya Yoshikawa, avi, mtosatti, kvm

(2011/12/12 12:16), Xiao Guangrong wrote:
> On 12/12/2011 06:24 AM, Takuya Yoshikawa wrote:
>
>> From: Takuya Yoshikawa<yoshikawa.takuya@oss.ntt.co.jp>
>>
>> Make it clear that this is not related to virtual memory.
>>
>
>
> 'vm' means 'virtual machine'...

Of course I know.  So I wrote "not related" to virtual memory.

What's your point?

	Takuya


* Re: [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion
  2011-12-12  4:04     ` Takuya Yoshikawa
@ 2011-12-12  4:51       ` Xiao Guangrong
  2011-12-12  7:10         ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Xiao Guangrong @ 2011-12-12  4:51 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Takuya Yoshikawa, avi, mtosatti, kvm

On 12/12/2011 12:04 PM, Takuya Yoshikawa wrote:

> (2011/12/12 12:16), Xiao Guangrong wrote:
>> On 12/12/2011 06:24 AM, Takuya Yoshikawa wrote:
>>
>>> From: Takuya Yoshikawa<yoshikawa.takuya@oss.ntt.co.jp>
>>>
>>> Make it clear that this is not related to virtual memory.
>>>
>>
>>
>> 'vm' means 'virtual machine'...
> 
> Of course I know.  So I wrote "not related" to virtual memory.
> 
> What's your point?
> 


In the code, we have kvm_create_vm()/kvm_destroy_vm(), then
add/delete the 'vm' to/from the 'vm_list'; it is really clear,
so I think this name is OK. :)



* Re: [PATCH 1/4] KVM: Rename vm_list to kvm_list to avoid confusion
  2011-12-12  4:51       ` Xiao Guangrong
@ 2011-12-12  7:10         ` Takuya Yoshikawa
  0 siblings, 0 replies; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-12  7:10 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Takuya Yoshikawa, avi, mtosatti, kvm

(2011/12/12 13:51), Xiao Guangrong wrote:
> On 12/12/2011 12:04 PM, Takuya Yoshikawa wrote:
>
>> (2011/12/12 12:16), Xiao Guangrong wrote:
>>> On 12/12/2011 06:24 AM, Takuya Yoshikawa wrote:
>>>
>>>> From: Takuya Yoshikawa<yoshikawa.takuya@oss.ntt.co.jp>
>>>>
>>>> Make it clear that this is not related to virtual memory.
>>>>
>>>
>>>
>>> 'vm' means 'virtual machine'...
>>
>> Of course I know.  So I wrote "not related" to virtual memory.
>>
>> What's your point?
>>
>
>
> In the code, we have kvm_create_vm()/kvm_destroy_vm(), then
> add/delete the 'vm' to/from the 'vm_list'; it is really clear,
> so I think this name is OK. :)
>

Some reasons I wanted to change this:

	- The lock which protects this list is called kvm_lock, not vm_lock
	- Some architectures already use the name vm_list for a VM region member
	- The list connects kvm instances (struct kvm) and we are doing
	  list_for_each_entry(kvm, &vm_list, vm_list), not
	  list_for_each_entry(vm, &vm_list, vm_list)
	  (see the expansion sketch below)
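
A sketch of how that iteration expands (roughly, per include/linux/list.h):

	for (kvm = list_entry(vm_list.next, struct kvm, vm_list);
	     &kvm->vm_list != &vm_list;
	     kvm = list_entry(kvm->vm_list.next, struct kvm, vm_list))

Note how vm_list names both the global list head and the per-instance
member; after the rename this reads list_for_each_entry(kvm, &kvm_list, list).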

In the case of kvm_create_vm(), the name is fine: it creates not only a kvm
instance but also does more general virtual machine initialization, so the
_vm suffix is reasonable.  (I do not mind if it is static in kvm_main.c, but
it is more widely used.)


But I do not mind dropping this patch if other people also want to keep the
name, so I will wait for some more comments.


Thanks,
	Takuya


* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-11 22:26 ` [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages Takuya Yoshikawa
@ 2011-12-16 11:06   ` Marcelo Tosatti
  2011-12-16 14:58     ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Marcelo Tosatti @ 2011-12-16 11:06 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: avi, kvm, yoshikawa.takuya

On Mon, Dec 12, 2011 at 07:26:47AM +0900, Takuya Yoshikawa wrote:
> From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
> 
> Currently, mmu_shrink() tries to free a shadow page from one kvm and
> does not use nr_to_scan correctly.
> 
> This patch fixes this by making it try to free some shadow pages from
> each kvm.  The number of shadow pages each kvm frees becomes
> proportional to the number of shadow pages it is using.
> 
> Note: an easy way to see how this code works is to do
>   echo 3 > /proc/sys/vm/drop_caches
> while some virtual machines are running.  Shadow pages will then be
> zapped as expected.

I'm not sure this is a meaningful test to verify this change is
worthwhile, because while the shrinker tries to free a shadow page from
one vm, that vm's position in the kvm list is changed, so over time
the shrinker will cycle over all VMs.
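
(The rotation in question is the tail of the current mmu_shrink(), shown
here with the pre-series naming:

	if (kvm_freed)
		list_move_tail(&kvm_freed->vm_list, &vm_list);

i.e. the VM that was just shrunk goes to the back of the list, so the next
scan starts from a different VM.)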

Can you measure whether there is a significant difference in a synthetic
workload, and what that change is? Perhaps apply {moderate, high} memory
pressure load with {2, 4, 8, 16} VMs or something like that.



* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-16 11:06   ` Marcelo Tosatti
@ 2011-12-16 14:58     ` Takuya Yoshikawa
  2011-12-19  8:43       ` Avi Kivity
  0 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-16 14:58 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: avi, kvm, yoshikawa.takuya

On Fri, 16 Dec 2011 09:06:11 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> On Mon, Dec 12, 2011 at 07:26:47AM +0900, Takuya Yoshikawa wrote:
> > From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
> > 
> > Currently, mmu_shrink() tries to free a shadow page from one kvm and
> > does not use nr_to_scan correctly.
> > 
> > This patch fixes this by making it try to free some shadow pages from
> > each kvm.  The number of shadow pages each kvm frees becomes
> > proportional to the number of shadow pages it is using.
> > 
> > Note: an easy way to see how this code works is to do
> >   echo 3 > /proc/sys/vm/drop_caches
> > while some virtual machines are running.  Shadow pages will then be
> > zapped as expected.
> 
> I'm not sure this is a meaningful test to verify this change is
> worthwhile, because while the shrinker tries to free a shadow page from
> one vm, that vm's position in the kvm list is changed, so over time
> the shrinker will cycle over all VMs.

The test was for checking if mmu_shrink() would work as intended.  Maybe
not good as a changelog, sorry.


I admit that I could not find any strong reason except for protecting the
host from intentionally induced shadowing.

But for that, don't you think that freeing the same number of shadow pages
from good and bad VMs equally is a bad thing?

My method will try to free many shadow pages from VMs with many shadow
pages;  e.g. if there is a pathological increase in shadow pages for one
VM, that one will be intensively treated.

If you can agree on this reasoning, I will update the description and resend.

> 
> Can you measure whether there is a significant difference in a synthetic
> workload, and what that change is? Perhaps apply {moderate, high} memory
> pressure load with {2, 4, 8, 16} VMs or something like that.
> 

I was running 4 VMs on my machine under sufficiently high memory pressure.  The
problem was that mmu_shrink() was not tuned to be called under usual memory
pressure: what I did was change the seeks and batch parameters and set ept=0.

At least, I have checked that if I make one VM do many meaningless copies while
the others keep silent, the shrinker frees shadow pages intensively from that one.


Anyway, I don't think having the shrinker call mmu_shrink() with the default
batch size, nr_to_scan=128, and then free just one shadow page is good
behaviour.


	Takuya


* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-16 14:58     ` Takuya Yoshikawa
@ 2011-12-19  8:43       ` Avi Kivity
  2011-12-19  9:22         ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Avi Kivity @ 2011-12-19  8:43 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Marcelo Tosatti, kvm, yoshikawa.takuya

On 12/16/2011 04:58 PM, Takuya Yoshikawa wrote:
> > 
> > I'm not sure this is a meaningful test to verify this change is
> > worthwhile, because while the shrinker tries to free a shadow page from
> > one vm, that vm's position in the kvm list is changed, so over time
> > the shrinker will cycle over all VMs.
>
> The test was for checking if mmu_shrink() would work as intended.  Maybe
> not good as a changelog, sorry.
>
>
> I admit that I could not find any strong reason except for protecting the
> host from intentionally induced shadowing.
>
> But for that, don't you think that freeing the same number of shadow pages
> from good and bad VMs equally is a bad thing?
>
> My method will try to free many shadow pages from VMs with many shadow
> pages;  e.g. if there is a pathological increase in shadow pages for one
> VM, that one will be intensively treated.
>
> If you can agree on this reasoning, I will update the description and resend.

Well, if one guest is twice as large as other guests, then it will want
twice as many shadow pages.  So our goal should be to zap pages from the
guest with the highest (shadow pages / memory) ratio.
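
A minimal sketch of such a selection pass, only to illustrate the idea
(kvm_memory_pages() is a hypothetical helper for the guest's memory size;
nothing in this series implements it):

	/*
	 * Sketch: pick the VM with the highest ratio of shadow pages
	 * to guest memory pages.  Caller holds kvm_lock.
	 */
	struct kvm *kvm, *victim = NULL;
	unsigned long ratio, worst = 0;

	list_for_each_entry(kvm, &kvm_list, list) {
		/* scale by 100 so integer division keeps some precision */
		ratio = 100 * kvm->arch.n_used_mmu_pages /
			max(kvm_memory_pages(kvm), 1UL);
		if (ratio > worst) {
			worst = ratio;
			victim = kvm;
		}
	}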

> > 
> > Can you measure whether there is a significant difference in a synthetic
> > workload, and what that change is? Perhaps apply {moderate, high} memory
> > pressure load with {2, 4, 8, 16} VMs or something like that.
> > 
>
> I was running 4 VMs on my machine under sufficiently high memory pressure.
> The problem was that mmu_shrink() was not tuned to be called under usual
> memory pressure: what I did was change the seeks and batch parameters and
> set ept=0.
>
> At least, I have checked that if I make one VM do many meaningless copies
> while the others keep silent, the shrinker frees shadow pages intensively
> from that one.
>
>
> Anyway, I don't think having the shrinker call mmu_shrink() with the default
> batch size, nr_to_scan=128, and then free just one shadow page is good
> behaviour.

Yes, it's very conservative.  But on the other hand the shrinker is
tuned for dcache and icache, where there are usually tons of useless
objects.  If we have to free something, I'd rather free them instead of
mmu pages which tend to get recreated soon.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19  8:43       ` Avi Kivity
@ 2011-12-19  9:22         ` Takuya Yoshikawa
  2011-12-19  9:26           ` Avi Kivity
  0 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-19  9:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm

(2011/12/19 17:43), Avi Kivity wrote:
> Well, if one guest is twice as large as other guests, then it will want
> twice as many shadow pages.  So our goal should be to zap pages from the
> guest with the highest (shadow pages / memory) ratio.
>
>>>
>>> Can you measure whether there is a significant difference in a synthetic
>>> workload, and what that change is? Perhaps apply {moderate, high} memory
>>> pressure load with {2, 4, 8, 16} VMs or something like that.
>>>
>>
>> I was running 4 VMs on my machine under sufficiently high memory pressure.
>> The problem was that mmu_shrink() was not tuned to be called under usual
>> memory pressure: what I did was change the seeks and batch parameters and
>> set ept=0.
>>
>> At least, I have checked that if I make one VM do many meaningless copies
>> while the others keep silent, the shrinker frees shadow pages intensively
>> from that one.
>>
>>
>> Anyway, I don't think having the shrinker call mmu_shrink() with the default
>> batch size, nr_to_scan=128, and then free just one shadow page is good
>> behaviour.
>
> Yes, it's very conservative.  But on the other hand the shrinker is
> tuned for dcache and icache, where there are usually tons of useless
> objects.  If we have to free something, I'd rather free them instead of
> mmu pages which tend to get recreated soon.
>

OK, to satisfy the requirements, I will do:

	1. find the guest with the highest (shadow pages / memory) ratio
	2. just zap one page from that guest, keeping the current conservative rate

I will update the patch.

	Takuya


* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19  9:22         ` Takuya Yoshikawa
@ 2011-12-19  9:26           ` Avi Kivity
  2011-12-19  9:56             ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Avi Kivity @ 2011-12-19  9:26 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm

On 12/19/2011 11:22 AM, Takuya Yoshikawa wrote:
>> Yes, it's very conservative.  But on the other hand the shrinker is
>> tuned for dcache and icache, where there are usually tons of useless
>> objects.  If we have to free something, I'd rather free them instead of
>> mmu pages which tend to get recreated soon.
>>
>
>
> OK, to satisfy the requirements, I will do:
>
>     1. find the guest with the highest (shadow pages / memory) ratio

How do you propose to do that efficiently?  We may have hundreds of
guests, or even more, on one host.  Each guest access will involve
bouncing a few cache lines.

>     2. just zap one page from that guest, keeping the current
> conservative rate
>
> I will update the patch.

I think the current rate is too conservative.  No idea what a good one
is, I don't have a feeling as to the relation between shrinker callbacks
and memory pressure.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19  9:26           ` Avi Kivity
@ 2011-12-19  9:56             ` Takuya Yoshikawa
  2011-12-19 10:03               ` Avi Kivity
  0 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-19  9:56 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm

(2011/12/19 18:26), Avi Kivity wrote:
> On 12/19/2011 11:22 AM, Takuya Yoshikawa wrote:
>>> Yes, it's very conservative.  But on the other hand the shrinker is
>>> tuned for dcache and icache, where there are usually tons of useless
>>> objects.  If we have to free something, I'd rather free them instead of
>>> mmu pages which tend to get recreated soon.
>>>
>>
>>
>> OK, to satisfy the requirements, I will do:
>>
>>      1. find the guest with the highest (shadow pages / memory) ratio
>
> How do you propose to do that efficiently?  We may have hundreds of
> guests, or even more, on one host.  Each guest access will involve
> bouncing a few cache lines.

IMO, the goal should be restricted to emergencies.

So a possible solution may be:
	- set the tuning parameters as conservatively as possible
	- pick a guest with a relatively high ratio
	  (I have to think more about how to achieve this)
	- move the vm_list head for fairness (see the sketch below)

In an emergency, we should not mind the performance penalty so much.
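
The head-moving trick mentioned above is the one patch 4 already uses
(with the patch 1 naming):

	/*
	 * Move the global head right before this kvm in the circular
	 * list, so that the next scan starts from this kvm.
	 */
	list_move_tail(&kvm_list, &kvm->list);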

>
>>      2. just zap one page from that guest, keeping the current
>> conservative rate
>>
>> I will update the patch.
>
> I think the current rate is too conservative.  No idea what a good one
> is, I don't have a feeling as to the relation between shrinker callbacks
> and memory pressure.
>

When I tried to see what the current code is doing, frankly speaking,
I thought mmu_shrink() had not been tested enough from the beginning.

I read the shrinker code as far as possible and realized that the
combination of (seeks=10*default, batch=128) is not reasonable; the high
seeks value means the shrinker rarely calculates a value higher than 128,
so mmu_shrink() is almost never called in normal operation.
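
For reference, the shrink_slab() accounting is roughly the following
(a simplified sketch from memory; the exact details vary by kernel version):

	/* simplified sketch of mm/vmscan.c:shrink_slab() */
	delta = (4 * nr_pages_scanned / shrinker->seeks) * max_pass
		/ (lru_pages + 1);
	shrinker->nr += delta;
	while (shrinker->nr >= batch_size) {	/* batch_size = 128 here */
		shrinker->shrink(shrinker, sc);	/* sc->nr_to_scan = 128 */
		shrinker->nr -= batch_size;
	}

With seeks = 10 * DEFAULT_SEEKS, delta accumulates ten times more slowly
than for a default shrinker, so it rarely reaches the 128 batch threshold.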

How about setting the batch a bit lower, keeping seeks as is?

But there is no perfect value, because how often mmu_shrink() gets called
will change if the admin changes the sysctl_vfs_cache_pressure tuning
parameter for the dcache and icache, IIUC.

And tdp and shadow paging differ a lot.

	Takuya


* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19  9:56             ` Takuya Yoshikawa
@ 2011-12-19 10:03               ` Avi Kivity
  2011-12-19 10:21                 ` Takuya Yoshikawa
  0 siblings, 1 reply; 19+ messages in thread
From: Avi Kivity @ 2011-12-19 10:03 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm

On 12/19/2011 11:56 AM, Takuya Yoshikawa wrote:
> (2011/12/19 18:26), Avi Kivity wrote:
>> On 12/19/2011 11:22 AM, Takuya Yoshikawa wrote:
>>>> Yes, it's very conservative.  But on the other hand the shrinker is
>>>> tuned for dcache and icache, where there are usually tons of useless
>>>> objects.  If we have to free something, I'd rather free them
>>>> instead of
>>>> mmu pages which tend to get recreated soon.
>>>>
>>>
>>>
>>> OK, to satisfy the requirements, I will do:
>>>
>>>      1. find the guest with the highest (shadow pages / memory) ratio
>>
>> How do you propose to do that efficiently?  We may have hundreds of
>> guests, or even more, on one host.  Each guest access will involve
>> bouncing a few cache lines.
>
> IMO, the goal should be restricted to emergencies.
>
> So a possible solution may be:
>     - set the tuning parameters as conservatively as possible
>     - pick a guest with a relatively high ratio
>       (I have to think more about how to achieve this)
>     - move the vm_list head for fairness
>
> In an emergency, we should not mind the performance penalty so much.

But is the shrinker really only called in emergencies?

Also, with things like cgroups, we may have an emergency in one
container, but not in others - if the shrinker is not cgroup aware, it
soon will be.

>
>>
>>>      2. just zap one page from that guest, keeping the current
>>> conservative rate
>>>
>>> I will update the patch.
>>
>> I think the current rate is too conservative.  No idea what a good one
>> is, I don't have a feeling as to the relation between shrinker callbacks
>> and memory pressure.
>>
>
> When I tried to see what the current code is doing, frankly speaking,
> I thought mmu_shrink() had not been tested enough from the beginning.

It wasn't, and in fact the original code was even worse; the code we
have now is after some fixes.

>
> I read the shrinker code as far as possible and realized that the
> combination of (seeks=10*default, batch=128) is not reasonable; the high
> seeks value means the shrinker rarely calculates a value higher than 128,
> so mmu_shrink() is almost never called in normal operation.
>
> How about setting the batch a bit lower, keeping seeks as is?

Ok.

>
> But there is no perfect value, because how often mmu_shrink() gets called
> will change if the admin changes the sysctl_vfs_cache_pressure tuning
> parameter for the dcache and icache, IIUC.
>
> And tdp and shadow paging differ a lot.

We should aim for the following:
- normal operation causes very few shrinks (some are okay)
- high pressure mostly due to kvm results in kvm being shrunk (this is a
pathological case caused by starting a guest with a huge amount of
memory, mapping it all to /dev/zero (or ksm), and getting the guest
to create shadow mappings for all of it)
- general high pressure is shared among other caches like dcache and icache

The cost of reestablishing an mmu page can be as high as half a
millisecond of cpu time, which is the reason I want to be conservative.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19 10:03               ` Avi Kivity
@ 2011-12-19 10:21                 ` Takuya Yoshikawa
  2011-12-19 10:24                   ` Avi Kivity
  0 siblings, 1 reply; 19+ messages in thread
From: Takuya Yoshikawa @ 2011-12-19 10:21 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm

(2011/12/19 19:03), Avi Kivity wrote:
>> IMO, the goal should be restricted to emergencies.
>>
>> So a possible solution may be:
>>      - set the tuning parameters as conservatively as possible
>>      - pick a guest with a relatively high ratio
>>        (I have to think more about how to achieve this)
>>      - move the vm_list head for fairness
>>
>> In an emergency, we should not mind the performance penalty so much.
>
> But is the shrinker really only called in emergencies?

No, sadly.

That is the problem.

>
> Also, with things like cgroups, we may have an emergency in one
> container, but not in others - if the shrinker is not cgroup aware, it
> soon will be.

That seems to be a common problem for everyone, not only KVM.

>> But there is no perfect value, because how often mmu_shrink() gets called
>> will change if the admin changes the sysctl_vfs_cache_pressure tuning
>> parameter for the dcache and icache, IIUC.
>>
>> And tdp and shadow paging differ a lot.
>
> We should aim for the following:
> - normal operation causes very few shrinks (some are okay)
> - high pressure mostly due to kvm results in kvm being shrunk (this is a
> pathological case caused by starting a guest with a huge amount of
> memory, mapping it all to /dev/zero (or ksm), and getting the guest
> to create shadow mappings for all of it)
> - general high pressure is shared among other caches like dcache and icache
>
> The cost of reestablishing an mmu page can be as high as half a
> millisecond of cpu time, which is the reason I want to be conservative.
>

I agree with you.

I feel that I should add lkml to CC next time to hear from the mm specialists.
The shrinker has many heuristics added from a lot of experience; my lack of
such experience means I need help.

	Takuya


* Re: [PATCH 4/4] KVM: MMU: Make mmu_shrink() scan nr_to_scan shadow pages
  2011-12-19 10:21                 ` Takuya Yoshikawa
@ 2011-12-19 10:24                   ` Avi Kivity
  0 siblings, 0 replies; 19+ messages in thread
From: Avi Kivity @ 2011-12-19 10:24 UTC (permalink / raw)
  To: Takuya Yoshikawa; +Cc: Takuya Yoshikawa, Marcelo Tosatti, kvm, Andrea Arcangeli

On 12/19/2011 12:21 PM, Takuya Yoshikawa wrote:
> (2011/12/19 19:03), Avi Kivity wrote:
>>> IMO, the goal should be restricted to emergencies.
>>>
>>> So a possible solution may be:
>>>      - set the tuning parameters as conservatively as possible
>>>      - pick a guest with a relatively high ratio
>>>        (I have to think more about how to achieve this)
>>>      - move the vm_list head for fairness
>>>
>>> In an emergency, we should not mind the performance penalty so much.
>>
>> But is the shrinker really only called in emergencies?
>
> No, sadly.
>
> That is the problem.

It's not a problem - otherwise the icache and dcache would grow
indefinitely.

kvm's caches have another limit - perhaps we should remove it and let
the shrinker manage the caches itself (but that requires a better
selection algorithm).

>
>>
>> Also, with things like cgroups, we may have an emergency in one
>> container, but not in others - if the shrinker is not cgroup aware, it
>> soon will be.
>
> That seems to be a common problem for everyone, not KVM only.

It's really a requirement for dcache/icache, otherwise one container
could force out icache/dcache for another.  It doesn't concern us
directly but we have to ensure that one guest cannot force out another's
mmu pages.

>
>>> But there is no perfect value, because how often mmu_shrink() gets called
>>> will change if the admin changes the sysctl_vfs_cache_pressure tuning
>>> parameter for the dcache and icache, IIUC.
>>>
>>> And tdp and shadow paging differ a lot.
>>
>> We should aim for the following:
>> - normal operation causes very few shrinks (some are okay)
>> - high pressure mostly due to kvm results in kvm being shrunk (this is a
>> pathological case caused by starting a guest with a huge amount of
>> memory, mapping it all to /dev/zero (or ksm), and getting the guest
>> to create shadow mappings for all of it)
>> - general high pressure is shared among other caches like dcache and
>> icache
>>
>> The cost of reestablishing an mmu page can be as high as half a
>> millisecond of cpu time, which is the reason I want to be conservative.
>>
>
> I agree with you.
>
> I feel that I should add lkml in CC next time to hear from mm specialist.
> Shrinker has many heuristics added from a lot of experience; my lack of
> such experience means I need help.

Copying one expert.

-- 
error compiling committee.c: too many arguments to function

