* [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
@ 2012-04-23 16:16 Avi Kivity
  2012-04-24  1:17 ` Marcelo Tosatti
  2012-04-24  6:37 ` Xiao Guangrong
  0 siblings, 2 replies; 10+ messages in thread
From: Avi Kivity @ 2012-04-23 16:16 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, kvm

Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts during lockless shadow walk to force the freer
(kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
processor with interrupts enabled.

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
kvm_flush_remote_tlbs() from avoiding the IPI.

Signed-off-by: Avi Kivity <avi@redhat.com>
---

Turned out to be simpler than expected.  However, I think there's a problem
with make_all_cpus_request() possibly reading an incorrect vcpu->cpu.

 arch/x86/include/asm/kvm_host.h |    4 ---
 arch/x86/kvm/mmu.c              |   61 +++++++++++----------------------------
 include/linux/kvm_host.h        |    3 +-
 3 files changed, 19 insertions(+), 49 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f624ca7..67e66e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -237,8 +237,6 @@ struct kvm_mmu_page {
 #endif
 
 	int write_flooding_count;
-
-	struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -536,8 +534,6 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
-	atomic_t reader_counter;
-
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf..903af5e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -551,19 +551,23 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-	rcu_read_lock();
-	atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-	/* Increase the counter before walking shadow page table */
-	smp_mb__after_atomic_inc();
+	/*
+	 * Prevent page table teardown by making any free-er wait during
+	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
+	 */
+	local_irq_disable();
+	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+	/*
+	 * wmb: advertise vcpu->mode change
+	 * rmb: make sure we see updated sptes
+	 */
+	smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-	/* Decrease the counter after walking shadow page table finished */
-	smp_mb__before_atomic_dec();
-	atomic_dec(&vcpu->kvm->arch.reader_counter);
-	rcu_read_unlock();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -1989,30 +1993,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *sp;
-
-	list_for_each_entry(sp, invalid_list, link)
-		kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
-	struct kvm_mmu_page *next, *sp;
-
-	sp = container_of(head, struct kvm_mmu_page, rcu);
-	while (sp) {
-		if (!list_empty(&sp->link))
-			next = list_first_entry(&sp->link,
-				      struct kvm_mmu_page, link);
-		else
-			next = NULL;
-		kvm_mmu_free_page(sp);
-		sp = next;
-	}
-}
-
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -2021,25 +2001,18 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	if (list_empty(invalid_list))
 		return;
 
+	/*
+	 * Wait for all vcpus to exit guest mode and/or lockless shadow
+	 * page table walks.
+	 */
 	kvm_flush_remote_tlbs(kvm);
 
-	if (atomic_read(&kvm->arch.reader_counter)) {
-		kvm_mmu_isolate_pages(invalid_list);
-		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
-		list_del_init(invalid_list);
-
-		trace_kvm_mmu_delay_free_pages(sp);
-		call_rcu(&sp->rcu, free_pages_rcu);
-		return;
-	}
-
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
 		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
-
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 186ffab..d1f1adf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
-	EXITING_GUEST_MODE
+	EXITING_GUEST_MODE,
+	READING_SHADOW_PAGE_TABLES,
 };
 
 /*
-- 
1.7.10
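
For reference, the IPI decision that READING_SHADOW_PAGE_TABLES hooks into
lives in make_all_cpus_request() (virt/kvm/kvm_main.c), which backs
kvm_flush_remote_tlbs().  A condensed sketch of that loop as it looked around
this time (the cpumask allocation fallback and return value handling are
elided; the same loop is quoted further down the thread):

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		/*
		 * A vcpu inside walk_shadow_page_lockless_begin/end() is now
		 * in READING_SHADOW_PAGE_TABLES, i.e. != OUTSIDE_GUEST_MODE,
		 * so it is included in the IPI targets.  Because it runs with
		 * interrupts disabled, the synchronous IPI below cannot
		 * complete until the walker re-enables interrupts in
		 * walk_shadow_page_lockless_end().
		 */
		if (cpus != NULL && cpu != -1 && cpu != me &&
		    kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (cpus != NULL && !cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	put_cpu();

Under the old scheme a lockless walker stayed in OUTSIDE_GUEST_MODE and was
skipped here, which is why the reader_counter/RCU machinery was needed.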



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-23 16:16 [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking Avi Kivity
@ 2012-04-24  1:17 ` Marcelo Tosatti
  2012-04-24  9:24   ` Avi Kivity
  2012-04-24  6:37 ` Xiao Guangrong
  1 sibling, 1 reply; 10+ messages in thread
From: Marcelo Tosatti @ 2012-04-24  1:17 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, kvm

On Mon, Apr 23, 2012 at 07:16:52PM +0300, Avi Kivity wrote:
> Using RCU for lockless shadow walking can increase the amount of memory
> in use by the system, since RCU grace periods are unpredictable.  We also
> have an unconditional write to a shared variable (reader_counter), which
> isn't good for scaling.
> 
> Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> interrupts during lockless shadow walk to force the freer
> (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> processor with interrupts enabled.
> 
> We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> kvm_flush_remote_tlbs() from avoiding the IPI.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
> 
> Turned out to be simpler than expected.  However, I think there's a problem
> with make_all_cpus_request() possible reading an incorrect vcpu->cpu.
> 
>  arch/x86/include/asm/kvm_host.h |    4 ---
>  arch/x86/kvm/mmu.c              |   61 +++++++++++----------------------------
>  include/linux/kvm_host.h        |    3 +-
>  3 files changed, 19 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f624ca7..67e66e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,8 +237,6 @@ struct kvm_mmu_page {
>  #endif
>  
>  	int write_flooding_count;
> -
> -	struct rcu_head rcu;
>  };
>  
>  struct kvm_pio_request {
> @@ -536,8 +534,6 @@ struct kvm_arch {
>  	u64 hv_guest_os_id;
>  	u64 hv_hypercall;
>  
> -	atomic_t reader_counter;
> -
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
>  	#endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..903af5e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -551,19 +551,23 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
> -	rcu_read_lock();
> -	atomic_inc(&vcpu->kvm->arch.reader_counter);
> -
> -	/* Increase the counter before walking shadow page table */
> -	smp_mb__after_atomic_inc();
> +	/*
> +	 * Prevent page table teardown by making any free-er wait during
> +	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
> +	 */
> +	local_irq_disable();
> +	vcpu->mode = READING_SHADOW_PAGE_TABLES;
> +	/*
> +	 * wmb: advertise vcpu->mode change
> +	 * rmb: make sure we see updated sptes
> +	 */
> +	smp_mb();
>  }
>  
>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  {
> -	/* Decrease the counter after walking shadow page table finished */
> -	smp_mb__before_atomic_dec();
> -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> -	rcu_read_unlock();
> +	vcpu->mode = OUTSIDE_GUEST_MODE;
> +	local_irq_enable();
>  }
>  
>  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> @@ -1989,30 +1993,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
>  	return ret;
>  }
>  
> -static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
> -{
> -	struct kvm_mmu_page *sp;
> -
> -	list_for_each_entry(sp, invalid_list, link)
> -		kvm_mmu_isolate_page(sp);
> -}
> -
> -static void free_pages_rcu(struct rcu_head *head)
> -{
> -	struct kvm_mmu_page *next, *sp;
> -
> -	sp = container_of(head, struct kvm_mmu_page, rcu);
> -	while (sp) {
> -		if (!list_empty(&sp->link))
> -			next = list_first_entry(&sp->link,
> -				      struct kvm_mmu_page, link);
> -		else
> -			next = NULL;
> -		kvm_mmu_free_page(sp);
> -		sp = next;
> -	}
> -}
> -
>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  				    struct list_head *invalid_list)
>  {
> @@ -2021,25 +2001,18 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  	if (list_empty(invalid_list))
>  		return;
>  
> +	/*
> +	 * Wait for all vcpus to exit guest mode and/or lockless shadow
> +	 * page table walks.
> +	 */
>  	kvm_flush_remote_tlbs(kvm);
>  
> -	if (atomic_read(&kvm->arch.reader_counter)) {
> -		kvm_mmu_isolate_pages(invalid_list);
> -		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> -		list_del_init(invalid_list);
> -
> -		trace_kvm_mmu_delay_free_pages(sp);
> -		call_rcu(&sp->rcu, free_pages_rcu);
> -		return;
> -	}
> -
>  	do {
>  		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>  		WARN_ON(!sp->role.invalid || sp->root_count);
>  		kvm_mmu_isolate_page(sp);
>  		kvm_mmu_free_page(sp);
>  	} while (!list_empty(invalid_list));
> -
>  }
>  
>  /*
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 186ffab..d1f1adf 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
>  enum {
>  	OUTSIDE_GUEST_MODE,
>  	IN_GUEST_MODE,
> -	EXITING_GUEST_MODE
> +	EXITING_GUEST_MODE,
> +	READING_SHADOW_PAGE_TABLES,
>  };

Should add an explicit mb after prepare_zap_page? (currently rely on
unrelated ones internal to flush_remote_tlbs).



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-23 16:16 [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking Avi Kivity
  2012-04-24  1:17 ` Marcelo Tosatti
@ 2012-04-24  6:37 ` Xiao Guangrong
  2012-04-24  9:19   ` Avi Kivity
  1 sibling, 1 reply; 10+ messages in thread
From: Xiao Guangrong @ 2012-04-24  6:37 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 12:16 AM, Avi Kivity wrote:

> Using RCU for lockless shadow walking can increase the amount of memory
> in use by the system, since RCU grace periods are unpredictable.  We also
> have an unconditional write to a shared variable (reader_counter), which
> isn't good for scaling.
> 
> Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> interrupts during lockless shadow walk to force the freer
> (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> processor with interrupts enabled.
> 
> We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> kvm_flush_remote_tlbs() from avoiding the IPI.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
> 
> Turned out to be simpler than expected.  However, I think there's a problem
> with make_all_cpus_request() possible reading an incorrect vcpu->cpu.


It seems possible.

Can we fix it by reading vcpu->cpu when the vcpu is in GUEST_MODE or
EXITING_GUEST_MODE (IIRC, interrupts are disabled in these modes)?

Like:

if (kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
                      cpumask_set_cpu(vcpu->cpu, cpus);

> 
>  arch/x86/include/asm/kvm_host.h |    4 ---
>  arch/x86/kvm/mmu.c              |   61 +++++++++++----------------------------
>  include/linux/kvm_host.h        |    3 +-
>  3 files changed, 19 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f624ca7..67e66e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,8 +237,6 @@ struct kvm_mmu_page {
>  #endif
> 
>  	int write_flooding_count;
> -
> -	struct rcu_head rcu;
>  };
> 
>  struct kvm_pio_request {
> @@ -536,8 +534,6 @@ struct kvm_arch {
>  	u64 hv_guest_os_id;
>  	u64 hv_hypercall;
> 
> -	atomic_t reader_counter;
> -
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
>  	#endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..903af5e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -551,19 +551,23 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
> 
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
> -	rcu_read_lock();
> -	atomic_inc(&vcpu->kvm->arch.reader_counter);
> -
> -	/* Increase the counter before walking shadow page table */
> -	smp_mb__after_atomic_inc();
> +	/*
> +	 * Prevent page table teardown by making any free-er wait during
> +	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
> +	 */
> +	local_irq_disable();
> +	vcpu->mode = READING_SHADOW_PAGE_TABLES;
> +	/*
> +	 * wmb: advertise vcpu->mode change
> +	 * rmb: make sure we see updated sptes
> +	 */
> +	smp_mb();
>  }
> 
>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  {
> -	/* Decrease the counter after walking shadow page table finished */
> -	smp_mb__before_atomic_dec();
> -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> -	rcu_read_unlock();


We need an mb here to prevent the vcpu->mode store from being reordered ahead
of the spte reads/writes? (it is safe on x86, but we need a comment at least?)

Otherwise it looks good to me; I will measure it later.



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  6:37 ` Xiao Guangrong
@ 2012-04-24  9:19   ` Avi Kivity
  2012-04-24  9:23     ` Avi Kivity
  2012-04-24  9:54     ` Xiao Guangrong
  0 siblings, 2 replies; 10+ messages in thread
From: Avi Kivity @ 2012-04-24  9:19 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 09:37 AM, Xiao Guangrong wrote:
> On 04/24/2012 12:16 AM, Avi Kivity wrote:
>
> > Using RCU for lockless shadow walking can increase the amount of memory
> > in use by the system, since RCU grace periods are unpredictable.  We also
> > have an unconditional write to a shared variable (reader_counter), which
> > isn't good for scaling.
> > 
> > Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> > interrupts during lockless shadow walk to force the freer
> > (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> > processor with interrupts enabled.
> > 
> > We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> > kvm_flush_remote_tlbs() from avoiding the IPI.
> > 
> > Signed-off-by: Avi Kivity <avi@redhat.com>
> > ---
> > 
> > Turned out to be simpler than expected.  However, I think there's a problem
> > with make_all_cpus_request() possible reading an incorrect vcpu->cpu.
>
>
> It seems possible.
>
> Can we fix it by reading vcpu->cpu when the vcpu is in GUEST_MODE or
> EXITING_GUEST_MODE (IIRC, in these modes, interrupt is disabled)?
>
> Like:
>
> if (kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
>                       cpumask_set_cpu(vcpu->cpu, cpus);

I think it is actually okay.  We are only vulnerable if a lockless shadow
walk starts during prepare_zap_page() and extends past
kvm_flush_remote_tlbs(), yes?  But in that case, vcpu->cpu is stable,
since local_irq_disable() kills preemption.

> > 
> >  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> >  {
> > -	/* Decrease the counter after walking shadow page table finished */
> > -	smp_mb__before_atomic_dec();
> > -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> > -	rcu_read_unlock();
>
>
> We need a mb here to avoid that setting vcpu->mode is reordered to the head
> of reading/writing spte? (it is safe on x86, but we need a comment at least?)

I don't think so.  Documentation/memory-barriers says:

Any atomic operation that modifies some state in memory and returns
information about the state (old or new) implies an SMP-conditional general
memory barrier (smp_mb()) on each side of the actual operation (with the
exception of explicit lock operations, described later).  These include:

        xchg();
        cmpxchg();
        atomic_cmpxchg();
        atomic_inc_return();
        atomic_dec_return();
        atomic_add_return();
        atomic_sub_return();
        atomic_inc_and_test();
        atomic_dec_and_test();
        atomic_sub_and_test();
        atomic_add_negative();
        atomic_add_unless();    /* when succeeds (returns 1) */
        test_and_set_bit();
        test_and_clear_bit();
        test_and_change_bit();




-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:19   ` Avi Kivity
@ 2012-04-24  9:23     ` Avi Kivity
  2012-04-24  9:54     ` Xiao Guangrong
  1 sibling, 0 replies; 10+ messages in thread
From: Avi Kivity @ 2012-04-24  9:23 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 12:19 PM, Avi Kivity wrote:
> > We need a mb here to avoid that setting vcpu->mode is reordered to the head
> > of reading/writing spte? (it is safe on x86, but we need a comment at least?)
>
> I don't think so.  Documentation/memory-barriers says:
>
> Any atomic operation that modifies some state in memory and returns information
>

But we have some non-atomic writes to sptes.  Will fix.
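
A minimal sketch of such a fix, assuming the barrier simply goes in front of
the vcpu->mode store in walk_shadow_page_lockless_end() (an assumption based
on this exchange, not necessarily the final patch):

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * Make sure the spte reads/writes above cannot be reordered past
	 * the vcpu->mode store below; once mode is OUTSIDE_GUEST_MODE,
	 * kvm_mmu_commit_zap_page() no longer waits for this walker.
	 */
	smp_mb();
	vcpu->mode = OUTSIDE_GUEST_MODE;
	local_irq_enable();
}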

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  1:17 ` Marcelo Tosatti
@ 2012-04-24  9:24   ` Avi Kivity
  2012-05-14 12:41     ` Avi Kivity
  0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2012-04-24  9:24 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, kvm

On 04/24/2012 04:17 AM, Marcelo Tosatti wrote:
> On Mon, Apr 23, 2012 at 07:16:52PM +0300, Avi Kivity wrote:
> > Using RCU for lockless shadow walking can increase the amount of memory
> > in use by the system, since RCU grace periods are unpredictable.  We also
> > have an unconditional write to a shared variable (reader_counter), which
> > isn't good for scaling.
> > 
> > Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> > interrupts during lockless shadow walk to force the freer
> > (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> > processor with interrupts enabled.
> > 
> > We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> > kvm_flush_remote_tlbs() from avoiding the IPI.
> > 
>
> Should add an explicit mb after prepare_zap_page? (currently rely on
> unrelated ones internal to flush_remote_tlbs).

Yes.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:19   ` Avi Kivity
  2012-04-24  9:23     ` Avi Kivity
@ 2012-04-24  9:54     ` Xiao Guangrong
  2012-04-24 10:02       ` Avi Kivity
  1 sibling, 1 reply; 10+ messages in thread
From: Xiao Guangrong @ 2012-04-24  9:54 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 05:19 PM, Avi Kivity wrote:


>>> Turned out to be simpler than expected.  However, I think there's a problem
>>> with make_all_cpus_request() possible reading an incorrect vcpu->cpu.
>>
>>
>> It seems possible.
>>
>> Can we fix it by reading vcpu->cpu when the vcpu is in GUEST_MODE or
>> EXITING_GUEST_MODE (IIRC, in these modes, interrupt is disabled)?
>>
>> Like:
>>
>> if (kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
>>                       cpumask_set_cpu(vcpu->cpu, cpus);
> 
> I think it is actually okay.  We are only vulnerable if lockless shadow
> walk started during prepare_zap_page(), and extends past
> kvm_flush_remote_tlbs(), yes?  But in that case, vcpu->cpu is stable
> since local_irq_disable() kills preemption.
> 


This case can happen?

   VCPU 0                                           VCPU 1

kvm_for_each_vcpu(i, vcpu, kvm) {
	kvm_make_request(req, vcpu);

                                                  VCPU1 is running on CPU 1 out of guest mode

	cpu = vcpu->cpu;

	/* Set ->requests bit before we read ->mode */
	smp_mb();

	if (cpus != NULL && cpu != -1 && cpu != me &&

						 VCPU1 is scheduled to CPU 2, and running in
                                                 guest mode

	      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
		cpumask_set_cpu(cpu, cpus);
}

       VCPU 0 send IPI to CPU1, but actually, VCPU1 is running on CPU 2.



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:54     ` Xiao Guangrong
@ 2012-04-24 10:02       ` Avi Kivity
  2012-04-24 10:05         ` Xiao Guangrong
  0 siblings, 1 reply; 10+ messages in thread
From: Avi Kivity @ 2012-04-24 10:02 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 12:54 PM, Xiao Guangrong wrote:
> On 04/24/2012 05:19 PM, Avi Kivity wrote:
>
>
> >>> Turned out to be simpler than expected.  However, I think there's a problem
> >>> with make_all_cpus_request() possible reading an incorrect vcpu->cpu.
> >>
> >>
> >> It seems possible.
> >>
> >> Can we fix it by reading vcpu->cpu when the vcpu is in GUEST_MODE or
> >> EXITING_GUEST_MODE (IIRC, in these modes, interrupt is disabled)?
> >>
> >> Like:
> >>
> >> if (kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
> >>                       cpumask_set_cpu(vcpu->cpu, cpus);
> > 
> > I think it is actually okay.  We are only vulnerable if lockless shadow
> > walk started during prepare_zap_page(), and extends past
> > kvm_flush_remote_tlbs(), yes?  But in that case, vcpu->cpu is stable
> > since local_irq_disable() kills preemption.
> > 
>
>
> This case can happen?
>
>    VCPU 0                                           VCPU 1
>
> kvm_for_each_vcpu(i, vcpu, kvm) {
> 	kvm_make_request(req, vcpu);
>
>                                                   VCPU1 is running on CPU 1 out of guest mode
>
> 	cpu = vcpu->cpu;
>
> 	/* Set ->requests bit before we read ->mode */
> 	smp_mb();
>
> 	if (cpus != NULL && cpu != -1 && cpu != me &&
>
> 						 VCPU1 is scheduled to CPU 2, and running in
>                                                  guest mode
>
> 	      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
> 		cpumask_set_cpu(cpu, cpus);
> }
>
>        VCPU 0 send IPI to CPU1, but actually, VCPU1 is running on CPU 2.
>

It can happen, but it's benign.  After migration, vcpu1 will examine
vcpu->requests and flush the TLB.
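
That examination happens on the next entry into the guest.  Roughly, and
heavily condensed (3.4-era vcpu_enter_guest(), with the other requests and
bookkeeping elided):

	static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
	{
		...
		if (vcpu->requests) {
			...
			if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
				kvm_x86_ops->tlb_flush(vcpu);
			...
		}
		...
		vcpu->mode = IN_GUEST_MODE;

		/* Set ->mode before checking ->requests; pairs with the
		 * barrier in make_all_cpus_request(). */
		smp_mb();

		local_irq_disable();
		if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests ||
		    need_resched() || signal_pending(current)) {
			vcpu->mode = OUTSIDE_GUEST_MODE;
			local_irq_enable();
			/*
			 * Cancel the entry and go around again; the retry
			 * runs the KVM_REQ_TLB_FLUSH handling above.
			 */
			...
		}
		...
	}

So even if the IPI targets the wrong cpu, the request bit set by
make_all_cpus_request() ensures the flush happens before vcpu1 reenters the
guest with stale sptes.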

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24 10:02       ` Avi Kivity
@ 2012-04-24 10:05         ` Xiao Guangrong
  0 siblings, 0 replies; 10+ messages in thread
From: Xiao Guangrong @ 2012-04-24 10:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 06:02 PM, Avi Kivity wrote:


>>
>> 	      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
>> 		cpumask_set_cpu(cpu, cpus);
>> }
>>
>>        VCPU 0 send IPI to CPU1, but actually, VCPU1 is running on CPU 2.
>>
> 
> It can happen, but it's benign.  After migration, vcpu1 will examine
> vcpu->requests and flush the TLB.
> 


Yes, you are right, I forgot it.



* Re: [PATCH RFC] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:24   ` Avi Kivity
@ 2012-05-14 12:41     ` Avi Kivity
  0 siblings, 0 replies; 10+ messages in thread
From: Avi Kivity @ 2012-05-14 12:41 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, kvm

On 04/24/2012 12:24 PM, Avi Kivity wrote:
> On 04/24/2012 04:17 AM, Marcelo Tosatti wrote:
> > On Mon, Apr 23, 2012 at 07:16:52PM +0300, Avi Kivity wrote:
> > > Using RCU for lockless shadow walking can increase the amount of memory
> > > in use by the system, since RCU grace periods are unpredictable.  We also
> > > have an unconditional write to a shared variable (reader_counter), which
> > > isn't good for scaling.
> > > 
> > > Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> > > interrupts during lockless shadow walk to force the freer
> > > (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the
> > > processor with interrupts enabled.
> > > 
> > > We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> > > kvm_flush_remote_tlbs() from avoiding the IPI.
> > > 
> >
> > Should add an explicit mb after prepare_zap_page? (currently rely on
> > unrelated ones internal to flush_remote_tlbs).
>
> Yes.

Is that really true?  If the lockless shadow walk doesn't see the changes
made by prepare_zap_page(), it will still see valid sptes, and those will not
be reused (that is blocked by kvm_flush_remote_tlbs()).  If it does see the
changes, it won't access those sptes at all.  Either way we're fine.

-- 
error compiling committee.c: too many arguments to function


