* [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
@ 2012-04-24  9:47 Avi Kivity
  2012-04-24 10:13 ` Xiao Guangrong
  2012-04-26 22:00 ` Marcelo Tosatti
  0 siblings, 2 replies; 8+ messages in thread
From: Avi Kivity @ 2012-04-24  9:47 UTC (permalink / raw)
  To: Marcelo Tosatti, Xiao Guangrong, kvm

Using RCU for lockless shadow walking can increase the amount of memory
in use by the system, since RCU grace periods are unpredictable.  We also
have an unconditional write to a shared variable (reader_counter), which
isn't good for scaling.

Replace that with a scheme similar to x86's get_user_pages_fast(): disable
interrupts during the lockless shadow walk to force the freer
(kvm_mmu_commit_zap_page()) to wait, via the TLB flush IPI, until the
walking processor re-enables interrupts.

We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
kvm_flush_remote_tlbs() from avoiding the IPI.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |    4 ---
 arch/x86/kvm/mmu.c              |   72 +++++++++++++++------------------------
 include/linux/kvm_host.h        |    3 +-
 3 files changed, 30 insertions(+), 49 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f624ca7..67e66e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -237,8 +237,6 @@ struct kvm_mmu_page {
 #endif
 
 	int write_flooding_count;
-
-	struct rcu_head rcu;
 };
 
 struct kvm_pio_request {
@@ -536,8 +534,6 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
-	atomic_t reader_counter;
-
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07424cf..ef88034 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -551,19 +551,28 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
-	rcu_read_lock();
-	atomic_inc(&vcpu->kvm->arch.reader_counter);
-
-	/* Increase the counter before walking shadow page table */
-	smp_mb__after_atomic_inc();
+	/*
+	 * Prevent page table teardown by making any free-er wait during
+	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
+	 */
+	local_irq_disable();
+	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+	/*
+	 * wmb: advertise vcpu->mode change
+	 * rmb: make sure we see updated sptes
+	 */
+	smp_mb();
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 {
-	/* Decrease the counter after walking shadow page table finished */
-	smp_mb__before_atomic_dec();
-	atomic_dec(&vcpu->kvm->arch.reader_counter);
-	rcu_read_unlock();
+	/*
+	 * Make our reads and writes to shadow page tables globally visible
+	 * before leaving READING_SHADOW_PAGE_TABLES mode.
+	 */
+	smp_mb();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	local_irq_enable();
 }
 
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -1989,30 +1998,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
-	struct kvm_mmu_page *sp;
-
-	list_for_each_entry(sp, invalid_list, link)
-		kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
-	struct kvm_mmu_page *next, *sp;
-
-	sp = container_of(head, struct kvm_mmu_page, rcu);
-	while (sp) {
-		if (!list_empty(&sp->link))
-			next = list_first_entry(&sp->link,
-				      struct kvm_mmu_page, link);
-		else
-			next = NULL;
-		kvm_mmu_free_page(sp);
-		sp = next;
-	}
-}
-
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -2021,17 +2006,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	if (list_empty(invalid_list))
 		return;
 
-	kvm_flush_remote_tlbs(kvm);
-
-	if (atomic_read(&kvm->arch.reader_counter)) {
-		kvm_mmu_isolate_pages(invalid_list);
-		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
-		list_del_init(invalid_list);
+	/*
+	 * wmb: make sure everyone sees our modifications to the page tables
+	 * rmb: make sure we see changes to vcpu->mode
+	 */
+	smp_mb();
 
-		trace_kvm_mmu_delay_free_pages(sp);
-		call_rcu(&sp->rcu, free_pages_rcu);
-		return;
-	}
+	/*
+	 * Wait for all vcpus to exit guest mode and/or lockless shadow
+	 * page table walks.
+	 */
+	kvm_flush_remote_tlbs(kvm);
 
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -2039,7 +2024,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		kvm_mmu_isolate_page(sp);
 		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
-
 }
 
 /*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 186ffab..d1f1adf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 enum {
 	OUTSIDE_GUEST_MODE,
 	IN_GUEST_MODE,
-	EXITING_GUEST_MODE
+	EXITING_GUEST_MODE,
+	READING_SHADOW_PAGE_TABLES,
 };
 
 /*
-- 
1.7.10
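
Taken together, the walker and the freer pair up as below (a condensed
sketch of the scheme above, not the literal diff; that the IPI cannot
complete while a target CPU has interrupts disabled is a property of
smp_call_function()-style IPIs):

/*
 * Reader side: the lockless walker pins the shadow page tables by
 * keeping interrupts off, which holds off the TLB-flush IPI.
 */
static void lockless_walk_sketch(struct kvm_vcpu *vcpu)
{
	local_irq_disable();
	vcpu->mode = READING_SHADOW_PAGE_TABLES;
	smp_mb();			/* publish ->mode; see fresh sptes */

	/* ... walk sptes; the freer must wait for us ... */

	smp_mb();			/* retire the spte reads ...       */
	vcpu->mode = OUTSIDE_GUEST_MODE;
	local_irq_enable();		/* ... before the IPI can be acked */
}

/*
 * Writer side: the freer cannot get past the flush until every vcpu
 * that was in guest mode or in a lockless walk has taken the IPI.
 */
static void freer_sketch(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	smp_mb();			/* zaps visible; see ->mode  */
	kvm_flush_remote_tlbs(kvm);	/* waits out all walkers     */
	kvm_mmu_free_page(sp);		/* now safe to free          */
}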



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:47 [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking Avi Kivity
@ 2012-04-24 10:13 ` Xiao Guangrong
  2012-04-24 10:42   ` Avi Kivity
  2012-04-26 22:00 ` Marcelo Tosatti
  1 sibling, 1 reply; 8+ messages in thread
From: Xiao Guangrong @ 2012-04-24 10:13 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 05:47 PM, Avi Kivity wrote:


>  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  				    struct list_head *invalid_list)
>  {
> @@ -2021,17 +2006,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>  	if (list_empty(invalid_list))
>  		return;
> 
> -	kvm_flush_remote_tlbs(kvm);
> -
> -	if (atomic_read(&kvm->arch.reader_counter)) {
> -		kvm_mmu_isolate_pages(invalid_list);
> -		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> -		list_del_init(invalid_list);
> +	/*
> +	 * wmb: make sure everyone sees our modifications to the page tables


Another lockless vcpu can see an old copy of a spte, but that is OK, since
all the page tables are still valid to use. (kvm_flush_remote_tlbs() keeps
the page tables from being freed.)

> +	 * rmb: make sure we see changes to vcpu->mode


Isn't it the responsibility of kvm_flush_remote_tlbs() to see changes to
vcpu->mode?
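
(For reference, kvm_flush_remote_tlbs() reaches vcpu->mode through
make_all_cpus_request(); roughly, paraphrased and condensed from
virt/kvm/kvm_main.c of this era -- check the tree before relying on
the details:)

kvm_for_each_vcpu(i, vcpu, kvm) {
	kvm_make_request(req, vcpu);
	cpu = vcpu->cpu;
	smp_mb();	/* set the request bit before reading ->mode */
	/*
	 * The cmpxchg in kvm_vcpu_exiting_guest_mode() returns
	 * READING_SHADOW_PAGE_TABLES for a lockless walker, which is
	 * != OUTSIDE_GUEST_MODE, so that cpu lands in the IPI mask.
	 */
	if (cpu != -1 && cpu != me &&
	    kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
		cpumask_set_cpu(cpu, cpus);
}
smp_call_function_many(cpus, ack_flush, NULL, 1);	/* waits for acks */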



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24 10:13 ` Xiao Guangrong
@ 2012-04-24 10:42   ` Avi Kivity
  0 siblings, 0 replies; 8+ messages in thread
From: Avi Kivity @ 2012-04-24 10:42 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Marcelo Tosatti, kvm

On 04/24/2012 01:13 PM, Xiao Guangrong wrote:
> On 04/24/2012 05:47 PM, Avi Kivity wrote:
>
>
> >  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
> >  				    struct list_head *invalid_list)
> >  {
> > @@ -2021,17 +2006,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
> >  	if (list_empty(invalid_list))
> >  		return;
> > 
> > -	kvm_flush_remote_tlbs(kvm);
> > -
> > -	if (atomic_read(&kvm->arch.reader_counter)) {
> > -		kvm_mmu_isolate_pages(invalid_list);
> > -		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> > -		list_del_init(invalid_list);
> > +	/*
> > +	 * wmb: make sure everyone sees our modifications to the page tables
>
>
> Another lockless vcpu can see an old copy of a spte, but that is OK, since
> all the page tables are still valid to use. (kvm_flush_remote_tlbs() keeps
> the page tables from being freed.)

We want kvm_mmu_commit_zap_page() to see the changes before freeing memory.
But maybe it's the responsibility of kmem_cache_free()/kmem_cache_alloc().

> > +	 * rmb: make sure we see changes to vcpu->mode
>
>
> Isn't it the responsibility of kvm_flush_remote_tlbs() to see changes to
> vcpu->mode?

Yes.

-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-24  9:47 [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking Avi Kivity
  2012-04-24 10:13 ` Xiao Guangrong
@ 2012-04-26 22:00 ` Marcelo Tosatti
  2012-04-27  6:07   ` Xiao Guangrong
  2012-04-29  9:35   ` Avi Kivity
  1 sibling, 2 replies; 8+ messages in thread
From: Marcelo Tosatti @ 2012-04-26 22:00 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Xiao Guangrong, kvm

On Tue, Apr 24, 2012 at 12:47:25PM +0300, Avi Kivity wrote:
> Using RCU for lockless shadow walking can increase the amount of memory
> in use by the system, since RCU grace periods are unpredictable.  We also
> have an unconditional write to a shared variable (reader_counter), which
> isn't good for scaling.
> 
> Replace that with a scheme similar to x86's get_user_pages_fast(): disable
> interrupts during the lockless shadow walk to force the freer
> (kvm_mmu_commit_zap_page()) to wait, via the TLB flush IPI, until the
> walking processor re-enables interrupts.
> 
> We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent
> kvm_flush_remote_tlbs() from avoiding the IPI.
> 
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  arch/x86/include/asm/kvm_host.h |    4 ---
>  arch/x86/kvm/mmu.c              |   72 +++++++++++++++------------------------
>  include/linux/kvm_host.h        |    3 +-
>  3 files changed, 30 insertions(+), 49 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index f624ca7..67e66e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -237,8 +237,6 @@ struct kvm_mmu_page {
>  #endif
>  
>  	int write_flooding_count;
> -
> -	struct rcu_head rcu;
>  };
>  
>  struct kvm_pio_request {
> @@ -536,8 +534,6 @@ struct kvm_arch {
>  	u64 hv_guest_os_id;
>  	u64 hv_hypercall;
>  
> -	atomic_t reader_counter;
> -
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
>  	#endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 07424cf..ef88034 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -551,19 +551,28 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
> -	rcu_read_lock();
> -	atomic_inc(&vcpu->kvm->arch.reader_counter);
> -
> -	/* Increase the counter before walking shadow page table */
> -	smp_mb__after_atomic_inc();
> +	/*
> +	 * Prevent page table teardown by making any free-er wait during
> +	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
> +	 */
> +	local_irq_disable();
> +	vcpu->mode = READING_SHADOW_PAGE_TABLES;
> +	/*
> +	 * wmb: advertise vcpu->mode change
> +	 * rmb: make sure we see updated sptes
> +	 */
> +	smp_mb();
>  }
>  
>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>  {
> -	/* Decrease the counter after walking shadow page table finished */
> -	smp_mb__before_atomic_dec();
> -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> -	rcu_read_unlock();
> +	/*
> +	 * Make our reads and writes to shadow page tables globally visible
> +	 * before leaving READING_SHADOW_PAGE_TABLES mode.
> +	 */

This comment is misleading. Writes to shadow page tables must be
performed with locked instructions outside the mmu_lock.

> +	smp_mb();
> +	vcpu->mode = OUTSIDE_GUEST_MODE;

Don't you want 

vcpu->mode = OUTSIDE_GUEST_MODE;
smp_mb();

So that vcpu->mode update is globally visible before subsequent loads
execute?



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-26 22:00 ` Marcelo Tosatti
@ 2012-04-27  6:07   ` Xiao Guangrong
  2012-04-27 21:49     ` Marcelo Tosatti
  2012-04-29  9:35   ` Avi Kivity
  1 sibling, 1 reply; 8+ messages in thread
From: Xiao Guangrong @ 2012-04-27  6:07 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Avi Kivity, kvm

On 04/27/2012 06:00 AM, Marcelo Tosatti wrote:


>>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>>  {
>> -	/* Decrease the counter after walking shadow page table finished */
>> -	smp_mb__before_atomic_dec();
>> -	atomic_dec(&vcpu->kvm->arch.reader_counter);
>> -	rcu_read_unlock();
>> +	/*
>> +	 * Make our reads and writes to shadow page tables globally visible
>> +	 * before leaving READING_SHADOW_PAGE_TABLES mode.
>> +	 */
> 
> This comment is misleading. Writes to shadow page tables must be
> performed with locked instructions outside the mmu_lock.
> 


You mean that the writes should guarantee correct memory ordering by themselves?

>> +	smp_mb();
>> +	vcpu->mode = OUTSIDE_GUEST_MODE;
> 
> Don't you want 
> 
> vcpu->mode = OUTSIDE_GUEST_MODE;
> smp_mb();
> 


I think that is unsafe: it is a problem if a spte read / spte update is
reordered after vcpu->mode = OUTSIDE_GUEST_MODE, like below:

VCPU 0                                  VCPU 1
                                   kvm_mmu_commit_zap_page():
/*
 * the store to vcpu->mode is
 * reordered ahead of the spte read.
 */
vcpu->mode = OUTSIDE_GUEST_MODE;

                                   sees VCPU 0 out of guest mode; the IPI
                                   is not sent, and the sp is freed
                                   immediately.

read spte;
 OOPS!!!

(The read is invalid since the spte has been freed.)

smp_mb
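
Put differently, the barrier has to keep the spte reads ahead of the
vcpu->mode store; an illustrative sketch (not code from the patch):

/* The patch's ordering -- safe on all architectures: */
spte = *sptep;				/* last lockless spte read   */
smp_mb();				/* the read retires first    */
vcpu->mode = OUTSIDE_GUEST_MODE;	/* only now may the freer    */
					/* skip the IPI and free sp  */

/* The suggested ordering -- unsafe on weakly ordered CPUs: */
spte = *sptep;
vcpu->mode = OUTSIDE_GUEST_MODE;	/* the store may be hoisted  */
					/* above the read ...        */
smp_mb();				/* ... and a barrier after   */
					/* both cannot prevent that  */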




* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-27  6:07   ` Xiao Guangrong
@ 2012-04-27 21:49     ` Marcelo Tosatti
  2012-04-29  9:38       ` Avi Kivity
  0 siblings, 1 reply; 8+ messages in thread
From: Marcelo Tosatti @ 2012-04-27 21:49 UTC (permalink / raw)
  To: Xiao Guangrong; +Cc: Avi Kivity, kvm

On Fri, Apr 27, 2012 at 02:07:57PM +0800, Xiao Guangrong wrote:
> On 04/27/2012 06:00 AM, Marcelo Tosatti wrote:
> 
> 
> >>  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> >>  {
> >> -	/* Decrease the counter after walking shadow page table finished */
> >> -	smp_mb__before_atomic_dec();
> >> -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> >> -	rcu_read_unlock();
> >> +	/*
> >> +	 * Make our reads and writes to shadow page tables globally visible
> >> +	 * before leaving READING_SHADOW_PAGE_TABLES mode.
> >> +	 */
> > 
> > This comment is misleading. Writes to shadow page tables must be
> > performed with locked instructions outside the mmu_lock.
> > 
> 
> 
> You mean that the writes should guarantee correct memory ordering by themselves?

Yes.

> >> +	smp_mb();
> >> +	vcpu->mode = OUTSIDE_GUEST_MODE;
> > 
> > Don't you want 
> > 
> > vcpu->mode = OUTSIDE_GUEST_MODE;
> > smp_mb();
> > 
> 
> 
> I think that is unsafe: it is a problem if a spte read / spte update is
> reordered after vcpu->mode = OUTSIDE_GUEST_MODE, like below:
> 
> VCPU 0                                  VCPU 1
>                                    kvm_mmu_commit_zap_page():
> /*
>  * the store to vcpu->mode is
>  * reordered ahead of the spte read.
>  */
> vcpu->mode = OUTSIDE_GUEST_MODE;
> 
>                                    sees VCPU 0 out of guest mode; the IPI
>                                    is not sent, and the sp is freed
>                                    immediately.
> 
> read spte;
>  OOPS!!!
> 
> (The read is invalid since the spte has been freed.)
> 
> smp_mb

Right. In that case a compiler barrier is sufficient (stores are not 
reordered with earlier loads on x86).
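
i.e. something like this for the exit path (a sketch of the x86-only
variant, not what the patch does -- it leans on TSO):

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	/*
	 * x86 TSO never reorders a store with earlier loads, so a
	 * compiler barrier suffices to keep the preceding spte reads
	 * ahead of the vcpu->mode store.
	 */
	barrier();
	vcpu->mode = OUTSIDE_GUEST_MODE;
	local_irq_enable();
}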



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-26 22:00 ` Marcelo Tosatti
  2012-04-27  6:07   ` Xiao Guangrong
@ 2012-04-29  9:35   ` Avi Kivity
  1 sibling, 0 replies; 8+ messages in thread
From: Avi Kivity @ 2012-04-29  9:35 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, kvm

On 04/27/2012 01:00 AM, Marcelo Tosatti wrote:
> >  
> >  static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
> >  {
> > -	/* Decrease the counter after walking shadow page table finished */
> > -	smp_mb__before_atomic_dec();
> > -	atomic_dec(&vcpu->kvm->arch.reader_counter);
> > -	rcu_read_unlock();
> > +	/*
> > +	 * Make our reads and writes to shadow page tables globally visible
> > +	 * before leaving READING_SHADOW_PAGE_TABLES mode.
> > +	 */
>
> This comment is misleading. Writes to shadow page tables must be
> performed with locked instructions outside the mmu_lock.

You are correct with respect to writes.  I guess reads are immaterial
because they will either be protected by a write or discarded.
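
(A sketch of what "protected by a write" means here, assuming a
cmpxchg-based updater like the locked-instruction writes Marcelo
mentions; compute_new_spte() is a hypothetical helper:)

for (;;) {
	u64 old = mmu_spte_get_lockless(sptep);
	u64 new = compute_new_spte(old);	/* hypothetical */

	/*
	 * A stale read just makes the cmpxchg fail: the value is
	 * discarded and the update retried, so the lockless path
	 * needs no extra read barrier of its own.
	 */
	if (cmpxchg64(sptep, old, new) == old)
		break;
}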


-- 
error compiling committee.c: too many arguments to function



* Re: [PATCH v2] KVM: MMU: Don't use RCU for lockless shadow walking
  2012-04-27 21:49     ` Marcelo Tosatti
@ 2012-04-29  9:38       ` Avi Kivity
  0 siblings, 0 replies; 8+ messages in thread
From: Avi Kivity @ 2012-04-29  9:38 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Xiao Guangrong, kvm

On 04/28/2012 12:49 AM, Marcelo Tosatti wrote:
> > >> +	smp_mb();
> > >> +	vcpu->mode = OUTSIDE_GUEST_MODE;
> > > 
> > > Don't you want 
> > > 
> > > vcpu->mode = OUTSIDE_GUEST_MODE;
> > > smp_mb();
> > > 
> > 
> > 
> > I think that is unsafe: it is a problem if a spte read / spte update is
> > reordered after vcpu->mode = OUTSIDE_GUEST_MODE, like below:
> > 
> > VCPU 0                                  VCPU 1
> >                                    kvm_mmu_commit_zap_page():
> > /*
> >  * the store to vcpu->mode is
> >  * reordered ahead of the spte read.
> >  */
> > vcpu->mode = OUTSIDE_GUEST_MODE;
> > 
> >                                    sees VCPU 0 out of guest mode; the IPI
> >                                    is not sent, and the sp is freed
> >                                    immediately.
> > 
> > read spte;
> >  OOPS!!!

Ouch!

> > 
> > (The read is invalid since the spte has been freed.)
> > 
> > smp_mb
>
> Right. In that case a compiler barrier is sufficient (stores are not 
> reordered with earlier loads on x86).

I think it's customary not to depend on arch-specific memory barrier
behaviour (but I don't have anything to base this on, just a feeling).

-- 
error compiling committee.c: too many arguments to function


