[12/24] kvm: x86/kvm: RCU dereference tdp mmu page table links
diff mbox series

Message ID 20210112181041.356734-13-bgardon@google.com
State New, archived
Headers show
Series
  • Allow parallel page faults with TDP MMU
Related show

Commit Message

Ben Gardon Jan. 12, 2021, 6:10 p.m. UTC
In order to protect TDP MMU PT memory with RCU, ensure that page table
links are properly rcu_dereferenced.

Reviewed-by: Peter Feiner <pfeiner@google.com>

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/mmu/tdp_iter.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Comments

Sean Christopherson Jan. 22, 2021, 6:32 p.m. UTC | #1
On Tue, Jan 12, 2021, Ben Gardon wrote:
> In order to protect TDP MMU PT memory with RCU, ensure that page table
> links are properly rcu_dereferenced.
> 
> Reviewed-by: Peter Feiner <pfeiner@google.com>
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  arch/x86/kvm/mmu/tdp_iter.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index 87b7e16911db..82855613ffa0 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -49,6 +49,8 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
>   */
>  u64 *spte_to_child_pt(u64 spte, int level)
>  {
> +	u64 *child_pt;
> +
>  	/*
>  	 * There's no child entry if this entry isn't present or is a
>  	 * last-level entry.
> @@ -56,7 +58,9 @@ u64 *spte_to_child_pt(u64 spte, int level)
>  	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
>  		return NULL;
>  
> -	return __va(spte_to_pfn(spte) << PAGE_SHIFT);
> +	child_pt = __va(spte_to_pfn(spte) << PAGE_SHIFT);
> +
> +	return rcu_dereference(child_pt);

This is what bugs me the most about the RCU usage.  We're reaping the functional
benefits of RCU without doing the grunt work to truly RCU-ify the TDP MMU.  The
above rcu_dereference() barely scratches the surface of what's being protected
by RCU.  There are already multiple mechanisms that protect the page tables,
throwing RCU into the mix without fully integrating RCU makes for simple code
and avoids reinventing the wheel (big thumbs up), but ends up adding complexity
to an already complex system.  E.g. the lockless walks in the old MMU are
complex on the surface, but I find them easier to think through because they
explicitly rely on the same mechanism (remote TLB flush) that is used to protect
guest usage of the page tables.

Ideally, I think struct kvm_mmu_page's 'u64 *spt' would be annotated with __rcu,
as that would provide a high level of enforcement and would also highlight where
we're using other mechanisms to ensure correctness.  E.g. dereferencing root->spt
in kvm_tdp_mmu_get_vcpu_root_hpa() relies on the root being pinned by
get_tdp_mmu_vcpu_root(), and _that_ in turn relies on holding the rwlock for write.
Unfortunately, since kvm_mmu_page is shared with the old mmu, annotating ->spt
that way doesn't work well.  We could employ a union to make it work, but that'd
probably do more harm than good.

The middle ground would be to annotate pt_path and sptep in struct tdp_iter.
That gets a decent chunk of the enforcement and also helps highlight what's
being protected with RCU.  Assuming we end up going with RCU, I think this
single rcu_dereference should be replaced with something like the below patch.

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 82855613ffa0..e000642d938d 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
 {
        iter->sptep = iter->pt_path[iter->level - 1] +
                SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
 }

 static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -34,7 +34,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
        iter->root_level = root_level;
        iter->min_level = min_level;
        iter->level = root_level;
-       iter->pt_path[iter->level - 1] = root_pt;
+       iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;

        iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
        tdp_iter_refresh_sptep(iter);
@@ -47,9 +47,9 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
  * address of the child page table referenced by the SPTE. Returns null if
  * there is no such entry.
  */
-u64 *spte_to_child_pt(u64 spte, int level)
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
 {
-       u64 *child_pt;
+       tdp_ptep_t child_pt;

        /*
         * There's no child entry if this entry isn't present or is a
@@ -58,9 +58,9 @@ u64 *spte_to_child_pt(u64 spte, int level)
        if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
                return NULL;

-       child_pt = __va(spte_to_pfn(spte) << PAGE_SHIFT);
+       child_pt = (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);

-       return rcu_dereference(child_pt);
+       return child_pt;
 }

 /*
@@ -69,7 +69,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
  */
 static bool try_step_down(struct tdp_iter *iter)
 {
-       u64 *child_pt;
+       tdp_ptep_t child_pt;

        if (iter->level == iter->min_level)
                return false;
@@ -78,7 +78,7 @@ static bool try_step_down(struct tdp_iter *iter)
         * Reread the SPTE before stepping down to avoid traversing into page
         * tables that are no longer linked from this entry.
         */
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));

        child_pt = spte_to_child_pt(iter->old_spte, iter->level);
        if (!child_pt)
@@ -112,7 +112,7 @@ static bool try_step_side(struct tdp_iter *iter)
        iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
        iter->goal_gfn = iter->gfn;
        iter->sptep++;
-       iter->old_spte = READ_ONCE(*iter->sptep);
+       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));

        return true;
 }
@@ -175,11 +175,11 @@ void tdp_iter_refresh_walk(struct tdp_iter *iter)
        if (iter->gfn > goal_gfn)
                goal_gfn = iter->gfn;

-       tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
+       tdp_iter_start(iter, rcu_dereference(iter->pt_path[iter->root_level - 1]),
                       iter->root_level, iter->min_level, goal_gfn);
 }

-u64 *tdp_iter_root_pt(struct tdp_iter *iter)
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
 {
        return iter->pt_path[iter->root_level - 1];
 }
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 47170d0dc98e..bf882dab8ec5 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,6 +7,8 @@

 #include "mmu.h"

+typedef u64 __rcu *tdp_ptep_t;
+
 /*
  * A TDP iterator performs a pre-order walk over a TDP paging structure.
  */
@@ -17,9 +19,9 @@ struct tdp_iter {
         */
        gfn_t goal_gfn;
        /* Pointers to the page tables traversed to reach the current SPTE */
-       u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+       tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
        /* A pointer to the current SPTE */
-       u64 *sptep;
+       tdp_ptep_t sptep;
        /* The lowest GFN mapped by the current SPTE */
        gfn_t gfn;
        /* The level of the root page given to the iterator */
@@ -49,12 +51,12 @@ struct tdp_iter {
 #define for_each_tdp_pte(iter, root, root_level, start, end) \
        for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)

-u64 *spte_to_child_pt(u64 pte, int level);
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);

 void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
                    int min_level, gfn_t goal_gfn);
 void tdp_iter_next(struct tdp_iter *iter);
 void tdp_iter_refresh_walk(struct tdp_iter *iter);
-u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);

 #endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 45160ff84e91..27b850904230 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -509,7 +509,7 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                           struct tdp_iter *iter,
                                           u64 new_spte)
 {
-       u64 *root_pt = tdp_iter_root_pt(iter);
+       tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);
Ben Gardon Jan. 26, 2021, 6:17 p.m. UTC | #2
On Fri, Jan 22, 2021 at 10:32 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jan 12, 2021, Ben Gardon wrote:
> > In order to protect TDP MMU PT memory with RCU, ensure that page table
> > links are properly rcu_dereferenced.
> >
> > Reviewed-by: Peter Feiner <pfeiner@google.com>
> >
> > Signed-off-by: Ben Gardon <bgardon@google.com>
> > ---
> >  arch/x86/kvm/mmu/tdp_iter.c | 6 +++++-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> > index 87b7e16911db..82855613ffa0 100644
> > --- a/arch/x86/kvm/mmu/tdp_iter.c
> > +++ b/arch/x86/kvm/mmu/tdp_iter.c
> > @@ -49,6 +49,8 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
> >   */
> >  u64 *spte_to_child_pt(u64 spte, int level)
> >  {
> > +     u64 *child_pt;
> > +
> >       /*
> >        * There's no child entry if this entry isn't present or is a
> >        * last-level entry.
> > @@ -56,7 +58,9 @@ u64 *spte_to_child_pt(u64 spte, int level)
> >       if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
> >               return NULL;
> >
> > -     return __va(spte_to_pfn(spte) << PAGE_SHIFT);
> > +     child_pt = __va(spte_to_pfn(spte) << PAGE_SHIFT);
> > +
> > +     return rcu_dereference(child_pt);
>
> This is what bugs me the most about the RCU usage.  We're reaping the functional
> benefits of RCU without doing the grunt work to truly RCU-ify the TDP MMU.  The
> above rcu_dereference() barely scratches the surface of what's being protected
> by RCU.  There are already multiple mechanisms that protect the page tables,
> throwing RCU into the mix without fully integrating RCU makes for simple code
> and avoids reinventing the wheel (big thumbs up), but ends up adding complexity
> to an already complex system.  E.g. the lockless walks in the old MMU are
> complex on the surface, but I find them easier to think through because they
> explicitly rely on the same mechanism (remote TLB flush) that is used to protect
> guest usage of the page tables.
>
> Ideally, I think struct kvm_mmu_page's 'u64 *spt' would be annotated with __rcu,
> as that would provide a high level of enforcement and would also highlight where
> we're using other mechanisms to ensure correctness.  E.g. dereferencing root->spt
> in kvm_tdp_mmu_get_vcpu_root_hpa() relies on the root being pinned by
> get_tdp_mmu_vcpu_root(), and _that_ in turn relies on holding the rwlock for write.
> Unfortunately, since kvm_mmu_page is shared with the old mmu, annotating ->spt
> that way doesn't work well.  We could employ a union to make it work, but that'd
> probably do more harm than good.
>
> The middle ground would be to annotate pt_path and sptep in struct tdp_iter.
> That gets a decent chunk of the enforcement and also helps highlight what's
> being protected with RCU.  Assuming we end up going with RCU, I think this
> single rcu_dereference should be replaced with something like the below patch.

Thank you for explaining your thought process here. You make an
excellent point that this results in code that is substantially less
self-documenting than it could be. It seems like your patch below will
substantially improve the automated checker's ability to validate the
RCU usage as well. I'll happily include it in the next version of this
series. I appreciate the way that the patch below makes all references
to the entries of the page table RCU dereferences. Not doing those
dereferences was certainly an error in the original patch.

>
> diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
> index 82855613ffa0..e000642d938d 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.c
> +++ b/arch/x86/kvm/mmu/tdp_iter.c
> @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
>  {
>         iter->sptep = iter->pt_path[iter->level - 1] +
>                 SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
> -       iter->old_spte = READ_ONCE(*iter->sptep);
> +       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
>  }
>
>  static gfn_t round_gfn_for_level(gfn_t gfn, int level)
> @@ -34,7 +34,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
>         iter->root_level = root_level;
>         iter->min_level = min_level;
>         iter->level = root_level;
> -       iter->pt_path[iter->level - 1] = root_pt;
> +       iter->pt_path[iter->level - 1] = (tdp_ptep_t)root_pt;
>
>         iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
>         tdp_iter_refresh_sptep(iter);
> @@ -47,9 +47,9 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
>   * address of the child page table referenced by the SPTE. Returns null if
>   * there is no such entry.
>   */
> -u64 *spte_to_child_pt(u64 spte, int level)
> +tdp_ptep_t spte_to_child_pt(u64 spte, int level)
>  {
> -       u64 *child_pt;
> +       tdp_ptep_t child_pt;
>
>         /*
>          * There's no child entry if this entry isn't present or is a
> @@ -58,9 +58,9 @@ u64 *spte_to_child_pt(u64 spte, int level)
>         if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
>                 return NULL;
>
> -       child_pt = __va(spte_to_pfn(spte) << PAGE_SHIFT);
> +       child_pt = (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
>
> -       return rcu_dereference(child_pt);
> +       return child_pt;
>  }
>
>  /*
> @@ -69,7 +69,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
>   */
>  static bool try_step_down(struct tdp_iter *iter)
>  {
> -       u64 *child_pt;
> +       tdp_ptep_t child_pt;
>
>         if (iter->level == iter->min_level)
>                 return false;
> @@ -78,7 +78,7 @@ static bool try_step_down(struct tdp_iter *iter)
>          * Reread the SPTE before stepping down to avoid traversing into page
>          * tables that are no longer linked from this entry.
>          */
> -       iter->old_spte = READ_ONCE(*iter->sptep);
> +       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
>
>         child_pt = spte_to_child_pt(iter->old_spte, iter->level);
>         if (!child_pt)
> @@ -112,7 +112,7 @@ static bool try_step_side(struct tdp_iter *iter)
>         iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
>         iter->goal_gfn = iter->gfn;
>         iter->sptep++;
> -       iter->old_spte = READ_ONCE(*iter->sptep);
> +       iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
>
>         return true;
>  }
> @@ -175,11 +175,11 @@ void tdp_iter_refresh_walk(struct tdp_iter *iter)
>         if (iter->gfn > goal_gfn)
>                 goal_gfn = iter->gfn;
>
> -       tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
> +       tdp_iter_start(iter, rcu_dereference(iter->pt_path[iter->root_level - 1]),
>                        iter->root_level, iter->min_level, goal_gfn);
>  }
>
> -u64 *tdp_iter_root_pt(struct tdp_iter *iter)
> +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter)
>  {
>         return iter->pt_path[iter->root_level - 1];
>  }
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index 47170d0dc98e..bf882dab8ec5 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -7,6 +7,8 @@
>
>  #include "mmu.h"
>
> +typedef u64 __rcu *tdp_ptep_t;
> +
>  /*
>   * A TDP iterator performs a pre-order walk over a TDP paging structure.
>   */
> @@ -17,9 +19,9 @@ struct tdp_iter {
>          */
>         gfn_t goal_gfn;
>         /* Pointers to the page tables traversed to reach the current SPTE */
> -       u64 *pt_path[PT64_ROOT_MAX_LEVEL];
> +       tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
>         /* A pointer to the current SPTE */
> -       u64 *sptep;
> +       tdp_ptep_t sptep;
>         /* The lowest GFN mapped by the current SPTE */
>         gfn_t gfn;
>         /* The level of the root page given to the iterator */
> @@ -49,12 +51,12 @@ struct tdp_iter {
>  #define for_each_tdp_pte(iter, root, root_level, start, end) \
>         for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
>
> -u64 *spte_to_child_pt(u64 pte, int level);
> +tdp_ptep_t spte_to_child_pt(u64 pte, int level);
>
>  void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
>                     int min_level, gfn_t goal_gfn);
>  void tdp_iter_next(struct tdp_iter *iter);
>  void tdp_iter_refresh_walk(struct tdp_iter *iter);
> -u64 *tdp_iter_root_pt(struct tdp_iter *iter);
> +tdp_ptep_t tdp_iter_root_pt(struct tdp_iter *iter);
>
>  #endif /* __KVM_X86_MMU_TDP_ITER_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 45160ff84e91..27b850904230 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -509,7 +509,7 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
>                                            struct tdp_iter *iter,
>                                            u64 new_spte)
>  {
> -       u64 *root_pt = tdp_iter_root_pt(iter);
> +       tdp_ptep_t root_pt = tdp_iter_root_pt(iter);
>         struct kvm_mmu_page *root = sptep_to_sp(root_pt);
>         int as_id = kvm_mmu_page_as_id(root);
>
>

Patch
diff mbox series

diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 87b7e16911db..82855613ffa0 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -49,6 +49,8 @@  void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
  */
 u64 *spte_to_child_pt(u64 spte, int level)
 {
+	u64 *child_pt;
+
 	/*
 	 * There's no child entry if this entry isn't present or is a
 	 * last-level entry.
@@ -56,7 +58,9 @@  u64 *spte_to_child_pt(u64 spte, int level)
 	if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
 		return NULL;
 
-	return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+	child_pt = __va(spte_to_pfn(spte) << PAGE_SHIFT);
+
+	return rcu_dereference(child_pt);
 }
 
 /*