* [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
@ 2021-03-27  1:51 Boris Ostrovsky
  2021-03-29  9:56 ` Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Boris Ostrovsky @ 2021-03-27  1:51 UTC (permalink / raw)
  To: xen-devel
  Cc: jbeulich, andrew.cooper3, roger.pau, wl, boris.ostrovsky,
	stephen.s.brennan, iwj

Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
intended to protect periodic timers during VCPU migration. Since such
migration is an infrequent event, no performance impact was expected.

Unfortunately this turned out not to be the case: on a fairly large
guest (92 VCPUs) we've observed as much as a 40% TPCC performance
regression with some guest kernels. Further investigation pointed to
the pt_migrate read lock taken in pt_update_irq() as the largest contributor
to this regression. With a large number of VCPUs and a large number of VMEXITs
(from where pt_update_irq() is always called), the update of an atomic in
read_lock() is thought to be the main cause.
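
For illustration -- this is not Xen's actual rwlock implementation, and
WRITER_MASK/read_lock_slowpath() are made-up names -- a typical rwlock
read-side fast path boils down to an atomic read-modify-write on a word
shared by every reader:

    static inline void read_lock(rwlock_t *lock)
    {
        /* Every reader atomically bumps the same shared counter ... */
        uint32_t cnts = atomic_add_return(1, &lock->cnts);

        /* ... and only takes a slow path while a writer is active. */
        if ( cnts & WRITER_MASK )
            read_lock_slowpath(lock);
    }

With every vCPU doing this on every VMEXIT, the cache line holding the
per-domain counter bounces between all the pCPUs running the guest, and
the atomic update itself serializes them.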

Stephen Brennan analyzed the locking pattern and classified lock users as
follows:

1. Functions which read (maybe write) all periodic_time instances
attached to a particular vCPU. These are functions which use pt_vcpu_lock()
after the commit, such as pt_restore_timer(), pt_save_timer(), etc.
2. Functions which want to modify a particular periodic_time object.
These guys lock whichever vCPU the periodic_time is attached to, but
since the vCPU could be modified without holding any lock, they are
vulnerable to the bug. Functions in this group use pt_lock(), such as
pt_timer_fn() or destroy_periodic_time().
3. Functions which not only want to modify the periodic_time, but also
would like to modify the vcpu field. These are create_periodic_time()
or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
but we can't simply hold 2 vcpu locks due to the deadlock risk.

Roger Monné then pointed out that group 1 functions don't really need
to hold the pt_migrate rwlock, and that group 3 should be hardened by
holding the appropriate vcpu's tm_lock when adding or deleting a timer
from its list.
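
In terms of the helpers touched by the patch below, the resulting
locking discipline then roughly becomes (summary only, using the
post-patch helper names):

    /*
     * Group 1: pt_vcpu_lock()/pt_vcpu_unlock()
     *     -> spin_lock(&v->arch.hvm.tm_lock) only
     * Group 2: pt_lock()/pt_unlock()
     *     -> read_lock(&...->pt_migrate), then spin_lock on the
     *        (re-fetched) pt->vcpu's tm_lock
     * Group 3: pt_adjust_vcpu()
     *     -> write_lock(&...->pt_migrate), plus tm_lock around each
     *        list_del()/list_add(); create_periodic_time() needs only
     *        tm_lock around its list_add()
     */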

Suggested-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Suggested-by: Roger Pau Monne <roger.pau@citrix.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
v2: Drop per-periodic_time spinlock and keep pt_migrate rwlock (and thus
    change patch subject)

 xen/arch/x86/hvm/vpt.c        | 40 +++++++++++++++++++++++++++++++---------
 xen/include/asm-x86/hvm/vpt.h |  8 ++++----
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index 4c2afe2e9154..893641f20e1c 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -153,32 +153,43 @@ static int pt_irq_masked(struct periodic_time *pt)
     return 1;
 }
 
+/*
+ * Functions which read (maybe write) all periodic_time instances
+ * attached to a particular vCPU use these locking helpers.
+ *
+ * Such users are explicitly forbidden from changing the value of the
+ * pt->vcpu field, because another thread holding the pt_migrate lock
+ * may already be spinning waiting for your vcpu lock.
+ */
 static void pt_vcpu_lock(struct vcpu *v)
 {
-    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
     spin_lock(&v->arch.hvm.tm_lock);
 }
 
 static void pt_vcpu_unlock(struct vcpu *v)
 {
     spin_unlock(&v->arch.hvm.tm_lock);
-    read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
 }
 
+/*
+ * Functions which want to modify a particular periodic_time object
+ * use these locking helpers.
+ *
+ * These users lock whichever vCPU the periodic_time is attached to,
+ * but since the vCPU could be modified without holding any lock, they
+ * need to take an additional lock that protects against pt->vcpu
+ * changing.
+ */
 static void pt_lock(struct periodic_time *pt)
 {
-    /*
-     * We cannot use pt_vcpu_lock here, because we need to acquire the
-     * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
-     * else we might be using a stale value of pt->vcpu.
-     */
     read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
     spin_lock(&pt->vcpu->arch.hvm.tm_lock);
 }
 
 static void pt_unlock(struct periodic_time *pt)
 {
-    pt_vcpu_unlock(pt->vcpu);
+    spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
+    read_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
 }
 
 static void pt_process_missed_ticks(struct periodic_time *pt)
@@ -543,8 +554,10 @@ void create_periodic_time(
     pt->cb = cb;
     pt->priv = data;
 
+    pt_vcpu_lock(pt->vcpu);
     pt->on_list = 1;
     list_add(&pt->list, &v->arch.hvm.tm_list);
+    pt_vcpu_unlock(pt->vcpu);
 
     init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
     set_timer(&pt->timer, pt->scheduled);
@@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
         return;
 
     write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+
+    pt_vcpu_lock(pt->vcpu);
+    if ( pt->on_list )
+        list_del(&pt->list);
+    pt_vcpu_unlock(pt->vcpu);
+
     pt->vcpu = v;
+
+    pt_vcpu_lock(pt->vcpu);
     if ( pt->on_list )
     {
-        list_del(&pt->list);
         list_add(&pt->list, &v->arch.hvm.tm_list);
         migrate_timer(&pt->timer, v->processor);
     }
+    pt_vcpu_unlock(pt->vcpu);
+
     write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
 }
 
diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
index 39d26cbda496..f3c2a439630a 100644
--- a/xen/include/asm-x86/hvm/vpt.h
+++ b/xen/include/asm-x86/hvm/vpt.h
@@ -129,10 +129,10 @@ struct pl_time {    /* platform time */
     struct HPETState vhpet;
     struct PMTState  vpmt;
     /*
-     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
-     * mode in order to prevent the vcpu field of periodic_time from changing.
-     * Lock must be taken in write mode when changes to the vcpu field are
-     * performed, as it allows exclusive access to all the timers of a domain.
+     * Functions which want to modify the vcpu field of the vpt need to
+     * hold the global lock (pt_migrate) in write mode together with the
+     * per-vcpu locks of the lists being modified. Note that two vcpu
+     * locks cannot be held at the same time to avoid a deadlock.
      */
     rwlock_t pt_migrate;
     /* guest_time = Xen sys time + stime_offset */
-- 
1.8.3.1




* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-27  1:51 [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases Boris Ostrovsky
@ 2021-03-29  9:56 ` Jan Beulich
  2021-03-29 10:40   ` Roger Pau Monné
  2021-03-29 15:04   ` Boris Ostrovsky
  2021-03-29 10:31 ` Roger Pau Monné
  2021-03-29 17:44 ` Stephen Brennan
  2 siblings, 2 replies; 10+ messages in thread
From: Jan Beulich @ 2021-03-29  9:56 UTC (permalink / raw)
  To: Boris Ostrovsky, roger.pau
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel

On 27.03.2021 02:51, Boris Ostrovsky wrote:
> Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
> vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
> intended to protect periodic timers during VCPU migration. Since such
> migration is an infrequent event, no performance impact was expected.
> 
> Unfortunately this turned out not to be the case: on a fairly large
> guest (92 VCPUs) we've observed as much as a 40% TPCC performance
> regression with some guest kernels. Further investigation pointed to
> the pt_migrate read lock taken in pt_update_irq() as the largest contributor
> to this regression. With a large number of VCPUs and a large number of VMEXITs
> (from where pt_update_irq() is always called), the update of an atomic in
> read_lock() is thought to be the main cause.
> 
> Stephen Brennan analyzed the locking pattern and classified lock users as
> follows:
> 
> 1. Functions which read (maybe write) all periodic_time instances
> attached to a particular vCPU. These are functions which use pt_vcpu_lock()
> after the commit, such as pt_restore_timer(), pt_save_timer(), etc.
> 2. Functions which want to modify a particular periodic_time object.
> These guys lock whichever vCPU the periodic_time is attached to, but
> since the vCPU could be modified without holding any lock, they are
> vulnerable to the bug. Functions in this group use pt_lock(), such as
> pt_timer_fn() or destroy_periodic_time().
> 3. Functions which not only want to modify the periodic_time, but also
> would like to modify the vcpu field. These are create_periodic_time()
> or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
> but we can't simply hold 2 vcpu locks due to the deadlock risk.
> 
> Roger Monné then pointed out that group 1 functions don't really need
> to hold the pt_migrate rwlock, and that group 3 should be hardened by
> holding the appropriate vcpu's tm_lock when adding or deleting a timer
> from its list.

I'm struggling some with the latter aspect: Is this to mean there is
something wrong right now? Or does "harden" really mean "just to be
on the safe side" here? In the latter case I'm not really certain we
should add such extra locking, in particular if any of this might be
on a frequently executed code path. Leaving respective comments there
instead might be an option.

> @@ -543,8 +554,10 @@ void create_periodic_time(
>      pt->cb = cb;
>      pt->priv = data;
>  
> +    pt_vcpu_lock(pt->vcpu);
>      pt->on_list = 1;
>      list_add(&pt->list, &v->arch.hvm.tm_list);
> +    pt_vcpu_unlock(pt->vcpu);

I think it would be better (not just code generation wise) to use v
here.
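
I.e. (sketch):

    pt_vcpu_lock(v);
    pt->on_list = 1;
    list_add(&pt->list, &v->arch.hvm.tm_list);
    pt_vcpu_unlock(v);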

> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>          return;
>  
>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
> +
> +    pt_vcpu_lock(pt->vcpu);
> +    if ( pt->on_list )
> +        list_del(&pt->list);
> +    pt_vcpu_unlock(pt->vcpu);

While these two obviously can't use v, ...

>      pt->vcpu = v;
> +
> +    pt_vcpu_lock(pt->vcpu);
>      if ( pt->on_list )
>      {
> -        list_del(&pt->list);
>          list_add(&pt->list, &v->arch.hvm.tm_list);
>          migrate_timer(&pt->timer, v->processor);
>      }
> +    pt_vcpu_unlock(pt->vcpu);

... these two again could (and imo should), and ...

>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);

... really this and its counterpart better would do so, too (albeit
perhaps in a separate patch).

While I see that pt_adjust_global_vcpu_target() (the only caller of
pt_adjust_vcpu()) already avoids calling here when the vcpu there
doesn't really change, I also think that with all this extra locking
the function now would better short-circuit the case of
pt->vcpu == v upon entry (or more precisely once the write lock was
acquired).
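
Something along these lines perhaps (untested sketch):

    write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);

    if ( pt->vcpu == v )
    {
        write_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
        return;
    }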

> --- a/xen/include/asm-x86/hvm/vpt.h
> +++ b/xen/include/asm-x86/hvm/vpt.h
> @@ -129,10 +129,10 @@ struct pl_time {    /* platform time */
>      struct HPETState vhpet;
>      struct PMTState  vpmt;
>      /*
> -     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
> -     * mode in order to prevent the vcpu field of periodic_time from changing.
> -     * Lock must be taken in write mode when changes to the vcpu field are
> -     * performed, as it allows exclusive access to all the timers of a domain.
> +     * Functions which want to modify the vcpu field of the vpt need to
> +     * hold the global lock (pt_migrate) in write mode together with the
> +     * per-vcpu locks of the lists being modified. Note that two vcpu
> +     * locks cannot be held at the same time to avoid a deadlock.
>       */
>      rwlock_t pt_migrate;

It looks to me as if some information got lost here, most notably the
scope of the write lock and (somewhat related) what holding the lock
in read mode protects against.

Jan



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-27  1:51 [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases Boris Ostrovsky
  2021-03-29  9:56 ` Jan Beulich
@ 2021-03-29 10:31 ` Roger Pau Monné
  2021-03-29 17:44 ` Stephen Brennan
  2 siblings, 0 replies; 10+ messages in thread
From: Roger Pau Monné @ 2021-03-29 10:31 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: xen-devel, jbeulich, andrew.cooper3, wl, stephen.s.brennan, iwj

On Fri, Mar 26, 2021 at 09:51:06PM -0400, Boris Ostrovsky wrote:
> Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
> vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
> intended to protect periodic timers during VCPU migration. Since such
> migration is an infrequent event, no performance impact was expected.
> 
> Unfortunately this turned out not to be the case: on a fairly large
> guest (92 VCPUs) we've observed as much as a 40% TPCC performance
> regression with some guest kernels. Further investigation pointed to
> the pt_migrate read lock taken in pt_update_irq() as the largest contributor
> to this regression. With a large number of VCPUs and a large number of VMEXITs
> (from where pt_update_irq() is always called), the update of an atomic in
> read_lock() is thought to be the main cause.
> 
> Stephen Brennan analyzed the locking pattern and classified lock users as
> follows:
> 
> 1. Functions which read (maybe write) all periodic_time instances
> attached to a particular vCPU. These are functions which use pt_vcpu_lock()
> after the commit, such as pt_restore_timer(), pt_save_timer(), etc.
> 2. Functions which want to modify a particular periodic_time object.
> These guys lock whichever vCPU the periodic_time is attached to, but
> since the vCPU could be modified without holding any lock, they are
> vulnerable to the bug. Functions in this group use pt_lock(), such as
> pt_timer_fn() or destroy_periodic_time().
> 3. Functions which not only want to modify the periodic_time, but also
> would like to modify the vcpu field. These are create_periodic_time()
> or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
> but we can't simply hold 2 vcpu locks due to the deadlock risk.
> 
> Roger Monné then pointed out that group 1 functions don't really need

Roger alone is fine, or else it would have to be Roger Pau (Monné is
my second surname).

> to hold the pt_migrate rwlock, and that group 3 should be hardened by
> holding the appropriate vcpu's tm_lock when adding or deleting a timer
> from its list.
> 
> Suggested-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> Suggested-by: Roger Pau Monne <roger.pau@citrix.com>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Some nits below.

> ---
> v2: Drop per-periodic_time spinlock and keep pt_migrate rwlock (and thus
>     change patch subject)
> 
>  xen/arch/x86/hvm/vpt.c        | 40 +++++++++++++++++++++++++++++++---------
>  xen/include/asm-x86/hvm/vpt.h |  8 ++++----
>  2 files changed, 35 insertions(+), 13 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
> index 4c2afe2e9154..893641f20e1c 100644
> --- a/xen/arch/x86/hvm/vpt.c
> +++ b/xen/arch/x86/hvm/vpt.c
> @@ -153,32 +153,43 @@ static int pt_irq_masked(struct periodic_time *pt)
>      return 1;
>  }
>  
> +/*
> + * Functions which read (maybe write) all periodic_time instances
> + * attached to a particular vCPU use these locking helpers.

I would replace 'these' with pt_vcpu_{un}lock, to make it clearer.

> + *
> + * Such users are explicitly forbidden from changing the value of the
> + * pt->vcpu field, because another thread holding the pt_migrate lock
> + * may already be spinning waiting for your vcpu lock.
> + */
>  static void pt_vcpu_lock(struct vcpu *v)
>  {
> -    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
>      spin_lock(&v->arch.hvm.tm_lock);
>  }
>  
>  static void pt_vcpu_unlock(struct vcpu *v)
>  {
>      spin_unlock(&v->arch.hvm.tm_lock);
> -    read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
> +/*
> + * Functions which want to modify a particular periodic_time object
> + * use these locking helpers.

Same here, I would use pt_{un}lock instead of 'these' to make it
clearer.

> + *
> + * These users lock whichever vCPU the periodic_time is attached to,
> + * but since the vCPU could be modified without holding any lock, they
> + * need to take an additional lock that protects against pt->vcpu
> + * changing.
> + */
>  static void pt_lock(struct periodic_time *pt)
>  {
> -    /*
> -     * We cannot use pt_vcpu_lock here, because we need to acquire the
> -     * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
> -     * else we might be using a stale value of pt->vcpu.
> -     */
>      read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>      spin_lock(&pt->vcpu->arch.hvm.tm_lock);
>  }
>  
>  static void pt_unlock(struct periodic_time *pt)
>  {
> -    pt_vcpu_unlock(pt->vcpu);
> +    spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
> +    read_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
>  static void pt_process_missed_ticks(struct periodic_time *pt)
> @@ -543,8 +554,10 @@ void create_periodic_time(
>      pt->cb = cb;
>      pt->priv = data;
>  
> +    pt_vcpu_lock(pt->vcpu);
>      pt->on_list = 1;
>      list_add(&pt->list, &v->arch.hvm.tm_list);
> +    pt_vcpu_unlock(pt->vcpu);
>  
>      init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
>      set_timer(&pt->timer, pt->scheduled);
> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>          return;
>  
>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
> +
> +    pt_vcpu_lock(pt->vcpu);
> +    if ( pt->on_list )
> +        list_del(&pt->list);
> +    pt_vcpu_unlock(pt->vcpu);
> +
>      pt->vcpu = v;
> +
> +    pt_vcpu_lock(pt->vcpu);
>      if ( pt->on_list )
>      {
> -        list_del(&pt->list);
>          list_add(&pt->list, &v->arch.hvm.tm_list);
>          migrate_timer(&pt->timer, v->processor);
>      }
> +    pt_vcpu_unlock(pt->vcpu);
> +
>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
> diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
> index 39d26cbda496..f3c2a439630a 100644
> --- a/xen/include/asm-x86/hvm/vpt.h
> +++ b/xen/include/asm-x86/hvm/vpt.h
> @@ -129,10 +129,10 @@ struct pl_time {    /* platform time */
>      struct HPETState vhpet;
>      struct PMTState  vpmt;
>      /*
> -     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
> -     * mode in order to prevent the vcpu field of periodic_time from changing.
> -     * Lock must be taken in write mode when changes to the vcpu field are
> -     * performed, as it allows exclusive access to all the timers of a domain.
> +     * Functions which want to modify the vcpu field of the vpt need to
> +     * hold the global lock (pt_migrate) in write mode together with the
> +     * per-vcpu locks of the lists being modified. Note that two vcpu
> +     * locks cannot be held at the same time to avoid a deadlock.

I would maybe word this as:

    /*
     * Functions which want to modify the vcpu field of the vpt need
     * to hold the global lock (pt_migrate) in write mode together
     * with the per-vcpu locks of the lists being modified. Functions
     * that want to lock a periodic_time that's possibly on a
     * different vCPU list need to take the lock in read mode first in
     * order to prevent the vcpu field of periodic_time from
     * changing.
     *
     * Note that two vcpu locks cannot be held at the same time to
     * avoid a deadlock.
     */

Thanks, Roger.



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29  9:56 ` Jan Beulich
@ 2021-03-29 10:40   ` Roger Pau Monné
  2021-03-29 11:01     ` Jan Beulich
  2021-03-29 15:04   ` Boris Ostrovsky
  1 sibling, 1 reply; 10+ messages in thread
From: Roger Pau Monné @ 2021-03-29 10:40 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Boris Ostrovsky, andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel

On Mon, Mar 29, 2021 at 11:56:56AM +0200, Jan Beulich wrote:
> On 27.03.2021 02:51, Boris Ostrovsky wrote:
> > Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
> > vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
> > intended to protect periodic timers during VCPU migration. Since such
> > migration is an infrequent event, no performance impact was expected.
> > 
> > Unfortunately this turned out not to be the case: on a fairly large
> > guest (92 VCPUs) we've observed as much as a 40% TPCC performance
> > regression with some guest kernels. Further investigation pointed to
> > the pt_migrate read lock taken in pt_update_irq() as the largest contributor
> > to this regression. With a large number of VCPUs and a large number of VMEXITs
> > (from where pt_update_irq() is always called), the update of an atomic in
> > read_lock() is thought to be the main cause.
> > 
> > Stephen Brennan analyzed the locking pattern and classified lock users as
> > follows:
> > 
> > 1. Functions which read (maybe write) all periodic_time instances
> > attached to a particular vCPU. These are functions which use pt_vcpu_lock()
> > after the commit, such as pt_restore_timer(), pt_save_timer(), etc.
> > 2. Functions which want to modify a particular periodic_time object.
> > These guys lock whichever vCPU the periodic_time is attached to, but
> > since the vCPU could be modified without holding any lock, they are
> > vulnerable to the bug. Functions in this group use pt_lock(), such as
> > pt_timer_fn() or destroy_periodic_time().
> > 3. Functions which not only want to modify the periodic_time, but also
> > would like to modify the vcpu field. These are create_periodic_time()
> > or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
> > but we can't simply hold 2 vcpu locks due to the deadlock risk.
> > 
> > Roger Monné then pointed out that group 1 functions don't really need
> > to hold the pt_migrate rwlock, and that group 3 should be hardened by
> > holding the appropriate vcpu's tm_lock when adding or deleting a timer
> > from its list.
> 
> I'm struggling some with the latter aspect: Is this to mean there is
> something wrong right now?

There's nothing wrong right now AFAICT, because per-vcpu users (i.e.
type 1) also hold the rw lock in read mode when iterating over the
per-vcpu list.

> Or does "harden" really mean "just to be
> on the safe side" here?

If the per-domain rw lock is no longer read-locked by type 1 users
then type 2 and type 3 users need to make sure the per-vcpu lock is
taken before adding or removing items from the per-vcpu lists, or else
a type 1 user could see list corruption as a result of modifications
done by type 2 or 3 without holding the per-vcpu lock.
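
For instance, with the rwlock no longer read-locked by type 1 users and
no tm_lock taken by a type 3 user, a hypothetical interleaving could be:

    CPU A: pt_save_timer() (type 1)   CPU B: create_periodic_time() (type 3)
    ----------------------------------------------------------------------
    spin_lock(&v->arch.hvm.tm_lock);
    list_for_each_entry ( pt, ... )
                                      pt->on_list = 1;
                                      list_add(&pt->list,
                                               &v->arch.hvm.tm_list);
        /* iteration races with the unlocked list_add() */
    spin_unlock(&v->arch.hvm.tm_lock);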

This again makes the locking logic more complicated than it was, so I
will try to get back with my vpt improvements so we can get rid of all
this at least on unstable. Hopefully the comments are clear enough to
follow the logic.

Roger.



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29 10:40   ` Roger Pau Monné
@ 2021-03-29 11:01     ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2021-03-29 11:01 UTC (permalink / raw)
  To: Roger Pau Monné, Boris Ostrovsky
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel

On 29.03.2021 12:40, Roger Pau Monné wrote:
> On Mon, Mar 29, 2021 at 11:56:56AM +0200, Jan Beulich wrote:
>> On 27.03.2021 02:51, Boris Ostrovsky wrote:
>>> Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
>>> vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
>>> intended to protect periodic timers during VCPU migration. Since such
>>> migration is an infrequent event, no performance impact was expected.
>>>
>>> Unfortunately this turned out not to be the case: on a fairly large
>>> guest (92 VCPUs) we've observed as much as a 40% TPCC performance
>>> regression with some guest kernels. Further investigation pointed to
>>> the pt_migrate read lock taken in pt_update_irq() as the largest contributor
>>> to this regression. With a large number of VCPUs and a large number of VMEXITs
>>> (from where pt_update_irq() is always called), the update of an atomic in
>>> read_lock() is thought to be the main cause.
>>>
>>> Stephen Brennan analyzed the locking pattern and classified lock users as
>>> follows:
>>>
>>> 1. Functions which read (maybe write) all periodic_time instances
>>> attached to a particular vCPU. These are functions which use pt_vcpu_lock()
>>> after the commit, such as pt_restore_timer(), pt_save_timer(), etc.
>>> 2. Functions which want to modify a particular periodic_time object.
>>> These guys lock whichever vCPU the periodic_time is attached to, but
>>> since the vCPU could be modified without holding any lock, they are
>>> vulnerable to the bug. Functions in this group use pt_lock(), such as
>>> pt_timer_fn() or destroy_periodic_time().
>>> 3. Functions which not only want to modify the periodic_time, but also
>>> would like to modify the vcpu field. These are create_periodic_time()
>>> or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
>>> but we can't simply hold 2 vcpu locks due to the deadlock risk.
>>>
>>> Roger Monné then pointed out that group 1 functions don't really need
>>> to hold the pt_migrate rwlock, and that group 3 should be hardened by
>>> holding the appropriate vcpu's tm_lock when adding or deleting a timer
>>> from its list.
>>
>> I'm struggling some with the latter aspect: Is this to mean there is
>> something wrong right now?
> 
> There's nothing wrong right now AFAICT, because per-vcpu users (i.e.
> type 1) also hold the rw lock in read mode when iterating over the
> per-vcpu list.
> 
>> Or does "harden" really mean "just to be
>> on the safe side" here?
> 
> If the per-domain rw lock is no longer read-locked by type 1 users
> then type 2 and type 3 users need to make sure the per-vcpu lock is
> taken before adding or removing items from the per-vcpu lists, or else
> a type 1 user could see list corruption as a result of modifications
> done by type 2 or 3 without holding the per-vcpu lock.

Ah, right. Boris, may I then ask to avoid the somewhat ambiguous word
"harden" here then, and instead make clear that the new locking is in
fact "balancing" removal of locks elsewhere?

Jan



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29  9:56 ` Jan Beulich
  2021-03-29 10:40   ` Roger Pau Monné
@ 2021-03-29 15:04   ` Boris Ostrovsky
  2021-03-29 15:21     ` Jan Beulich
  1 sibling, 1 reply; 10+ messages in thread
From: Boris Ostrovsky @ 2021-03-29 15:04 UTC (permalink / raw)
  To: Jan Beulich, roger.pau
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel


On 3/29/21 5:56 AM, Jan Beulich wrote:
> On 27.03.2021 02:51, Boris Ostrovsky wrote:
>
>> @@ -543,8 +554,10 @@ void create_periodic_time(
>>      pt->cb = cb;
>>      pt->priv = data;
>>  
>> +    pt_vcpu_lock(pt->vcpu);
>>      pt->on_list = 1;
>>      list_add(&pt->list, &v->arch.hvm.tm_list);
>> +    pt_vcpu_unlock(pt->vcpu);
> I think it would be better (not just code generation wise) to use v
> here.
>
>> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>>          return;
>>  
>>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>> +
>> +    pt_vcpu_lock(pt->vcpu);
>> +    if ( pt->on_list )
>> +        list_del(&pt->list);
>> +    pt_vcpu_unlock(pt->vcpu);
> While these two obviously can't use v, ...
>
>>      pt->vcpu = v;
>> +
>> +    pt_vcpu_lock(pt->vcpu);
>>      if ( pt->on_list )
>>      {
>> -        list_del(&pt->list);
>>          list_add(&pt->list, &v->arch.hvm.tm_list);
>>          migrate_timer(&pt->timer, v->processor);
>>      }
>> +    pt_vcpu_unlock(pt->vcpu);
> ... these two again could (and imo should), and ...
>
>>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
> ... really this and its counterpart better would do so, too (albeit
> perhaps in a separate patch).


Are you suggesting to replace pt->vcpu with v here? They are different at lock and unlock points (although they obviously point to the same domain).


>
> While I see that pt_adjust_global_vcpu_target() (the only caller of
> pt_adjust_vcpu()) already avoids calling here when the vcpu there
> doesn't really change, I also think that with all this extra locking
> the function now would better short-circuit the case of
> pt->vcpu == v upon entry (or more precisely once the write lock was
> acquired).


Sure. I'll add this (and address comment changes from you and Roger).


-boris






* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29 15:04   ` Boris Ostrovsky
@ 2021-03-29 15:21     ` Jan Beulich
  2021-03-29 15:31       ` Boris Ostrovsky
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2021-03-29 15:21 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel, roger.pau

On 29.03.2021 17:04, Boris Ostrovsky wrote:
> On 3/29/21 5:56 AM, Jan Beulich wrote:
>> On 27.03.2021 02:51, Boris Ostrovsky wrote:
>>> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>>>          return;
>>>  
>>>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>>> +
>>> +    pt_vcpu_lock(pt->vcpu);
>>> +    if ( pt->on_list )
>>> +        list_del(&pt->list);
>>> +    pt_vcpu_unlock(pt->vcpu);
>> While these two obviously can't use v, ...
>>
>>>      pt->vcpu = v;
>>> +
>>> +    pt_vcpu_lock(pt->vcpu);
>>>      if ( pt->on_list )
>>>      {
>>> -        list_del(&pt->list);
>>>          list_add(&pt->list, &v->arch.hvm.tm_list);
>>>          migrate_timer(&pt->timer, v->processor);
>>>      }
>>> +    pt_vcpu_unlock(pt->vcpu);
>> ... these two again could (and imo should), and ...
>>
>>>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>> ... really this and its counterpart better would do so, too (albeit
>> perhaps in a separate patch).
> 
> 
> Are you suggesting to replace pt->vcpu with v here?

Yes.

> They are different at lock and unlock points (although they obviously point to the same domain).

Indeed, but all we care about is - as you say - the domain.

Jan



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29 15:21     ` Jan Beulich
@ 2021-03-29 15:31       ` Boris Ostrovsky
  2021-03-29 16:22         ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Boris Ostrovsky @ 2021-03-29 15:31 UTC (permalink / raw)
  To: Jan Beulich
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel, roger.pau


On 3/29/21 11:21 AM, Jan Beulich wrote:
> On 29.03.2021 17:04, Boris Ostrovsky wrote:
>> On 3/29/21 5:56 AM, Jan Beulich wrote:
>>> On 27.03.2021 02:51, Boris Ostrovsky wrote:
>>>> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>>>>          return;
>>>>  
>>>>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>>>> +
>>>> +    pt_vcpu_lock(pt->vcpu);
>>>> +    if ( pt->on_list )
>>>> +        list_del(&pt->list);
>>>> +    pt_vcpu_unlock(pt->vcpu);
>>> While these two obviously can't use v, ...
>>>
>>>>      pt->vcpu = v;
>>>> +
>>>> +    pt_vcpu_lock(pt->vcpu);
>>>>      if ( pt->on_list )
>>>>      {
>>>> -        list_del(&pt->list);
>>>>          list_add(&pt->list, &v->arch.hvm.tm_list);
>>>>          migrate_timer(&pt->timer, v->processor);
>>>>      }
>>>> +    pt_vcpu_unlock(pt->vcpu);
>>> ... these two again could (and imo should), and ...
>>>
>>>>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>>> ... really this and its counterpart better would do so, too (albeit
>>> perhaps in a separate patch).
>>
>> Are you suggesting to replace pt->vcpu with v here?
> Yes.
>
>> They are different at lock and unlock points (although they obviously point to the same domain).
> Indeed, but all we care about is - as you say - the domain.


Hmm.. I think then it's better to stash the domain (or, better, pl_time) into a local variable. Otherwise starting with different pointers in lock and unlock paths (even though they eventually lead to the same lock) makes me a bit uncomfortable.
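
I.e. something like (sketch):

    static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
    {
        struct pl_time *pl_time = pt->vcpu->domain->arch.hvm.pl_time;
        ...
        write_lock(&pl_time->pt_migrate);
        ...
        write_unlock(&pl_time->pt_migrate);
    }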


Secondly, do you want this and the check for pt->vcpu == v in pt_adjust_vcpu() to be done in two separate patches, or can they go into a single one?


-boris




* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-29 15:31       ` Boris Ostrovsky
@ 2021-03-29 16:22         ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2021-03-29 16:22 UTC (permalink / raw)
  To: Boris Ostrovsky
  Cc: andrew.cooper3, wl, stephen.s.brennan, iwj, xen-devel, roger.pau

On 29.03.2021 17:31, Boris Ostrovsky wrote:
> 
> On 3/29/21 11:21 AM, Jan Beulich wrote:
>> On 29.03.2021 17:04, Boris Ostrovsky wrote:
>>> On 3/29/21 5:56 AM, Jan Beulich wrote:
>>>> On 27.03.2021 02:51, Boris Ostrovsky wrote:
>>>>> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>>>>>          return;
>>>>>  
>>>>>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>>>>> +
>>>>> +    pt_vcpu_lock(pt->vcpu);
>>>>> +    if ( pt->on_list )
>>>>> +        list_del(&pt->list);
>>>>> +    pt_vcpu_unlock(pt->vcpu);
>>>> While these two obviously can't use v, ...
>>>>
>>>>>      pt->vcpu = v;
>>>>> +
>>>>> +    pt_vcpu_lock(pt->vcpu);
>>>>>      if ( pt->on_list )
>>>>>      {
>>>>> -        list_del(&pt->list);
>>>>>          list_add(&pt->list, &v->arch.hvm.tm_list);
>>>>>          migrate_timer(&pt->timer, v->processor);
>>>>>      }
>>>>> +    pt_vcpu_unlock(pt->vcpu);
>>>> ... these two again could (and imo should), and ...
>>>>
>>>>>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>>>> ... really this and its counterpart better would do so, too (albeit
>>>> perhaps in a separate patch).
>>>
>>> Are you suggesting to replace pt->vcpu with v here?
>> Yes.
>>
>>> They are different at lock and unlock points (although they obviously point to the same domain).
>> Indeed, but all we care about is - as you say - the domain.
> 
> 
> Hmm.. I think then it's better to stash the domain (or, better, pl_time) into a local variable. Otherwise starting with different pointers in lock and unlock paths (even though they eventually lead to the same lock) makes me a bit uncomfortable.
> 
> 
> Secondly, do you want this and the check for pt->vcpu == v in pt_adjust_vcpu() to be done in two separate patches, or can they go into a single one?

This one should be right in your base patch. What I think may better
be in a separate one is the adjustment to write_lock() / write_unlock().

Jan



* Re: [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases
  2021-03-27  1:51 [PATCH v2] x86/vpt: Do not take pt_migrate rwlock in some cases Boris Ostrovsky
  2021-03-29  9:56 ` Jan Beulich
  2021-03-29 10:31 ` Roger Pau Monné
@ 2021-03-29 17:44 ` Stephen Brennan
  2 siblings, 0 replies; 10+ messages in thread
From: Stephen Brennan @ 2021-03-29 17:44 UTC (permalink / raw)
  To: Boris Ostrovsky, xen-devel
  Cc: jbeulich, andrew.cooper3, roger.pau, wl, boris.ostrovsky, iwj

Boris Ostrovsky <boris.ostrovsky@oracle.com> writes:

> Commit 8e76aef72820 ("x86/vpt: fix race when migrating timers between
> vCPUs") addressed XSA-336 by introducing a per-domain rwlock that was
> intended to protect periodic timers during VCPU migration. Since such
> migration is an infrequent event, no performance impact was expected.
>
> Unfortunately this turned out not to be the case: on a fairly large
> guest (92 VCPUs) we've observed as much as a 40% TPCC performance
> regression with some guest kernels. Further investigation pointed to
> the pt_migrate read lock taken in pt_update_irq() as the largest contributor
> to this regression. With a large number of VCPUs and a large number of VMEXITs
> (from where pt_update_irq() is always called), the update of an atomic in
> read_lock() is thought to be the main cause.
>
> Stephen Brennan analyzed the locking pattern and classified lock users as
> follows:
>
> 1. Functions which read (maybe write) all periodic_time instances
> attached to a particular vCPU. These are functions which use pt_vcpu_lock()
> after the commit, such as pt_restore_timer(), pt_save_timer(), etc.

I think "the commit" is now a bit ambiguous, it was intended to refer to
8e76aef72820, the fix for XSA-336. Maybe it would be easier to simply
drop the phrase "after the commit" since we're discussing the state of
the code prior to this patch.

> 2. Functions which want to modify a particular periodic_time object.
> These guys lock whichever vCPU the periodic_time is attached to, but
> since the vCPU could be modified without holding any lock, they are
> vulnerable to the bug. Functions in this group use pt_lock(), such as

s/the bug/XSA-336/ may make more sense in this context? Just to
distinguish from the performance issue.

Code changes look good.

Reviewed-by: Stephen Brennan <stephen.s.brennan@oracle.com>

> pt_timer_fn() or destroy_periodic_time().
> 3. Functions which not only want to modify the periodic_time, but also
> would like to modify the vcpu field. These are create_periodic_time()
> or pt_adjust_vcpu(). They create the locking imbalance bug for group 2,
> but we can't simply hold 2 vcpu locks due to the deadlock risk.
>
> Roger Monné then pointed out that group 1 functions don't really need
> to hold the pt_migrate rwlock, and that group 3 should be hardened by
> holding the appropriate vcpu's tm_lock when adding or deleting a timer
> from its list.
>
> Suggested-by: Stephen Brennan <stephen.s.brennan@oracle.com>
> Suggested-by: Roger Pau Monne <roger.pau@citrix.com>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> ---
> v2: Drop per-periodic_time spinlock and keep pt_migrate rwlock (and thus
>     change patch subject)
>
>  xen/arch/x86/hvm/vpt.c        | 40 +++++++++++++++++++++++++++++++---------
>  xen/include/asm-x86/hvm/vpt.h |  8 ++++----
>  2 files changed, 35 insertions(+), 13 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
> index 4c2afe2e9154..893641f20e1c 100644
> --- a/xen/arch/x86/hvm/vpt.c
> +++ b/xen/arch/x86/hvm/vpt.c
> @@ -153,32 +153,43 @@ static int pt_irq_masked(struct periodic_time *pt)
>      return 1;
>  }
>  
> +/*
> + * Functions which read (maybe write) all periodic_time instances
> + * attached to a particular vCPU use these locking helpers.
> + *
> + * Such users are explicitly forbidden from changing the value of the
> + * pt->vcpu field, because another thread holding the pt_migrate lock
> + * may already be spinning waiting for your vcpu lock.
> + */
>  static void pt_vcpu_lock(struct vcpu *v)
>  {
> -    read_lock(&v->domain->arch.hvm.pl_time->pt_migrate);
>      spin_lock(&v->arch.hvm.tm_lock);
>  }
>  
>  static void pt_vcpu_unlock(struct vcpu *v)
>  {
>      spin_unlock(&v->arch.hvm.tm_lock);
> -    read_unlock(&v->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
> +/*
> + * Functions which want to modify a particular periodic_time object
> + * use these locking helpers.
> + *
> + * These users lock whichever vCPU the periodic_time is attached to,
> + * but since the vCPU could be modified without holding any lock, they
> + * need to take an additional lock that protects against pt->vcpu
> + * changing.
> + */
>  static void pt_lock(struct periodic_time *pt)
>  {
> -    /*
> -     * We cannot use pt_vcpu_lock here, because we need to acquire the
> -     * per-domain lock first and then (re-)fetch the value of pt->vcpu, or
> -     * else we might be using a stale value of pt->vcpu.
> -     */
>      read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>      spin_lock(&pt->vcpu->arch.hvm.tm_lock);
>  }
>  
>  static void pt_unlock(struct periodic_time *pt)
>  {
> -    pt_vcpu_unlock(pt->vcpu);
> +    spin_unlock(&pt->vcpu->arch.hvm.tm_lock);
> +    read_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
>  static void pt_process_missed_ticks(struct periodic_time *pt)
> @@ -543,8 +554,10 @@ void create_periodic_time(
>      pt->cb = cb;
>      pt->priv = data;
>  
> +    pt_vcpu_lock(pt->vcpu);
>      pt->on_list = 1;
>      list_add(&pt->list, &v->arch.hvm.tm_list);
> +    pt_vcpu_unlock(pt->vcpu);
>  
>      init_timer(&pt->timer, pt_timer_fn, pt, v->processor);
>      set_timer(&pt->timer, pt->scheduled);
> @@ -580,13 +593,22 @@ static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v)
>          return;
>  
>      write_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
> +
> +    pt_vcpu_lock(pt->vcpu);
> +    if ( pt->on_list )
> +        list_del(&pt->list);
> +    pt_vcpu_unlock(pt->vcpu);
> +
>      pt->vcpu = v;
> +
> +    pt_vcpu_lock(pt->vcpu);
>      if ( pt->on_list )
>      {
> -        list_del(&pt->list);
>          list_add(&pt->list, &v->arch.hvm.tm_list);
>          migrate_timer(&pt->timer, v->processor);
>      }
> +    pt_vcpu_unlock(pt->vcpu);
> +
>      write_unlock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
>  }
>  
> diff --git a/xen/include/asm-x86/hvm/vpt.h b/xen/include/asm-x86/hvm/vpt.h
> index 39d26cbda496..f3c2a439630a 100644
> --- a/xen/include/asm-x86/hvm/vpt.h
> +++ b/xen/include/asm-x86/hvm/vpt.h
> @@ -129,10 +129,10 @@ struct pl_time {    /* platform time */
>      struct HPETState vhpet;
>      struct PMTState  vpmt;
>      /*
> -     * rwlock to prevent periodic_time vCPU migration. Take the lock in read
> -     * mode in order to prevent the vcpu field of periodic_time from changing.
> -     * Lock must be taken in write mode when changes to the vcpu field are
> -     * performed, as it allows exclusive access to all the timers of a domain.
> +     * Functions which want to modify the vcpu field of the vpt need to
> +     * hold the global lock (pt_migrate) in write mode together with the
> +     * per-vcpu locks of the lists being modified. Note that two vcpu
> +     * locks cannot be held at the same time to avoid a deadlock.
>       */
>      rwlock_t pt_migrate;
>      /* guest_time = Xen sys time + stime_offset */
> -- 
> 1.8.3.1

