From: "Xuquan (Quan Xu)" <xuquan8@huawei.com>
To: Chao Gao <chao.gao@intel.com>,
	"xen-devel@lists.xen.org" <xen-devel@lists.xen.org>
Cc: Kevin Tian <kevin.tian@intel.com>,
	Jan Beulich <jbeulich@suse.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Jun Nakajima <jun.nakajima@intel.com>,
	Boris Ostrovsky <boris.ostrovsky@oracle.com>,
	Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Subject: Re: [RFC PATCH] hvm/vpt: fix inconsistent views of vIOAPIC in vmx_intr_assist()
Date: Wed, 12 Apr 2017 02:45:36 +0000	[thread overview]
Message-ID: <E0A769A898ADB6449596C41F51EF62C6B02DF2@SZXEMI506-MBX.china.huawei.com> (raw)
In-Reply-To: <1491535456-12682-1-git-send-email-chao.gao@intel.com>

On April 07, 2017 11:24 AM, Chao Gao wrote:
>When injecting a periodic timer interrupt in vmx_intr_assist(), multiple
>reads are performed during one event delivery, which can yield inconsistent
>views of the vIOAPIC. For example, if a periodic timer interrupt comes from
>the PIT, the corresponding RTE is accessed in pt_update_irq() when setting
>the corresponding bit in the vIRR. After this function returns, the RTE is
>accessed again to get the vector that was set in the vIRR. Between the two
>accesses, the content of the RTE may have been changed by another CPU, since
>no protection is in place. This can trigger the assertion failure in
>vmx_intr_assist(). Also, after a periodic timer interrupt is injected
>successfully, pt_irq_posted() decreases the pending count (pending_intr_nr);
>but if the corresponding RTE has been changed, pt_irq_posted() will not
>decrease the count, resulting in one extra timer interrupt.
>
>More discussion and history can be found in
>1. https://lists.xenproject.org/archives/html/xen-devel/2017-03/msg00906.html
>2. https://lists.xenproject.org/archives/html/xen-devel/2017-01/msg01019.html
>
>This patch simply uses irq_lock to avoid these inconsistent views of vIOAPIC.

Is the vLAPIC in a similar situation?


>To fix the resulting deadlock, provide locked versions of several functions.
>
>Signed-off-by: Chao Gao <chao.gao@intel.com>
>---
> xen/arch/x86/hvm/irq.c      | 22 ++++++++++++++++------
> xen/arch/x86/hvm/svm/intr.c | 21 +++++++++++++++------
> xen/arch/x86/hvm/vmx/intr.c |  9 +++++++++
> xen/arch/x86/hvm/vpic.c     | 20 ++++++++++++++------
> xen/arch/x86/hvm/vpt.c      |  4 ++--
> xen/include/xen/hvm/irq.h   |  4 ++++
> 6 files changed, 60 insertions(+), 20 deletions(-)
>
>diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
>index 8625584..a1af61e 100644
>--- a/xen/arch/x86/hvm/irq.c
>+++ b/xen/arch/x86/hvm/irq.c
>@@ -126,37 +126,47 @@ void hvm_pci_intx_deassert(
>     spin_unlock(&d->arch.hvm_domain.irq_lock);
> }
>
>-void hvm_isa_irq_assert(
>+void hvm_isa_irq_assert_locked(
>     struct domain *d, unsigned int isa_irq)
> {
>     struct hvm_irq *hvm_irq = hvm_domain_irq(d);
>     unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
>
>     ASSERT(isa_irq <= 15);
>-
>-    spin_lock(&d->arch.hvm_domain.irq_lock);
>+    ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock));
>
>     if ( !__test_and_set_bit(isa_irq, &hvm_irq->isa_irq.i) &&
>          (hvm_irq->gsi_assert_count[gsi]++ == 0) )
>         assert_irq(d, gsi, isa_irq);
>+}
>
>+void hvm_isa_irq_assert(
>+    struct domain *d, unsigned int isa_irq)
>+{
>+    spin_lock(&d->arch.hvm_domain.irq_lock);
>+    hvm_isa_irq_assert_locked(d, isa_irq);
>     spin_unlock(&d->arch.hvm_domain.irq_lock);
> }
>
>-void hvm_isa_irq_deassert(
>+void hvm_isa_irq_deassert_locked(
>     struct domain *d, unsigned int isa_irq)
> {
>     struct hvm_irq *hvm_irq = hvm_domain_irq(d);
>     unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
>
>     ASSERT(isa_irq <= 15);
>-
>-    spin_lock(&d->arch.hvm_domain.irq_lock);
>+    ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock));
>
>     if ( __test_and_clear_bit(isa_irq, &hvm_irq->isa_irq.i) &&
>          (--hvm_irq->gsi_assert_count[gsi] == 0) )
>         deassert_irq(d, isa_irq);
>+}
>
>+void hvm_isa_irq_deassert(
>+    struct domain *d, unsigned int isa_irq)
>+{
>+    spin_lock(&d->arch.hvm_domain.irq_lock);
>+    hvm_isa_irq_deassert_locked(d, isa_irq);
>     spin_unlock(&d->arch.hvm_domain.irq_lock);
> }
>
>diff --git a/xen/arch/x86/hvm/svm/intr.c b/xen/arch/x86/hvm/svm/intr.c
>index 8511ff0..d2a318c 100644
>--- a/xen/arch/x86/hvm/svm/intr.c
>+++ b/xen/arch/x86/hvm/svm/intr.c
>@@ -137,18 +137,25 @@ void svm_intr_assist(void)
>     struct hvm_intack intack;
>     enum hvm_intblk intblk;
>
>+    /*
>+     * Avoid the vIOAPIC RTE being changed by another vCPU.
>+     * Otherwise, pt_update_irq() may return a wrong vector which is not in
>+     * vIRR and pt_irq_posted() may not recognize a timer interrupt has been
>+     * injected.
>+     */
>+    spin_lock(&v->domain->arch.hvm_domain.irq_lock);
>     /* Crank the handle on interrupt state. */
>     pt_update_irq(v);
>
>     do {
>         intack = hvm_vcpu_has_pending_irq(v);
>         if ( likely(intack.source == hvm_intsrc_none) )
>-            return;
>+            goto out;
>
>         intblk = hvm_interrupt_blocked(v, intack);
>         if ( intblk == hvm_intblk_svm_gif ) {
>             ASSERT(nestedhvm_enabled(v->domain));
>-            return;
>+            goto out;
>         }
>
>         /* Interrupts for the nested guest are already
>@@ -165,13 +172,13 @@ void svm_intr_assist(void)
>             switch (rc) {
>             case NSVM_INTR_NOTINTERCEPTED:
>                 /* Inject interrupt into 2nd level guest directly. */
>-                break;
>+                goto out;
>             case NSVM_INTR_NOTHANDLED:
>             case NSVM_INTR_FORCEVMEXIT:
>-                return;
>+                goto out;
>             case NSVM_INTR_MASKED:
>                 /* Guest already enabled an interrupt window. */
>-                return;
>+                goto out;
>             default:
>                 panic("%s: nestedsvm_vcpu_interrupt can't handle value %#x",
>                     __func__, rc);
>@@ -195,7 +202,7 @@ void svm_intr_assist(void)
>         if ( unlikely(vmcb->eventinj.fields.v) || intblk )
>         {
>             svm_enable_intr_window(v, intack);
>-            return;
>+            goto out;
>         }
>
>         intack = hvm_vcpu_ack_pending_irq(v, intack);
>@@ -216,6 +223,8 @@ void svm_intr_assist(void)
>     intack = hvm_vcpu_has_pending_irq(v);
>     if ( unlikely(intack.source != hvm_intsrc_none) )
>         svm_enable_intr_window(v, intack);
>+ out:
>+    spin_unlock(&v->domain->arch.hvm_domain.irq_lock);
> }
>
> /*
>diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
>index 1eb9f38..8f3d1a7 100644
>--- a/xen/arch/x86/hvm/vmx/intr.c
>+++ b/xen/arch/x86/hvm/vmx/intr.c
>@@ -234,6 +234,14 @@ void vmx_intr_assist(void)
>         return;
>     }
>
>+    /*
>+     * Avoid the vIOAPIC RTE being changed by another vCPU.

This comment seems inconsistent with the patch description, which says 'by another CPU' rather than 'by another vCPU'.


-Quan



>+     * Otherwise, pt_update_irq() may return a wrong vector which is not in
>+     * vIRR and pt_irq_posted() may not recognize a timer interrupt has been
>+     * injected.
>+     */
>+    spin_lock(&v->domain->arch.hvm_domain.irq_lock);
>+
>     /* Crank the handle on interrupt state. */
>     if ( is_hvm_vcpu(v) )
>         pt_vector = pt_update_irq(v);
>@@ -396,6 +404,7 @@ void vmx_intr_assist(void)
>     }
>
>  out:
>+    spin_unlock(&v->domain->arch.hvm_domain.irq_lock);
>     if ( !nestedhvm_vcpu_in_guestmode(v) &&
>          !cpu_has_vmx_virtual_intr_delivery &&
>          cpu_has_vmx_tpr_shadow )
>diff --git a/xen/arch/x86/hvm/vpic.c b/xen/arch/x86/hvm/vpic.c
>index e160bbd..8d02e52 100644
>--- a/xen/arch/x86/hvm/vpic.c
>+++ b/xen/arch/x86/hvm/vpic.c
>@@ -153,14 +153,13 @@ static void __vpic_intack(struct hvm_hw_vpic *vpic, int irq)
>     vpic_update_int_output(vpic);
> }
>
>-static int vpic_intack(struct hvm_hw_vpic *vpic)
>+static int vpic_intack_locked(struct hvm_hw_vpic *vpic)
> {
>     int irq = -1;
>
>-    vpic_lock(vpic);
>-
>+    ASSERT(vpic_is_locked(vpic));
>     if ( !vpic->int_output )
>-        goto out;
>+        return irq;
>
>     irq = vpic_get_highest_priority_irq(vpic);
>     BUG_ON(irq < 0);
>@@ -175,7 +174,15 @@ static int vpic_intack(struct hvm_hw_vpic *vpic)
>         irq += 8;
>     }
>
>- out:
>+    return irq;
>+}
>+
>+static int vpic_intack(struct hvm_hw_vpic *vpic)
>+{
>+    int irq;
>+
>+    vpic_lock(vpic);
>+    irq = vpic_intack_locked(vpic);
>     vpic_unlock(vpic);
>     return irq;
> }
>@@ -487,13 +494,14 @@ int vpic_ack_pending_irq(struct vcpu *v)
>     struct hvm_hw_vpic *vpic = &v->domain->arch.hvm_domain.vpic[0];
>
>     ASSERT(has_vpic(v->domain));
>+    ASSERT(vpic_is_locked(vpic));
>
>     TRACE_2D(TRC_HVM_EMUL_PIC_PEND_IRQ_CALL, vlapic_accept_pic_intr(v),
>              vpic->int_output);
>     if ( !vlapic_accept_pic_intr(v) || !vpic->int_output )
>         return -1;
>
>-    irq = vpic_intack(vpic);
>+    irq = vpic_intack_locked(vpic);
>     if ( irq == -1 )
>         return -1;
>
>diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
>index e3f2039..d048f26 100644
>--- a/xen/arch/x86/hvm/vpt.c
>+++ b/xen/arch/x86/hvm/vpt.c
>@@ -296,8 +296,8 @@ int pt_update_irq(struct vcpu *v)
>         vlapic_set_irq(vcpu_vlapic(v), irq, 0);
>     else
>     {
>-        hvm_isa_irq_deassert(v->domain, irq);
>-        hvm_isa_irq_assert(v->domain, irq);
>+        hvm_isa_irq_deassert_locked(v->domain, irq);
>+        hvm_isa_irq_assert_locked(v->domain, irq);
>     }
>
>     /*
>diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
>index f041252..a8f36f8 100644
>--- a/xen/include/xen/hvm/irq.h
>+++ b/xen/include/xen/hvm/irq.h
>@@ -119,8 +119,12 @@ void hvm_pci_intx_deassert(
> /* Modify state of an ISA device's IRQ wire. */
> void hvm_isa_irq_assert(
>     struct domain *d, unsigned int isa_irq);
>+void hvm_isa_irq_assert_locked(
>+    struct domain *d, unsigned int isa_irq);
> void hvm_isa_irq_deassert(
>     struct domain *d, unsigned int isa_irq);
>+void hvm_isa_irq_deassert_locked(
>+    struct domain *d, unsigned int isa_irq);
>
> int hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq);
>
>--
>1.8.3.1


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

Thread overview: 10+ messages
2017-04-07  3:24 [RFC PATCH] hvm/vpt: fix inconsistent views of vIOAPIC in vmx_intr_assist() Chao Gao
2017-04-12  2:45 ` Xuquan (Quan Xu) [this message]
2017-04-11 20:01   ` Chao Gao
2017-04-11 21:22   ` Chao Gao
2017-04-21  2:42 ` Chao Gao
2017-04-21  9:54   ` Jan Beulich
2017-04-21 10:56 ` Tian, Kevin
2017-04-21  4:22   ` Chao Gao
2017-04-21 14:00     ` Tian, Kevin
2017-04-21  7:11       ` Chao Gao
