[PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors

* [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
@ 2016-04-01  9:15 Jan Beulich
  2016-04-01  9:58 ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01  9:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 5556 bytes --]

Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order for msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a
still-in-progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
TODO: remove //temp-s
TODO: (follow-up patch) defer and conditionalize msixtbl_init() invocation
TODO: (follow-up patch) drop msixtbl_list_lock
TODO: (follow-up patch) drop pci_msix_get_table_len()

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -276,6 +276,7 @@ static int msixtbl_write(struct vcpu *v,
     if ( !entry )
         goto out;
     nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+printk("%pv: write MSI-X#%u: [%lx]=%0*lx\n", v, nr_entry, address, (int)len * 2, val);//temp
 
     offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
     if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
@@ -321,7 +322,16 @@ static int msixtbl_write(struct vcpu *v,
 
     ASSERT(msi_desc == desc->msi_desc);
    
+{//temp
+ bool_t h = msi_desc->msi_attrib.host_masked;
+ bool_t g = msi_desc->msi_attrib.guest_masked;
+ bool_t ha = entry->pdev->msix->host_maskall;
+ bool_t ga = entry->pdev->msix->guest_maskall;
     guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
+ printk("%pv: MSI-X#%u %d(%d) / %d(%d) -> %d(%d) / %d(%d)\n", v, nr_entry, h, ha, g, ga,
+        msi_desc->msi_attrib.host_masked, entry->pdev->msix->host_maskall,
+        msi_desc->msi_attrib.guest_masked, entry->pdev->msix->guest_maskall);
+}
 
 unlock:
     spin_unlock_irqrestore(&desc->lock, flags);
@@ -330,6 +340,7 @@ unlock:
 
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
+printk("%pv: write MSI-X [%lx] -> %d\n", v, address, r);//temp
     return r;
 }
 
@@ -341,7 +352,20 @@ static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -360,6 +384,7 @@ static void add_msixtbl_entry(struct dom
     atomic_set(&entry->refcnt, 0);
 
     entry->table_len = pci_msix_get_table_len(pdev);
+WARN_ON(entry->table_len != (pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE));//temp
     entry->pdev = pdev;
     entry->gtable = (unsigned long) gtable;
 
@@ -410,9 +435,6 @@ int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +459,24 @@ found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+printk("d%d: MSI-X#%u %u@%lx\n", d->domain_id, msi_desc->msi_attrib.entry_nr, entry->table_len, gtable);//temp
+        for_each_vcpu ( d, v )
+        {
+if(v->arch.hvm_vcpu.hvm_io.msix_snoop_address)//temp
+ printk("%pv: MSI-X#%u snoop %08lx\n", v, msi_desc->msi_attrib.entry_nr, v->arch.hvm_vcpu.hvm_io.msix_snoop_address);//temp
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +497,6 @@ void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +559,8 @@ void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@ struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };



[-- Attachment #2: x86-vMSI-X-first-unmask.patch --]
[-- Type: text/plain, Size: 5605 bytes --]

x86/vMSI-X: avoid missing first unmask of vectors

Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order for msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a
still-in-progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
TODO: remove //temp-s
TODO: (follow-up patch) defer and conditionalize msixtbl_init() invocation
TODO: (follow-up patch) drop msixtbl_list_lock
TODO: (follow-up patch) drop pci_msix_get_table_len()

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -276,6 +276,7 @@ static int msixtbl_write(struct vcpu *v,
     if ( !entry )
         goto out;
     nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+printk("%pv: write MSI-X#%u: [%lx]=%0*lx\n", v, nr_entry, address, (int)len * 2, val);//temp
 
     offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
     if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
@@ -321,7 +322,16 @@ static int msixtbl_write(struct vcpu *v,
 
     ASSERT(msi_desc == desc->msi_desc);
    
+{//temp
+ bool_t h = msi_desc->msi_attrib.host_masked;
+ bool_t g = msi_desc->msi_attrib.guest_masked;
+ bool_t ha = entry->pdev->msix->host_maskall;
+ bool_t ga = entry->pdev->msix->guest_maskall;
     guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
+ printk("%pv: MSI-X#%u %d(%d) / %d(%d) -> %d(%d) / %d(%d)\n", v, nr_entry, h, ha, g, ga,
+        msi_desc->msi_attrib.host_masked, entry->pdev->msix->host_maskall,
+        msi_desc->msi_attrib.guest_masked, entry->pdev->msix->guest_maskall);
+}
 
 unlock:
     spin_unlock_irqrestore(&desc->lock, flags);
@@ -330,6 +340,7 @@ unlock:
 
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
+printk("%pv: write MSI-X [%lx] -> %d\n", v, address, r);//temp
     return r;
 }
 
@@ -341,7 +352,20 @@ static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -360,6 +384,7 @@ static void add_msixtbl_entry(struct dom
     atomic_set(&entry->refcnt, 0);
 
     entry->table_len = pci_msix_get_table_len(pdev);
+WARN_ON(entry->table_len != (pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE));//temp
     entry->pdev = pdev;
     entry->gtable = (unsigned long) gtable;
 
@@ -410,9 +435,6 @@ int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +459,24 @@ found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+printk("d%d: MSI-X#%u %u@%lx\n", d->domain_id, msi_desc->msi_attrib.entry_nr, entry->table_len, gtable);//temp
+        for_each_vcpu ( d, v )
+        {
+if(v->arch.hvm_vcpu.hvm_io.msix_snoop_address)//temp
+ printk("%pv: MSI-X#%u snoop %08lx\n", v, msi_desc->msi_attrib.entry_nr, v->arch.hvm_vcpu.hvm_io.msix_snoop_address);//temp
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +497,6 @@ void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +559,8 @@ void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@ struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread