* [PATCH 0/4] x86/MSI-X: XSA-120 follow-up
@ 2015-03-10 16:16 Jan Beulich
  2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-10 16:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

The problem requiring the first patch here is actually what led to
XSA-120.

1: be more careful during teardown
2: access MSI-X table only after having enabled MSI-X
3: reduce fiddling with control register during restore
4: cleanup

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/4] x86/MSI-X: be more careful during teardown
  2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
@ 2015-03-10 16:27 ` Jan Beulich
  2015-03-10 21:05   ` Andrew Cooper
  2015-03-13 18:10   ` Andrew Cooper
  2015-03-10 16:28 ` [PATCH 2/4] x86/MSI-X: access MSI-X table only after having enabled MSI-X Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-10 16:27 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

When a device gets detached from a guest, pciback will clear its
command register, thus disabling both memory and I/O decoding. The
disabled memory decoding, however, has an effect on the MSI-X table
accesses the hypervisor does: These won't have the intended effect
anymore. Even worse, for PCIe devices (but not SR-IOV virtual
functions) such accesses may (will?) be treated as Unsupported
Requests, causing respective errors to be surfaced, potentially in the
form of NMIs that may be fatal to the hypervisor or Dom0 in different
ways. Hence rather than carrying out these accesses, we should avoid
them where we can, and use alternative (e.g. PCI config space based)
mechanisms to achieve at least the same effect.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Backporting note (largely to myself):
   Depends on (not yet backported) commit 061eebe0e "x86/MSI: drop
   workaround for insecure Dom0 kernels" (due to re-use of struct
   arch_msix's warned field).
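
For context ahead of the diff, here is a minimal standalone sketch of the
check being introduced (not the patch itself): fake_conf_read16() is a
made-up stand-in for Xen's pci_conf_read16(), its hard-coded return value
simulates a device whose command register pciback has cleared, and only
PCI_COMMAND / PCI_COMMAND_MEMORY correspond to the real register layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PCI_COMMAND        0x04   /* command register offset */
#define PCI_COMMAND_MEMORY 0x02   /* memory space (decode) enable bit */

/* Made-up config space read: pretends pciback cleared the command register. */
static uint16_t fake_conf_read16(uint8_t bus, uint8_t slot, uint8_t func,
                                 unsigned int reg)
{
    (void)bus; (void)slot; (void)func;
    return reg == PCI_COMMAND ? 0x0000 : 0xffff;
}

/*
 * Core idea of the patch: MMIO to the MSI-X table is only meaningful while
 * memory decoding is enabled (for SR-IOV VFs the parent PF's decode bit is
 * what counts), so check the command register before touching the table.
 */
static bool memory_decoded(uint8_t bus, uint8_t slot, uint8_t func)
{
    return fake_conf_read16(bus, slot, func, PCI_COMMAND) & PCI_COMMAND_MEMORY;
}

int main(void)
{
    if ( !memory_decoded(0x03, 0x00, 0) )
        printf("skip MSI-X table access, fall back to config space\n");
    return 0;
}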

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
     spin_unlock(&msix->table_lock);
 }
 
+static bool_t memory_decoded(const struct pci_dev *dev)
+{
+    u8 bus, slot, func;
+
+    if ( !dev->info.is_virtfn )
+    {
+        bus = dev->bus;
+        slot = PCI_SLOT(dev->devfn);
+        func = PCI_FUNC(dev->devfn);
+    }
+    else
+    {
+        bus = dev->info.physfn.bus;
+        slot = PCI_SLOT(dev->info.physfn.devfn);
+        func = PCI_FUNC(dev->info.physfn.devfn);
+    }
+
+    return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
+              PCI_COMMAND_MEMORY);
+}
+
 /*
  * MSI message composition
  */
@@ -162,7 +183,7 @@ void msi_compose_msg(unsigned vector, co
     }
 }
 
-static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 {
     switch ( entry->msi_attrib.type )
     {
@@ -198,6 +219,8 @@ static void read_msi_msg(struct msi_desc
         void __iomem *base;
         base = entry->mask_base;
 
+        if ( unlikely(!memory_decoded(entry->dev)) )
+            return 0;
         msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
         msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
@@ -209,6 +232,8 @@ static void read_msi_msg(struct msi_desc
 
     if ( iommu_intremap )
         iommu_read_msi_from_ire(entry, msg);
+
+    return 1;
 }
 
 static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -260,6 +285,8 @@ static int write_msi_msg(struct msi_desc
         void __iomem *base;
         base = entry->mask_base;
 
+        if ( unlikely(!memory_decoded(entry->dev)) )
+            return -ENXIO;
         writel(msg->address_lo,
                base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         writel(msg->address_hi,
@@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d
     ASSERT(spin_is_locked(&desc->lock));
 
     memset(&msg, 0, sizeof(msg));
-    read_msi_msg(msi_desc, &msg);
+    if ( !read_msi_msg(msi_desc, &msg) )
+        return;
 
     msg.data &= ~MSI_DATA_VECTOR_MASK;
     msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
@@ -347,20 +375,24 @@ int msi_maskable_irq(const struct msi_de
            || entry->msi_attrib.maskbit;
 }
 
-static void msi_set_mask_bit(struct irq_desc *desc, int flag)
+static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag)
 {
     struct msi_desc *entry = desc->msi_desc;
+    struct pci_dev *pdev;
+    u16 seg;
+    u8 bus, slot, func;
 
     ASSERT(spin_is_locked(&desc->lock));
     BUG_ON(!entry || !entry->dev);
+    pdev = entry->dev;
+    seg = pdev->seg;
+    bus = pdev->bus;
+    slot = PCI_SLOT(pdev->devfn);
+    func = PCI_FUNC(pdev->devfn);
     switch (entry->msi_attrib.type) {
     case PCI_CAP_ID_MSI:
         if (entry->msi_attrib.maskbit) {
             u32 mask_bits;
-            u16 seg = entry->dev->seg;
-            u8 bus = entry->dev->bus;
-            u8 slot = PCI_SLOT(entry->dev->devfn);
-            u8 func = PCI_FUNC(entry->dev->devfn);
 
             mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
             mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
@@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_
         }
         break;
     case PCI_CAP_ID_MSIX:
-    {
-        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
-        writel(flag, entry->mask_base + offset);
-        readl(entry->mask_base + offset);
-        break;
-    }
+        if ( likely(memory_decoded(pdev)) )
+        {
+            writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+            readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+            break;
+        }
+        if ( flag )
+        {
+            u16 control;
+            domid_t domid = pdev->domain->domain_id;
+
+            control = pci_conf_read16(seg, bus, slot, func,
+                                      msix_control_reg(entry->msi_attrib.pos));
+            if ( control & PCI_MSIX_FLAGS_MASKALL )
+                break;
+            pci_conf_write16(seg, bus, slot, func,
+                             msix_control_reg(entry->msi_attrib.pos),
+                             control | PCI_MSIX_FLAGS_MASKALL);
+            if ( pdev->msix->warned != domid )
+            {
+                pdev->msix->warned = domid;
+                printk(XENLOG_G_WARNING
+                       "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
+                       desc->irq, domid, pdev->seg, pdev->bus,
+                       PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+            }
+            break;
+        }
+        /* fall through */
     default:
-        BUG();
-        break;
+        return 0;
     }
     entry->msi_attrib.masked = !!flag;
+
+    return 1;
 }
 
 static int msi_get_mask_bit(const struct msi_desc *entry)
 {
-    switch (entry->msi_attrib.type) {
+    if ( !entry->dev )
+        return -1;
+
+    switch ( entry->msi_attrib.type )
+    {
     case PCI_CAP_ID_MSI:
-        if (!entry->dev || !entry->msi_attrib.maskbit)
+        if ( !entry->msi_attrib.maskbit )
             break;
         return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
                                 PCI_SLOT(entry->dev->devfn),
@@ -394,6 +454,8 @@ static int msi_get_mask_bit(const struct
                                 entry->msi.mpos) >>
                 entry->msi_attrib.entry_nr) & 1;
     case PCI_CAP_ID_MSIX:
+        if ( unlikely(!memory_decoded(entry->dev)) )
+            break;
         return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
     }
     return -1;
@@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct
 
 void mask_msi_irq(struct irq_desc *desc)
 {
-    msi_set_mask_bit(desc, 1);
+    if ( unlikely(!msi_set_mask_bit(desc, 1)) )
+        BUG();
 }
 
 void unmask_msi_irq(struct irq_desc *desc)
 {
-    msi_set_mask_bit(desc, 0);
+    if ( unlikely(!msi_set_mask_bit(desc, 0)) )
+        WARN();
 }
 
 static unsigned int startup_msi_irq(struct irq_desc *desc)
@@ -713,6 +777,9 @@ static int msix_capability_init(struct p
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
     msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
 
+    if ( unlikely(!memory_decoded(dev)) )
+        return -ENXIO;
+
     if ( desc )
     {
         entry = alloc_msi_entry(1);
@@ -845,7 +912,8 @@ static int msix_capability_init(struct p
     ++msix->used_entries;
 
     /* Restore MSI-X enabled bits */
-    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
+    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                     control & ~PCI_MSIX_FLAGS_MASKALL);
 
     return 0;
 }
@@ -998,8 +1066,16 @@ static void __pci_disable_msix(struct ms
 
     BUG_ON(list_empty(&dev->msi_list));
 
-    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
+    if ( likely(memory_decoded(dev)) )
+        writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+    else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
+    {
+        printk(XENLOG_WARNING
+               "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",
+               entry->irq, dev->seg, dev->bus,
+               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+        control |= PCI_MSIX_FLAGS_MASKALL;
+    }
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
     _pci_cleanup_msix(dev->msix);
@@ -1137,14 +1213,23 @@ int pci_restore_msi_state(struct pci_dev
             nr = entry->msi.nvec;
         }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+        {
             msix_set_enable(pdev, 0);
+            if ( unlikely(!memory_decoded(pdev)) )
+            {
+                spin_unlock_irqrestore(&desc->lock, flags);
+                return -ENXIO;
+            }
+        }
 
         msg = entry->msg;
         write_msi_msg(entry, &msg);
 
         for ( i = 0; ; )
         {
-            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
+            if ( unlikely(!msi_set_mask_bit(desc,
+                                            entry[i].msi_attrib.masked)) )
+                BUG();
 
             if ( !--nr )
                 break;
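
To round the picture off, a simplified, self-contained version of the
config-space fallback the hunks above add to msi_set_mask_bit(): when a
vector needs masking but the device no longer decodes memory, the whole
function is masked through the MSI-X message control register instead.
conf_read16()/conf_write16() and the initial register value are mock-ups
for illustration; PCI_MSIX_FLAGS_MASKALL is the real bit 14 of that word.

#include <stdint.h>
#include <stdio.h>

#define PCI_MSIX_FLAGS_MASKALL 0x4000   /* function mask ("mask-all") bit */

static uint16_t msix_control = 0x8003;  /* simulated: MSI-X enabled, 4 entries */

static uint16_t conf_read16(void)       { return msix_control; }
static void conf_write16(uint16_t v)    { msix_control = v; }

/* With memory decoding off, fall back to masking the entire function. */
static void mask_without_decoding(void)
{
    uint16_t control = conf_read16();

    if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
    {
        conf_write16(control | PCI_MSIX_FLAGS_MASKALL);
        printf("masked all MSI-X vectors via the control register\n");
    }
}

int main(void)
{
    mask_without_decoding();
    return 0;
}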



* [PATCH 2/4] x86/MSI-X: access MSI-X table only after having enabled MSI-X
  2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
  2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
@ 2015-03-10 16:28 ` Jan Beulich
  2015-03-13 19:18   ` Andrew Cooper
  2015-03-10 16:29 ` [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore Jan Beulich
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2015-03-10 16:28 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a
better way") and its broken predecessor, make sure we don't access the
MSI-X table without having enabled MSI-X first, using the mask-all flag
instead to prevent interrupts from occurring.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
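
As a rough, self-contained illustration of the ordering this enforces in
msix_capability_init() (simulated control register and hypothetical helper
names; the flag values match the PCI spec, enable in bit 15 and function
mask in bit 14):

#include <stdint.h>
#include <stdio.h>

#define PCI_MSIX_FLAGS_ENABLE  0x8000   /* bit 15: MSI-X enable */
#define PCI_MSIX_FLAGS_MASKALL 0x4000   /* bit 14: function mask */

static uint16_t msix_control;           /* simulated message control word */

static uint16_t conf_read16(void)       { return msix_control; }
static void conf_write16(uint16_t v)    { msix_control = v; }

/* Stands in for programming one MSI-X table entry via MMIO. */
static void program_table_entry(void)   { printf("write MSI-X table entry\n"); }

/*
 * Enable MSI-X with every vector masked so the table may legally be
 * written, program the entries, then restore the original control value
 * (an error path would instead clear the enable bit again).
 */
static void setup_msix(void)
{
    uint16_t control = conf_read16();

    conf_write16(control | PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
    program_table_entry();
    conf_write16(control);
}

int main(void)
{
    setup_msix();
    return 0;
}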

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -142,6 +142,19 @@ static bool_t memory_decoded(const struc
               PCI_COMMAND_MEMORY);
 }
 
+static bool_t msix_memory_decoded(const struct pci_dev *dev, unsigned int pos)
+{
+    u16 control = pci_conf_read16(dev->seg, dev->bus,
+                                  PCI_SLOT(dev->devfn),
+                                  PCI_FUNC(dev->devfn),
+                                  msix_control_reg(pos));
+
+    if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+        return 0;
+
+    return memory_decoded(dev);
+}
+
 /*
  * MSI message composition
  */
@@ -219,7 +236,8 @@ static bool_t read_msi_msg(struct msi_de
         void __iomem *base;
         base = entry->mask_base;
 
-        if ( unlikely(!memory_decoded(entry->dev)) )
+        if ( unlikely(!msix_memory_decoded(entry->dev,
+                                           entry->msi_attrib.pos)) )
             return 0;
         msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -285,7 +303,8 @@ static int write_msi_msg(struct msi_desc
         void __iomem *base;
         base = entry->mask_base;
 
-        if ( unlikely(!memory_decoded(entry->dev)) )
+        if ( unlikely(!msix_memory_decoded(entry->dev,
+                                           entry->msi_attrib.pos)) )
             return -ENXIO;
         writel(msg->address_lo,
                base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
@@ -379,7 +398,7 @@ static bool_t msi_set_mask_bit(struct ir
 {
     struct msi_desc *entry = desc->msi_desc;
     struct pci_dev *pdev;
-    u16 seg;
+    u16 seg, control;
     u8 bus, slot, func;
 
     ASSERT(spin_is_locked(&desc->lock));
@@ -401,35 +420,38 @@ static bool_t msi_set_mask_bit(struct ir
         }
         break;
     case PCI_CAP_ID_MSIX:
+        control = pci_conf_read16(seg, bus, slot, func,
+                                  msix_control_reg(entry->msi_attrib.pos));
+        if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+            pci_conf_write16(seg, bus, slot, func,
+                             msix_control_reg(entry->msi_attrib.pos),
+                             control | (PCI_MSIX_FLAGS_ENABLE |
+                                        PCI_MSIX_FLAGS_MASKALL));
         if ( likely(memory_decoded(pdev)) )
         {
             writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
             readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-            break;
+            if ( likely(control & PCI_MSIX_FLAGS_ENABLE) )
+                break;
+            flag = 1;
         }
-        if ( flag )
+        else if ( flag && !(control & PCI_MSIX_FLAGS_MASKALL) )
         {
-            u16 control;
             domid_t domid = pdev->domain->domain_id;
 
-            control = pci_conf_read16(seg, bus, slot, func,
-                                      msix_control_reg(entry->msi_attrib.pos));
-            if ( control & PCI_MSIX_FLAGS_MASKALL )
-                break;
-            pci_conf_write16(seg, bus, slot, func,
-                             msix_control_reg(entry->msi_attrib.pos),
-                             control | PCI_MSIX_FLAGS_MASKALL);
+            control |= PCI_MSIX_FLAGS_MASKALL;
             if ( pdev->msix->warned != domid )
             {
                 pdev->msix->warned = domid;
                 printk(XENLOG_G_WARNING
-                       "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
+                       "cannot mask IRQ %d: masking MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
                        desc->irq, domid, pdev->seg, pdev->bus,
                        PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
             }
-            break;
         }
-        /* fall through */
+        pci_conf_write16(seg, bus, slot, func,
+                         msix_control_reg(entry->msi_attrib.pos), control);
+        return flag;
     default:
         return 0;
     }
@@ -454,7 +476,8 @@ static int msi_get_mask_bit(const struct
                                 entry->msi.mpos) >>
                 entry->msi_attrib.entry_nr) & 1;
     case PCI_CAP_ID_MSIX:
-        if ( unlikely(!memory_decoded(entry->dev)) )
+        if ( unlikely(!msix_memory_decoded(entry->dev,
+                                           entry->msi_attrib.pos)) )
             break;
         return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
     }
@@ -775,16 +798,32 @@ static int msix_capability_init(struct p
 
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
-    msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+    /*
+     * Ensure MSI-X interrupts are masked during setup. Some devices require
+     * MSI-X to be enabled before we can touch the MSI-X registers. We need
+     * to mask all the vectors to prevent interrupts coming in before they're
+     * fully set up.
+     */
+    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                     control | (PCI_MSIX_FLAGS_ENABLE |
+                                PCI_MSIX_FLAGS_MASKALL));
 
     if ( unlikely(!memory_decoded(dev)) )
+    {
+        pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                         control & ~PCI_MSIX_FLAGS_ENABLE);
         return -ENXIO;
+    }
 
     if ( desc )
     {
         entry = alloc_msi_entry(1);
         if ( !entry )
+        {
+            pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                             control & ~PCI_MSIX_FLAGS_ENABLE);
             return -ENOMEM;
+        }
         ASSERT(msi);
     }
 
@@ -815,6 +854,8 @@ static int msix_capability_init(struct p
     {
         if ( !msi || !msi->table_base )
         {
+            pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                             control & ~PCI_MSIX_FLAGS_ENABLE);
             xfree(entry);
             return -ENXIO;
         }
@@ -857,6 +898,8 @@ static int msix_capability_init(struct p
 
         if ( idx < 0 )
         {
+            pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                             control & ~PCI_MSIX_FLAGS_ENABLE);
             xfree(entry);
             return idx;
         }
@@ -912,8 +955,7 @@ static int msix_capability_init(struct p
     ++msix->used_entries;
 
     /* Restore MSI-X enabled bits */
-    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
-                     control & ~PCI_MSIX_FLAGS_MASKALL);
+    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
     return 0;
 }
@@ -1062,7 +1104,10 @@ static void __pci_disable_msix(struct ms
 
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
-    msix_set_enable(dev, 0);
+    if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
+        pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
+                         control | (PCI_MSIX_FLAGS_ENABLE |
+                                    PCI_MSIX_FLAGS_MASKALL));
 
     BUG_ON(list_empty(&dev->msi_list));
 
@@ -1076,6 +1121,7 @@ static void __pci_disable_msix(struct ms
                PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
         control |= PCI_MSIX_FLAGS_MASKALL;
     }
+
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
     _pci_cleanup_msix(dev->msix);
@@ -1188,6 +1234,8 @@ int pci_restore_msi_state(struct pci_dev
     list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
     {
         unsigned int i = 0, nr = 1;
+        u16 control = 0;
+        u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
 
         irq = entry->irq;
         desc = &irq_desc[irq];
@@ -1214,10 +1262,18 @@ int pci_restore_msi_state(struct pci_dev
         }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
         {
-            msix_set_enable(pdev, 0);
+            control = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
+                                      msix_control_reg(entry->msi_attrib.pos));
+            pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                             msix_control_reg(entry->msi_attrib.pos),
+                             control | (PCI_MSIX_FLAGS_ENABLE |
+                                        PCI_MSIX_FLAGS_MASKALL));
             if ( unlikely(!memory_decoded(pdev)) )
             {
                 spin_unlock_irqrestore(&desc->lock, flags);
+                pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                                 msix_control_reg(entry->msi_attrib.pos),
+                                 control & ~PCI_MSIX_FLAGS_ENABLE);
                 return -ENXIO;
             }
         }
@@ -1246,11 +1302,9 @@ int pci_restore_msi_state(struct pci_dev
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
         {
             unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
-            u16 control = pci_conf_read16(pdev->seg, pdev->bus,
-                                          PCI_SLOT(pdev->devfn),
-                                          PCI_FUNC(pdev->devfn), cpos);
 
-            control &= ~PCI_MSI_FLAGS_QSIZE;
+            control = pci_conf_read16(pdev->seg, pdev->bus, slot, func, cpos) &
+                      ~PCI_MSI_FLAGS_QSIZE;
             multi_msi_enable(control, entry->msi.nvec);
             pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
                              PCI_FUNC(pdev->devfn), cpos, control);
@@ -1258,7 +1312,9 @@ int pci_restore_msi_state(struct pci_dev
             msi_set_enable(pdev, 1);
         }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
-            msix_set_enable(pdev, 1);
+            pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                             msix_control_reg(entry->msi_attrib.pos),
+                             control | PCI_MSIX_FLAGS_ENABLE);
     }
 
     return 0;



* [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore
  2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
  2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
  2015-03-10 16:28 ` [PATCH 2/4] x86/MSI-X: access MSI-X table only after having enabled MSI-X Jan Beulich
@ 2015-03-10 16:29 ` Jan Beulich
  2015-03-13 19:38   ` Andrew Cooper
  2015-03-10 16:29 ` [PATCH 4/4] x86/MSI-X: cleanup Jan Beulich
  2015-03-13 20:16 ` [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Andrew Cooper
  4 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2015-03-10 16:29 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

Rather than disabling and enabling MSI-X once per vector, do it just
once per device.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
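
A rough sketch of the reshaped flow, with mocked-up config accessors and
without the multi-vector MSI handling and error paths visible in the diff:
the control register is now written twice per device rather than around
every individual vector.

#include <stdint.h>
#include <stdio.h>

#define PCI_MSIX_FLAGS_ENABLE  0x8000
#define PCI_MSIX_FLAGS_MASKALL 0x4000

static uint16_t msix_control = PCI_MSIX_FLAGS_ENABLE;  /* simulated register */

static uint16_t conf_read16(void)      { return msix_control; }
static void conf_write16(uint16_t v)   { msix_control = v; }

static void restore_vector(unsigned int nr) { printf("restore vector %u\n", nr); }

static void restore_device(unsigned int nr_vectors)
{
    uint16_t control = conf_read16();
    unsigned int i;

    /* One switch to "enabled + all masked" for the whole device ... */
    conf_write16(control | PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
    for ( i = 0; i < nr_vectors; i++ )
        restore_vector(i);
    /* ... and one final write to re-enable with per-vector masks restored. */
    conf_write16(control | PCI_MSIX_FLAGS_ENABLE);
}

int main(void)
{
    restore_device(4);
    return 0;
}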

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -1218,6 +1218,9 @@ int pci_restore_msi_state(struct pci_dev
     struct msi_desc *entry, *tmp;
     struct irq_desc *desc;
     struct msi_msg msg;
+    u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
+    unsigned int type = 0, pos = 0;
+    u16 control = 0;
 
     ASSERT(spin_is_locked(&pcidevs_lock));
 
@@ -1234,8 +1237,6 @@ int pci_restore_msi_state(struct pci_dev
     list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
     {
         unsigned int i = 0, nr = 1;
-        u16 control = 0;
-        u8 slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
 
         irq = entry->irq;
         desc = &irq_desc[irq];
@@ -1252,31 +1253,38 @@ int pci_restore_msi_state(struct pci_dev
                     pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
                     PCI_FUNC(pdev->devfn), i);
             spin_unlock_irqrestore(&desc->lock, flags);
+            if ( type == PCI_CAP_ID_MSIX )
+                pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                                 msix_control_reg(pos),
+                                 control & ~PCI_MSIX_FLAGS_ENABLE);
             return -EINVAL;
         }
 
+        ASSERT(!type || type == entry->msi_attrib.type);
+        pos = entry->msi_attrib.pos;
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
         {
             msi_set_enable(pdev, 0);
             nr = entry->msi.nvec;
         }
-        else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
+        else if ( !type && entry->msi_attrib.type == PCI_CAP_ID_MSIX )
         {
             control = pci_conf_read16(pdev->seg, pdev->bus, slot, func,
-                                      msix_control_reg(entry->msi_attrib.pos));
+                                      msix_control_reg(pos));
             pci_conf_write16(pdev->seg, pdev->bus, slot, func,
-                             msix_control_reg(entry->msi_attrib.pos),
+                             msix_control_reg(pos),
                              control | (PCI_MSIX_FLAGS_ENABLE |
                                         PCI_MSIX_FLAGS_MASKALL));
             if ( unlikely(!memory_decoded(pdev)) )
             {
                 spin_unlock_irqrestore(&desc->lock, flags);
                 pci_conf_write16(pdev->seg, pdev->bus, slot, func,
-                                 msix_control_reg(entry->msi_attrib.pos),
+                                 msix_control_reg(pos),
                                  control & ~PCI_MSIX_FLAGS_ENABLE);
                 return -ENXIO;
             }
         }
+        type = entry->msi_attrib.type;
 
         msg = entry->msg;
         write_msi_msg(entry, &msg);
@@ -1299,9 +1307,9 @@ int pci_restore_msi_state(struct pci_dev
 
         spin_unlock_irqrestore(&desc->lock, flags);
 
-        if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+        if ( type == PCI_CAP_ID_MSI )
         {
-            unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
+            unsigned int cpos = msi_control_reg(pos);
 
             control = pci_conf_read16(pdev->seg, pdev->bus, slot, func, cpos) &
                       ~PCI_MSI_FLAGS_QSIZE;
@@ -1311,12 +1319,13 @@ int pci_restore_msi_state(struct pci_dev
 
             msi_set_enable(pdev, 1);
         }
-        else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
-            pci_conf_write16(pdev->seg, pdev->bus, slot, func,
-                             msix_control_reg(entry->msi_attrib.pos),
-                             control | PCI_MSIX_FLAGS_ENABLE);
     }
 
+    if ( type == PCI_CAP_ID_MSIX )
+        pci_conf_write16(pdev->seg, pdev->bus, slot, func,
+                         msix_control_reg(pos),
+                         control | PCI_MSIX_FLAGS_ENABLE);
+
     return 0;
 }
 




* [PATCH 4/4] x86/MSI-X: cleanup
  2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
                   ` (2 preceding siblings ...)
  2015-03-10 16:29 ` [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore Jan Beulich
@ 2015-03-10 16:29 ` Jan Beulich
  2015-03-13 19:56   ` Andrew Cooper
  2015-03-13 20:16 ` [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Andrew Cooper
  4 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2015-03-10 16:29 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Keir Fraser

- __pci_enable_msix() now checks that an MSI-X capability was actually
  found
- pass "pos" to msix_capability_init() as both callers already know it
  (and hence there's no need to re-obtain it)
- call __pci_disable_msi{,x}() directly instead of via
  pci_disable_msi() from __pci_enable_msi{x,}() state validation paths
- use msix_control_reg() instead of open coding it
- log message adjustments
- coding style corrections

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -35,6 +35,8 @@
 static s8 __read_mostly use_msi = -1;
 boolean_param("msi", use_msi);
 
+static void __pci_disable_msix(struct msi_desc *);
+
 /* bitmap indicate which fixed map is free */
 static DEFINE_SPINLOCK(msix_fixmap_lock);
 static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
@@ -167,12 +169,14 @@ void msi_compose_msg(unsigned vector, co
     unsigned dest;
 
     memset(msg, 0, sizeof(*msg));
-    if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) {
+    if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
+    {
         dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__);
         return;
     }
 
-    if ( vector ) {
+    if ( vector )
+    {
         cpumask_t *mask = this_cpu(scratch_mask);
 
         cpumask_and(mask, cpu_mask, &cpu_online_map);
@@ -233,8 +237,7 @@ static bool_t read_msi_msg(struct msi_de
     }
     case PCI_CAP_ID_MSIX:
     {
-        void __iomem *base;
-        base = entry->mask_base;
+        void __iomem *base = entry->mask_base;
 
         if ( unlikely(!msix_memory_decoded(entry->dev,
                                            entry->msi_attrib.pos)) )
@@ -300,8 +303,7 @@ static int write_msi_msg(struct msi_desc
     }
     case PCI_CAP_ID_MSIX:
     {
-        void __iomem *base;
-        base = entry->mask_base;
+        void __iomem *base = entry->mask_base;
 
         if ( unlikely(!msix_memory_decoded(entry->dev,
                                            entry->msi_attrib.pos)) )
@@ -327,7 +329,7 @@ void set_msi_affinity(struct irq_desc *d
     struct msi_desc *msi_desc = desc->msi_desc;
 
     dest = set_desc_affinity(desc, mask);
-    if (dest == BAD_APICID || !msi_desc)
+    if ( dest == BAD_APICID || !msi_desc )
         return;
 
     ASSERT(spin_is_locked(&desc->lock));
@@ -379,11 +381,11 @@ static void msix_set_enable(struct pci_d
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     if ( pos )
     {
-        control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS);
+        control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
         control &= ~PCI_MSIX_FLAGS_ENABLE;
         if ( enable )
             control |= PCI_MSIX_FLAGS_ENABLE;
-        pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+        pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
     }
 }
 
@@ -408,9 +410,11 @@ static bool_t msi_set_mask_bit(struct ir
     bus = pdev->bus;
     slot = PCI_SLOT(pdev->devfn);
     func = PCI_FUNC(pdev->devfn);
-    switch (entry->msi_attrib.type) {
+    switch ( entry->msi_attrib.type )
+    {
     case PCI_CAP_ID_MSI:
-        if (entry->msi_attrib.maskbit) {
+        if ( entry->msi_attrib.maskbit )
+        {
             u32 mask_bits;
 
             mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
@@ -778,13 +782,14 @@ static u64 read_pci_mem_bar(u16 seg, u8 
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
+                                unsigned int pos,
                                 struct msi_info *msi,
                                 struct msi_desc **desc,
                                 unsigned int nr_entries)
 {
     struct arch_msix *msix = dev->msix;
     struct msi_desc *entry = NULL;
-    int pos, vf;
+    int vf;
     u16 control;
     u64 table_paddr;
     u32 table_offset;
@@ -796,7 +801,6 @@ static int msix_capability_init(struct p
 
     ASSERT(spin_is_locked(&pcidevs_lock));
 
-    pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
     /*
      * Ensure MSI-X interrupts are masked during setup. Some devices require
@@ -984,10 +988,9 @@ static int __pci_enable_msi(struct msi_i
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on "
-                "device %04x:%02x:%02x.%01x\n",
-                msi->irq, msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+               msi->irq, msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         *desc = old_desc;
         return 0;
     }
@@ -995,10 +998,10 @@ static int __pci_enable_msi(struct msi_i
     old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "MSI-X is already in use on "
-                "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
-        pci_disable_msi(old_desc);
+        printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n",
+               msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        __pci_disable_msix(old_desc);
     }
 
     return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
@@ -1012,7 +1015,6 @@ static void __pci_disable_msi(struct msi
     msi_set_enable(dev, 0);
 
     BUG_ON(list_empty(&dev->msi_list));
-
 }
 
 /**
@@ -1032,7 +1034,7 @@ static void __pci_disable_msi(struct msi
  **/
 static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
 {
-    int status, pos, nr_entries;
+    int pos, nr_entries;
     struct pci_dev *pdev;
     u16 control;
     u8 slot = PCI_SLOT(msi->devfn);
@@ -1041,23 +1043,22 @@ static int __pci_enable_msix(struct msi_
 
     ASSERT(spin_is_locked(&pcidevs_lock));
     pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
-    if ( !pdev )
+    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !pdev || !pos )
         return -ENODEV;
 
-    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(msi->seg, msi->bus, slot, func,
                               msix_control_reg(pos));
     nr_entries = multi_msix_capable(control);
-    if (msi->entry_nr >= nr_entries)
+    if ( msi->entry_nr >= nr_entries )
         return -EINVAL;
 
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on "
-                "device %04x:%02x:%02x.%01x\n",
-                msi->irq, msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+               msi->irq, msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         *desc = old_desc;
         return 0;
     }
@@ -1065,15 +1066,13 @@ static int __pci_enable_msix(struct msi_
     old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "MSI is already in use on "
-                "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
-        pci_disable_msi(old_desc);
-
+        printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n",
+               msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        __pci_disable_msi(old_desc);
     }
 
-    status = msix_capability_init(pdev, msi, desc, nr_entries);
-    return status;
+    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
 }
 
 static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1091,19 +1090,15 @@ static void _pci_cleanup_msix(struct arc
 
 static void __pci_disable_msix(struct msi_desc *entry)
 {
-    struct pci_dev *dev;
-    int pos;
-    u16 control, seg;
-    u8 bus, slot, func;
-
-    dev = entry->dev;
-    seg = dev->seg;
-    bus = dev->bus;
-    slot = PCI_SLOT(dev->devfn);
-    func = PCI_FUNC(dev->devfn);
-
-    pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
-    control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
+    struct pci_dev *dev = entry->dev;
+    u16 seg = dev->seg;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+    u16 control = pci_conf_read16(seg, bus, slot, func,
+                                  msix_control_reg(entry->msi_attrib.pos));
     if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
         pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
                          control | (PCI_MSIX_FLAGS_ENABLE |
@@ -1157,7 +1152,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
         u16 control = pci_conf_read16(seg, bus, slot, func,
                                       msix_control_reg(pos));
 
-        rc = msix_capability_init(pdev, NULL, NULL,
+        rc = msix_capability_init(pdev, pos, NULL, NULL,
                                   multi_msix_capable(control));
     }
     spin_unlock(&pcidevs_lock);
@@ -1176,8 +1171,8 @@ int pci_enable_msi(struct msi_info *msi,
     if ( !use_msi )
         return -EPERM;
 
-    return  msi->table_base ? __pci_enable_msix(msi, desc) :
-        __pci_enable_msi(msi, desc);
+    return msi->table_base ? __pci_enable_msix(msi, desc) :
+                             __pci_enable_msi(msi, desc);
 }
 
 /*
@@ -1230,7 +1225,9 @@ int pci_restore_msi_state(struct pci_dev
     if ( !pdev )
         return -EINVAL;
 
-    ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
+    ret = xsm_resource_setup_pci(XSM_PRIV,
+                                (pdev->seg << 16) | (pdev->bus << 8) |
+                                pdev->devfn);
     if ( ret )
         return ret;
 



[-- Attachment #2: x86-MSI-X-cleanup.patch --]
[-- Type: text/plain, Size: 10684 bytes --]

x86/MSI-X: cleanup

- __pci_enable_msix() now checks that an MSI-X capability was actually
  found
- pass "pos" to msix_capability_init() as both callers already know it
  (and hence there's no need to re-obtain it)
- call __pci_disable_msi{,x}() directly instead of via
  pci_disable_msi() from __pci_enable_msi{x,}() state validation paths
- use msix_control_reg() instead of open coding it
- log message adjustments
- coding style corrections

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -35,6 +35,8 @@
 static s8 __read_mostly use_msi = -1;
 boolean_param("msi", use_msi);
 
+static void __pci_disable_msix(struct msi_desc *);
+
 /* bitmap indicate which fixed map is free */
 static DEFINE_SPINLOCK(msix_fixmap_lock);
 static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
@@ -167,12 +169,14 @@ void msi_compose_msg(unsigned vector, co
     unsigned dest;
 
     memset(msg, 0, sizeof(*msg));
-    if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) {
+    if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
+    {
         dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__);
         return;
     }
 
-    if ( vector ) {
+    if ( vector )
+    {
         cpumask_t *mask = this_cpu(scratch_mask);
 
         cpumask_and(mask, cpu_mask, &cpu_online_map);
@@ -233,8 +237,7 @@ static bool_t read_msi_msg(struct msi_de
     }
     case PCI_CAP_ID_MSIX:
     {
-        void __iomem *base;
-        base = entry->mask_base;
+        void __iomem *base = entry->mask_base;
 
         if ( unlikely(!msix_memory_decoded(entry->dev,
                                            entry->msi_attrib.pos)) )
@@ -300,8 +303,7 @@ static int write_msi_msg(struct msi_desc
     }
     case PCI_CAP_ID_MSIX:
     {
-        void __iomem *base;
-        base = entry->mask_base;
+        void __iomem *base = entry->mask_base;
 
         if ( unlikely(!msix_memory_decoded(entry->dev,
                                            entry->msi_attrib.pos)) )
@@ -327,7 +329,7 @@ void set_msi_affinity(struct irq_desc *d
     struct msi_desc *msi_desc = desc->msi_desc;
 
     dest = set_desc_affinity(desc, mask);
-    if (dest == BAD_APICID || !msi_desc)
+    if ( dest == BAD_APICID || !msi_desc )
         return;
 
     ASSERT(spin_is_locked(&desc->lock));
@@ -379,11 +381,11 @@ static void msix_set_enable(struct pci_d
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     if ( pos )
     {
-        control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS);
+        control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
         control &= ~PCI_MSIX_FLAGS_ENABLE;
         if ( enable )
             control |= PCI_MSIX_FLAGS_ENABLE;
-        pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control);
+        pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
     }
 }
 
@@ -408,9 +410,11 @@ static bool_t msi_set_mask_bit(struct ir
     bus = pdev->bus;
     slot = PCI_SLOT(pdev->devfn);
     func = PCI_FUNC(pdev->devfn);
-    switch (entry->msi_attrib.type) {
+    switch ( entry->msi_attrib.type )
+    {
     case PCI_CAP_ID_MSI:
-        if (entry->msi_attrib.maskbit) {
+        if ( entry->msi_attrib.maskbit )
+        {
             u32 mask_bits;
 
             mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
@@ -778,13 +782,14 @@ static u64 read_pci_mem_bar(u16 seg, u8 
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
+                                unsigned int pos,
                                 struct msi_info *msi,
                                 struct msi_desc **desc,
                                 unsigned int nr_entries)
 {
     struct arch_msix *msix = dev->msix;
     struct msi_desc *entry = NULL;
-    int pos, vf;
+    int vf;
     u16 control;
     u64 table_paddr;
     u32 table_offset;
@@ -796,7 +801,6 @@ static int msix_capability_init(struct p
 
     ASSERT(spin_is_locked(&pcidevs_lock));
 
-    pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
     /*
      * Ensure MSI-X interrupts are masked during setup. Some devices require
@@ -984,10 +988,9 @@ static int __pci_enable_msi(struct msi_i
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on "
-                "device %04x:%02x:%02x.%01x\n",
-                msi->irq, msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        printk(XENLOG_WARNING "irq %d already mapped to MSI on %04x:%02x:%02x.%u\n",
+               msi->irq, msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         *desc = old_desc;
         return 0;
     }
@@ -995,10 +998,10 @@ static int __pci_enable_msi(struct msi_i
     old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "MSI-X is already in use on "
-                "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
-        pci_disable_msi(old_desc);
+        printk(XENLOG_WARNING "MSI-X already in use on %04x:%02x:%02x.%u\n",
+               msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        __pci_disable_msix(old_desc);
     }
 
     return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
@@ -1012,7 +1015,6 @@ static void __pci_disable_msi(struct msi
     msi_set_enable(dev, 0);
 
     BUG_ON(list_empty(&dev->msi_list));
-
 }
 
 /**
@@ -1032,7 +1034,7 @@ static void __pci_disable_msi(struct msi
  **/
 static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
 {
-    int status, pos, nr_entries;
+    int pos, nr_entries;
     struct pci_dev *pdev;
     u16 control;
     u8 slot = PCI_SLOT(msi->devfn);
@@ -1041,23 +1043,22 @@ static int __pci_enable_msix(struct msi_
 
     ASSERT(spin_is_locked(&pcidevs_lock));
     pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
-    if ( !pdev )
+    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
+    if ( !pdev || !pos )
         return -ENODEV;
 
-    pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(msi->seg, msi->bus, slot, func,
                               msix_control_reg(pos));
     nr_entries = multi_msix_capable(control);
-    if (msi->entry_nr >= nr_entries)
+    if ( msi->entry_nr >= nr_entries )
         return -EINVAL;
 
     old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on "
-                "device %04x:%02x:%02x.%01x\n",
-                msi->irq, msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        printk(XENLOG_WARNING "irq %d already mapped to MSI-X on %04x:%02x:%02x.%u\n",
+               msi->irq, msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         *desc = old_desc;
         return 0;
     }
@@ -1065,15 +1066,13 @@ static int __pci_enable_msix(struct msi_
     old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI);
     if ( old_desc )
     {
-        dprintk(XENLOG_WARNING, "MSI is already in use on "
-                "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus,
-                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
-        pci_disable_msi(old_desc);
-
+        printk(XENLOG_WARNING "MSI already in use on %04x:%02x:%02x.%u\n",
+               msi->seg, msi->bus,
+               PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+        __pci_disable_msi(old_desc);
     }
 
-    status = msix_capability_init(pdev, msi, desc, nr_entries);
-    return status;
+    return msix_capability_init(pdev, pos, msi, desc, nr_entries);
 }
 
 static void _pci_cleanup_msix(struct arch_msix *msix)
@@ -1091,19 +1090,15 @@ static void _pci_cleanup_msix(struct arc
 
 static void __pci_disable_msix(struct msi_desc *entry)
 {
-    struct pci_dev *dev;
-    int pos;
-    u16 control, seg;
-    u8 bus, slot, func;
-
-    dev = entry->dev;
-    seg = dev->seg;
-    bus = dev->bus;
-    slot = PCI_SLOT(dev->devfn);
-    func = PCI_FUNC(dev->devfn);
-
-    pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
-    control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
+    struct pci_dev *dev = entry->dev;
+    u16 seg = dev->seg;
+    u8 bus = dev->bus;
+    u8 slot = PCI_SLOT(dev->devfn);
+    u8 func = PCI_FUNC(dev->devfn);
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+    u16 control = pci_conf_read16(seg, bus, slot, func,
+                                  msix_control_reg(entry->msi_attrib.pos));
     if ( unlikely(!(control & PCI_MSIX_FLAGS_ENABLE)) )
         pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
                          control | (PCI_MSIX_FLAGS_ENABLE |
@@ -1157,7 +1152,7 @@ int pci_prepare_msix(u16 seg, u8 bus, u8
         u16 control = pci_conf_read16(seg, bus, slot, func,
                                       msix_control_reg(pos));
 
-        rc = msix_capability_init(pdev, NULL, NULL,
+        rc = msix_capability_init(pdev, pos, NULL, NULL,
                                   multi_msix_capable(control));
     }
     spin_unlock(&pcidevs_lock);
@@ -1176,8 +1171,8 @@ int pci_enable_msi(struct msi_info *msi,
     if ( !use_msi )
         return -EPERM;
 
-    return  msi->table_base ? __pci_enable_msix(msi, desc) :
-        __pci_enable_msi(msi, desc);
+    return msi->table_base ? __pci_enable_msix(msi, desc) :
+                             __pci_enable_msi(msi, desc);
 }
 
 /*
@@ -1230,7 +1225,9 @@ int pci_restore_msi_state(struct pci_dev
     if ( !pdev )
         return -EINVAL;
 
-    ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn);
+    ret = xsm_resource_setup_pci(XSM_PRIV,
+                                (pdev->seg << 16) | (pdev->bus << 8) |
+                                pdev->devfn);
     if ( ret )
         return ret;
 


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] x86/MSI-X: be more careful during teardown
  2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
@ 2015-03-10 21:05   ` Andrew Cooper
  2015-03-11  8:22     ` Jan Beulich
  2015-03-13 18:10   ` Andrew Cooper
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2015-03-10 21:05 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/03/15 16:27, Jan Beulich wrote:
> When a device gets detached from a guest, pciback will clear its
> command register, thus disabling both memory and I/O decoding. The
> disabled memory decoding, however, has an effect on the MSI-X table
> accesses the hypervisor does: These won't have the intended effect
> anymore. Even worse, for PCIe devices (but not SR-IOV virtual
> functions) such accesses may (will?) be treated as Unsupported
> Requests, causing respective errors to be surfaced, potentially in the
> form of NMIs that may be fatal to the hypervisor or Dom0 is different
> ways. Hence rather than carrying out these accesses, we should avoid
> them where we can, and use alternative (e.g. PCI config space based)
> mechanisms to achieve at least the same effect.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Backporting note (largely to myself):
>    Depends on (not yet backported) commit 061eebe0e "x86/MSI: drop
>    workaround for insecure Dom0 kernels" (due to re-use of struct
>    arch_msix's warned field).
>
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
>      spin_unlock(&msix->table_lock);
>  }
>  
> +static bool_t memory_decoded(const struct pci_dev *dev)
> +{
> +    u8 bus, slot, func;
> +
> +    if ( !dev->info.is_virtfn )
> +    {
> +        bus = dev->bus;
> +        slot = PCI_SLOT(dev->devfn);
> +        func = PCI_FUNC(dev->devfn);
> +    }
> +    else
> +    {
> +        bus = dev->info.physfn.bus;
> +        slot = PCI_SLOT(dev->info.physfn.devfn);
> +        func = PCI_FUNC(dev->info.physfn.devfn);
> +    }
> +
> +    return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
> +              PCI_COMMAND_MEMORY);
> +}
> +

This check is racy against anyone who can write to the command register,
which includes dom0 and other pcpus in Xen.  There does not appear to be
any exclusion between Xen emulating a control register write on one cpu
and changing irq affinities on another.

As a result, this check does not actually protect against accessing the
MSI-X bar while memory decoding is disabled.  As a downside, it puts an
expensive config space access on moderately frequent codepaths.

One issue we have just identified pertains to dom0 resetting a device
and Xen falling over a UR which has been escalated to fatal, most likely
because of an in-progress MSI-X interrupt migration.  There does not appear
to be sufficient synchronisation available in the interface for a dom0
to even cooperatively perform a device reset with Xen.

The more I consider this and related problems, the more I am thinking
that the only long-term solution is to have a full PCI implementation in
Xen, and to prevent any unmediated access, including from dom0.  Xen
need not gain much (any?) more device-specific knowledge, but needs to
gain the ability to properly mediate all config updates, and synchronise
resets against other users of the device.  I do not suggest this
lightly; I realise that it is a huge quantity of work.

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] x86/MSI-X: be more careful during teardown
  2015-03-10 21:05   ` Andrew Cooper
@ 2015-03-11  8:22     ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-11  8:22 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 10.03.15 at 22:05, <andrew.cooper3@citrix.com> wrote:
> On 10/03/15 16:27, Jan Beulich wrote:
>> When a device gets detached from a guest, pciback will clear its
>> command register, thus disabling both memory and I/O decoding. The
>> disabled memory decoding, however, has an effect on the MSI-X table
>> accesses the hypervisor does: These won't have the intended effect
>> anymore. Even worse, for PCIe devices (but not SR-IOV virtual
>> functions) such accesses may (will?) be treated as Unsupported
>> Requests, causing respective errors to be surfaced, potentially in the
>> form of NMIs that may be fatal to the hypervisor or Dom0 is different
>> ways. Hence rather than carrying out these accesses, we should avoid
>> them where we can, and use alternative (e.g. PCI config space based)
>> mechanisms to achieve at least the same effect.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> Backporting note (largely to myself):
>>    Depends on (not yet backported) commit 061eebe0e "x86/MSI: drop
>>    workaround for insecure Dom0 kernels" (due to re-use of struct
>>    arch_msix's warned field).
>>
>> --- a/xen/arch/x86/msi.c
>> +++ b/xen/arch/x86/msi.c
>> @@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
>>      spin_unlock(&msix->table_lock);
>>  }
>>  
>> +static bool_t memory_decoded(const struct pci_dev *dev)
>> +{
>> +    u8 bus, slot, func;
>> +
>> +    if ( !dev->info.is_virtfn )
>> +    {
>> +        bus = dev->bus;
>> +        slot = PCI_SLOT(dev->devfn);
>> +        func = PCI_FUNC(dev->devfn);
>> +    }
>> +    else
>> +    {
>> +        bus = dev->info.physfn.bus;
>> +        slot = PCI_SLOT(dev->info.physfn.devfn);
>> +        func = PCI_FUNC(dev->info.physfn.devfn);
>> +    }
>> +
>> +    return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
>> +              PCI_COMMAND_MEMORY);
>> +}
>> +
> 
> This check is racy against anyone who can write to the command register,
> which includes dom0 and other pcpus in Xen.  There does not appear to be
> any exclusion between Xen emulating a control register write on one cpu
> and changing irq affinities on another.

But - as in many other contexts - we assume Dom0 to not be hostile,
and hence to not disable memory decoding on a device out of the
blue. Xen itself certainly doesn't do so at runtime at all.

> As a result, this check does not actually protect against accessing the
> MSI-X bar while memory decoding is disabled.  As a downside, it puts an
> expensive config space access on moderately frequent codepaths.
> 
> One issue we have just identified pertains to dom0 resetting a device
> and Xen falling over a UR which has been escalated to fatal, most likely
> because an in-progress MSI-X interrupt migration.

Definitely not. There are no interrupts active on a device at that
point anymore (or else it would have been a bug for Dom0 to
disable memory decoding on it). The observed UR is due to Xen's
own (largely redundant) cleanup of the guest pIRQ upon cleaning
up after the guest's death.

>  There does not appear
> to be sufficient synchronisation available in the interface for a dom0
> to even cooperatively perform a device reset with Xen.
> 
> The more I consider this and related problems, the more I am thinking
> that the only longterm solution is to have a full PCI implementation in
> Xen, and to prevent any unmediated access, including from dom0.  Xen
> need not gain much (any?) more device-specific knowledge, but needs to
> gain the ability to properly mediate all config updates, and synchronise
> resets against other users of the device.  I do not suggest this
> lightly; I realise that it is a huge quantity of work.

Indeed. Plus it would likely break existing Dom0 (all of a sudden
needing to use hypercalls instead of direct access), unless we want
to get into the business of emulating MMCFG accesses.

Anyway - what does all of this mean for the presented patch series?
We've got a problem to fix, and as I'm realizing that the approach
taken may be controversial I'm certainly up for alternative
suggestions, but not ones requiring an overhaul of a much larger
portion of code than just the MSI part. And I'd certainly be
hesitant to carry patches like the ones here in our own repos
without being upstream for any meaningful period of time.

As an aside - Linux doesn't do any synchronization or mediation of
(privileged) user space invoked config space accesses either. I.e.
from a Linux Dom0 pov the problem would likely extend into the
linux kernel (then needing a similar modification).

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] x86/MSI-X: be more careful during teardown
  2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
  2015-03-10 21:05   ` Andrew Cooper
@ 2015-03-13 18:10   ` Andrew Cooper
  2015-03-16 11:03     ` Jan Beulich
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2015-03-13 18:10 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/03/15 16:27, Jan Beulich wrote:
> When a device gets detached from a guest, pciback will clear its
> command register, thus disabling both memory and I/O decoding. The
> disabled memory decoding, however, has an effect on the MSI-X table
> accesses the hypervisor does: These won't have the intended effect
> anymore. Even worse, for PCIe devices (but not SR-IOV virtual
> functions) such accesses may (will?) be treated as Unsupported
> Requests

Will, as the root port will issue a write and get no reply.

VF config space is part of a memory BAR on the PF, which itself will
still be valid for requests even if the VF has memory decoding disabled.

On the other hand, if the PF has memory decoding disabled, I expect any
VF config access to result in a UR.

> , causing respective errors to be surfaced, potentially in the
> form of NMIs that may be fatal to the hypervisor or Dom0 is different
> ways. Hence rather than carrying out these accesses, we should avoid
> them where we can, and use alternative (e.g. PCI config space based)
> mechanisms to achieve at least the same effect.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> Backporting note (largely to myself):
>    Depends on (not yet backported) commit 061eebe0e "x86/MSI: drop
>    workaround for insecure Dom0 kernels" (due to re-use of struct
>    arch_msix's warned field).
>
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -121,6 +121,27 @@ static void msix_put_fixmap(struct arch_
>      spin_unlock(&msix->table_lock);
>  }
>  
> +static bool_t memory_decoded(const struct pci_dev *dev)
> +{
> +    u8 bus, slot, func;
> +
> +    if ( !dev->info.is_virtfn )
> +    {
> +        bus = dev->bus;
> +        slot = PCI_SLOT(dev->devfn);
> +        func = PCI_FUNC(dev->devfn);
> +    }
> +    else
> +    {
> +        bus = dev->info.physfn.bus;
> +        slot = PCI_SLOT(dev->info.physfn.devfn);
> +        func = PCI_FUNC(dev->info.physfn.devfn);
> +    }
> +
> +    return !!(pci_conf_read16(dev->seg, bus, slot, func, PCI_COMMAND) &
> +              PCI_COMMAND_MEMORY);
> +}
> +
>  /*
>   * MSI message composition
>   */
> @@ -162,7 +183,7 @@ void msi_compose_msg(unsigned vector, co
>      }
>  }
>  
> -static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> +static bool_t read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
>  {
>      switch ( entry->msi_attrib.type )
>      {
> @@ -198,6 +219,8 @@ static void read_msi_msg(struct msi_desc
>          void __iomem *base;
>          base = entry->mask_base;
>  
> +        if ( unlikely(!memory_decoded(entry->dev)) )
> +            return 0;
>          msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
>          msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
>          msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET);
> @@ -209,6 +232,8 @@ static void read_msi_msg(struct msi_desc
>  
>      if ( iommu_intremap )
>          iommu_read_msi_from_ire(entry, msg);
> +
> +    return 1;
>  }
>  
>  static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
> @@ -260,6 +285,8 @@ static int write_msi_msg(struct msi_desc
>          void __iomem *base;
>          base = entry->mask_base;
>  
> +        if ( unlikely(!memory_decoded(entry->dev)) )
> +            return -ENXIO;
>          writel(msg->address_lo,
>                 base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
>          writel(msg->address_hi,
> @@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d
>      ASSERT(spin_is_locked(&desc->lock));
>  
>      memset(&msg, 0, sizeof(msg));
> -    read_msi_msg(msi_desc, &msg);
> +    if ( !read_msi_msg(msi_desc, &msg) )
> +        return;
>  
>      msg.data &= ~MSI_DATA_VECTOR_MASK;
>      msg.data |= MSI_DATA_VECTOR(desc->arch.vector);

Assuming a non-racy setup, this hunk makes the memory decode check in
write_msi_msg() redundant.  I can't, however, think of a neat way to elide
the second access, because all other callers of write_msi_msg() need it.

> @@ -347,20 +375,24 @@ int msi_maskable_irq(const struct msi_de
>             || entry->msi_attrib.maskbit;
>  }
>  
> -static void msi_set_mask_bit(struct irq_desc *desc, int flag)
> +static bool_t msi_set_mask_bit(struct irq_desc *desc, int flag)
>  {
>      struct msi_desc *entry = desc->msi_desc;
> +    struct pci_dev *pdev;
> +    u16 seg;
> +    u8 bus, slot, func;
>  
>      ASSERT(spin_is_locked(&desc->lock));
>      BUG_ON(!entry || !entry->dev);
> +    pdev = entry->dev;
> +    seg = pdev->seg;
> +    bus = pdev->bus;
> +    slot = PCI_SLOT(pdev->devfn);
> +    func = PCI_FUNC(pdev->devfn);
>      switch (entry->msi_attrib.type) {
>      case PCI_CAP_ID_MSI:
>          if (entry->msi_attrib.maskbit) {
>              u32 mask_bits;
> -            u16 seg = entry->dev->seg;
> -            u8 bus = entry->dev->bus;
> -            u8 slot = PCI_SLOT(entry->dev->devfn);
> -            u8 func = PCI_FUNC(entry->dev->devfn);
>  
>              mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
>              mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
> @@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_
>          }
>          break;
>      case PCI_CAP_ID_MSIX:
> -    {
> -        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> -        writel(flag, entry->mask_base + offset);
> -        readl(entry->mask_base + offset);
> -        break;
> -    }
> +        if ( likely(memory_decoded(pdev)) )
> +        {
> +            writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> +            readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> +            break;
> +        }
> +        if ( flag )
> +        {
> +            u16 control;
> +            domid_t domid = pdev->domain->domain_id;
> +
> +            control = pci_conf_read16(seg, bus, slot, func,
> +                                      msix_control_reg(entry->msi_attrib.pos));
> +            if ( control & PCI_MSIX_FLAGS_MASKALL )
> +                break;
> +            pci_conf_write16(seg, bus, slot, func,
> +                             msix_control_reg(entry->msi_attrib.pos),
> +                             control | PCI_MSIX_FLAGS_MASKALL);
> +            if ( pdev->msix->warned != domid )
> +            {
> +                pdev->msix->warned = domid;
> +                printk(XENLOG_G_WARNING
> +                       "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",

"masked all MSI-X" to make it slightly clearer that the global mask bit
has been hit.

> +                       desc->irq, domid, pdev->seg, pdev->bus,
> +                       PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
> +            }
> +            break;
> +        }
> +        /* fall through */
>      default:
> -        BUG();
> -        break;
> +        return 0;
>      }
>      entry->msi_attrib.masked = !!flag;
> +
> +    return 1;
>  }
>  
>  static int msi_get_mask_bit(const struct msi_desc *entry)
>  {
> -    switch (entry->msi_attrib.type) {
> +    if ( !entry->dev )
> +        return -1;
> +
> +    switch ( entry->msi_attrib.type )
> +    {
>      case PCI_CAP_ID_MSI:
> -        if (!entry->dev || !entry->msi_attrib.maskbit)
> +        if ( !entry->msi_attrib.maskbit )
>              break;
>          return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
>                                  PCI_SLOT(entry->dev->devfn),
> @@ -394,6 +454,8 @@ static int msi_get_mask_bit(const struct
>                                  entry->msi.mpos) >>
>                  entry->msi_attrib.entry_nr) & 1;
>      case PCI_CAP_ID_MSIX:
> +        if ( unlikely(!memory_decoded(entry->dev)) )
> +            break;
>          return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
>      }
>      return -1;
> @@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct
>  
>  void mask_msi_irq(struct irq_desc *desc)
>  {
> -    msi_set_mask_bit(desc, 1);
> +    if ( unlikely(!msi_set_mask_bit(desc, 1)) )
> +        BUG();

Previously, BUG() was only on a codepath controlled by the
msi_attrib.type, whereas now it also covers a failure to mask the
interrupt.

Is this wise, as it can be guest-influenced?

>  }
>  
>  void unmask_msi_irq(struct irq_desc *desc)
>  {
> -    msi_set_mask_bit(desc, 0);
> +    if ( unlikely(!msi_set_mask_bit(desc, 0)) )
> +        WARN();
>  }
>  
>  static unsigned int startup_msi_irq(struct irq_desc *desc)
> @@ -713,6 +777,9 @@ static int msix_capability_init(struct p
>      control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
>      msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
>  
> +    if ( unlikely(!memory_decoded(dev)) )
> +        return -ENXIO;
> +
>      if ( desc )
>      {
>          entry = alloc_msi_entry(1);
> @@ -845,7 +912,8 @@ static int msix_capability_init(struct p
>      ++msix->used_entries;
>  
>      /* Restore MSI-X enabled bits */
> -    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> +    pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos),
> +                     control & ~PCI_MSIX_FLAGS_MASKALL);
>  
>      return 0;
>  }
> @@ -998,8 +1066,16 @@ static void __pci_disable_msix(struct ms
>  
>      BUG_ON(list_empty(&dev->msi_list));
>  
> -    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> -
> +    if ( likely(memory_decoded(dev)) )
> +        writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> +    else if ( !(control & PCI_MSIX_FLAGS_MASKALL) )
> +    {
> +        printk(XENLOG_WARNING
> +               "cannot disable IRQ %d: masking MSI-X on %04x:%02x:%02x.%u\n",

"masking all"

~Andrew

> +               entry->irq, dev->seg, dev->bus,
> +               PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
> +        control |= PCI_MSIX_FLAGS_MASKALL;
> +    }
>      pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
>  
>      _pci_cleanup_msix(dev->msix);
> @@ -1137,14 +1213,23 @@ int pci_restore_msi_state(struct pci_dev
>              nr = entry->msi.nvec;
>          }
>          else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
> +        {
>              msix_set_enable(pdev, 0);
> +            if ( unlikely(!memory_decoded(pdev)) )
> +            {
> +                spin_unlock_irqrestore(&desc->lock, flags);
> +                return -ENXIO;
> +            }
> +        }
>  
>          msg = entry->msg;
>          write_msi_msg(entry, &msg);
>  
>          for ( i = 0; ; )
>          {
> -            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
> +            if ( unlikely(!msi_set_mask_bit(desc,
> +                                            entry[i].msi_attrib.masked)) )
> +                BUG();
>  
>              if ( !--nr )
>                  break;
>
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/4] x86/MSI-X: access MSI‑X table only after having enabled MSI‑X
  2015-03-10 16:28 ` [PATCH 2/4] x86/MSI-X: access MSI‑X table only after having enabled MSI‑X Jan Beulich
@ 2015-03-13 19:18   ` Andrew Cooper
  2015-03-16 11:13     ` Jan Beulich
  0 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2015-03-13 19:18 UTC (permalink / raw)
  To: xen-devel

On 10/03/15 16:28, Jan Beulich wrote:
> As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a
> better way") and its broken predecessor, make sure we don't access the
> MSI-X table without having enabled MSI-X first, using the mask-all flag
> instead to prevent interrupts from occurring.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

General qualms about this series aside, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

There is a spurious whitespace change and a spurious comment change
which you perhaps didn't mean to include.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore
  2015-03-10 16:29 ` [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore Jan Beulich
@ 2015-03-13 19:38   ` Andrew Cooper
  0 siblings, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2015-03-13 19:38 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/03/15 16:29, Jan Beulich wrote:
> Rather than disabling and enabling MSI-X once per vector, do it just
> once per device.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/4] x86/MSI-X: cleanup
  2015-03-10 16:29 ` [PATCH 4/4] x86/MSI-X: cleanup Jan Beulich
@ 2015-03-13 19:56   ` Andrew Cooper
  0 siblings, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2015-03-13 19:56 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/03/15 16:29, Jan Beulich wrote:
> - __pci_enable_msix() now checks that an MSI-X capability was actually
>   found
> - pass "pos" to msix_capability_init() as both callers already know it
>   (and hence there's no need to re-obtain it)
> - call __pci_disable_msi{,x}() directly instead of via
>   pci_disable_msi() from __pci_enable_msi{x,}() state validation paths

"__pci_enable_msi{x,}()'s state validation paths" ?

> - use msix_control_reg() instead of open coding it
> - log message adjustments
> - coding style corrections
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

It occurs to me that for further cleanup, it might be quite useful to
have a concrete sbdf type.

All this code does a huge amount of moving sbdf representations in and
out of a u32, and using 4 parameters instead of 1 to functions will
cause excessive register scheduling issues.  In addition, a custom %p
format identifier would help us move in the direction of consistent
representation (which we are currently a long way from).
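
As a purely illustrative sketch - none of these names exist in the
tree - such a type might look like:

    /* Hypothetical packed SBDF type: one 32-bit value instead of four
     * separate seg/bus/slot/func parameters. */
    typedef union {
        u32 raw;
        struct {
            u16 seg;
            u8  bus;
            u8  devfn;
        };
    } pci_sbdf_t;

    #define PCI_SBDF(s, b, df) \
        ((pci_sbdf_t){ .seg = (s), .bus = (b), .devfn = (df) })

Callers would then pass a single pci_sbdf_t (or its .raw form) instead
of four separate arguments.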

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/4] x86/MSI-X: XSA-120 follow-up
  2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
                   ` (3 preceding siblings ...)
  2015-03-10 16:29 ` [PATCH 4/4] x86/MSI-X: cleanup Jan Beulich
@ 2015-03-13 20:16 ` Andrew Cooper
  2015-03-16 15:38   ` Jan Beulich
  4 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2015-03-13 20:16 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 10/03/15 16:16, Jan Beulich wrote:
> The problem requiring the first patch here is actually what lead to
> XSA-120.
>
> 1: be more careful during teardown
> 2: access MSI-X table only after having enabled MSI-X
> 3: reduce fiddling with control register during restore
> 4: cleanup
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Having taken another look through I am still hesitant, but don't have a
concrete issue to object to.

This code is used extensively in the IDT vector migration codepath, and
in irq->ack() functions.  I still fear that the additional config space
accesses will come with a non-negligible overhead.

One issue, orthogonal to these changes, which XenServer has hit before
(a fairly long time ago) is a device failing and falling completely off
the PCI bus (the firmware helpfully ate the error).  At that point,
config cycles started returning ~0's, and I seem to recall that Xen
suffered a cascade failure as it started walking backwards along the
msi entry array.  The underlying problem here is Xen's reliance on
reading everything from config space, given the lack of exclusion
against dom0 having access.
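
For illustration only (the helper name is made up), the kind of guard
that could catch such a failed device before trusting further data:

    /* An all-ones read of the read-only vendor ID strongly suggests the
     * device no longer completes config cycles at all. */
    static bool_t pdev_accessible(u16 seg, u8 bus, u8 slot, u8 func)
    {
        return pci_conf_read16(seg, bus, slot, func, PCI_VENDOR_ID) != 0xffff;
    }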

Given these extra config accesses, I am seriously wondering whether it
would be more efficient overall to trap&emulate dom0 completely and
allow Xen to cache things like the current control register state,
locations of capability structures, etc.  I genuinely don't know the
answer, but I suspect the comparative expense of config accesses means
that we don't need to replace many of the lookups for a net benefit. 
(Of course, it doesn't protect against backdoor access, but all bets are
already off at that point.)
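
Purely as a sketch of the idea (no such structure exists today), the
cache being suggested might be as simple as:

    /* Per-device state a config-space mediation layer could maintain,
     * so hot paths never need to re-read config space. */
    struct pci_cfg_cache {
        u16 command;        /* last PCI_COMMAND value written */
        u16 msix_control;   /* last MSI-X control word written */
        u8  msi_pos;        /* capability offsets, discovered once */
        u8  msix_pos;
    };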

~Andrew

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/4] x86/MSI-X: be more careful during teardown
  2015-03-13 18:10   ` Andrew Cooper
@ 2015-03-16 11:03     ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-16 11:03 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 13.03.15 at 19:10, <andrew.cooper3@citrix.com> wrote:
> On 10/03/15 16:27, Jan Beulich wrote:
>> When a device gets detached from a guest, pciback will clear its
>> command register, thus disabling both memory and I/O decoding. The
>> disabled memory decoding, however, has an effect on the MSI-X table
>> accesses the hypervisor does: These won't have the intended effect
>> anymore. Even worse, for PCIe devices (but not SR-IOV virtual
>> functions) such accesses may (will?) be treated as Unsupported
>> Requests
> 
> Will, as the root port will issue a write and get no reply.

The questionable point here really is that of compatibility with
older bus architectures: On x86 (other than e.g. on ia64) the
CPU accessing _any_ physical memory would always be valid;
in the worst case reads would return all bits set and writes would
be dropped. With PCIe "violating" this, I could see platform designers
implementing workarounds resulting in the original behavior for
at least CPU side accesses.

> VF config space is part of a memory bar on the PF, which itself will
> still be valid for requests even if the VF has memory decoding disabled.

A spec compliant VF simply can't disable decoding, as the
respective command register bit is specified to be hardwired to
zero.

> On the other hand, if the PF has memory decoding disabled, I expect any
> VF config access to result in a UR.

Of course. But obviously passing through a PF to an untrusted
guest, allowing it to create VFs, and then passing through those
VFs onwards to other guests will render those other guests
insecure no matter what.

>> @@ -287,7 +314,8 @@ void set_msi_affinity(struct irq_desc *d
>>      ASSERT(spin_is_locked(&desc->lock));
>>  
>>      memset(&msg, 0, sizeof(msg));
>> -    read_msi_msg(msi_desc, &msg);
>> +    if ( !read_msi_msg(msi_desc, &msg) )
>> +        return;
>>  
>>      msg.data &= ~MSI_DATA_VECTOR_MASK;
>>      msg.data |= MSI_DATA_VECTOR(desc->arch.vector);
> 
> Assuming a non-racy setup, this hunk makes the memory decode check in
> write_msi_msg() redundant.  I cant however think of a neat way to elide
> the second access, because all other callers of write_msi_msg() need it.

And indeed I had looked at where such redundancy could be
avoided. As you say, avoiding it here would make the code
even more complicated, which I don't think we want.

>> @@ -369,24 +401,52 @@ static void msi_set_mask_bit(struct irq_
>>          }
>>          break;
>>      case PCI_CAP_ID_MSIX:
>> -    {
>> -        int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
>> -        writel(flag, entry->mask_base + offset);
>> -        readl(entry->mask_base + offset);
>> -        break;
>> -    }
>> +        if ( likely(memory_decoded(pdev)) )
>> +        {
>> +            writel(flag, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
>> +            readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
>> +            break;
>> +        }
>> +        if ( flag )
>> +        {
>> +            u16 control;
>> +            domid_t domid = pdev->domain->domain_id;
>> +
>> +            control = pci_conf_read16(seg, bus, slot, func,
>> +                                      msix_control_reg(entry->msi_attrib.pos));
>> +            if ( control & PCI_MSIX_FLAGS_MASKALL )
>> +                break;
>> +            pci_conf_write16(seg, bus, slot, func,
>> +                             msix_control_reg(entry->msi_attrib.pos),
>> +                             control | PCI_MSIX_FLAGS_MASKALL);
>> +            if ( pdev->msix->warned != domid )
>> +            {
>> +                pdev->msix->warned = domid;
>> +                printk(XENLOG_G_WARNING
>> +                       "cannot mask IRQ %d: masked MSI-X on Dom%d's %04x:%02x:%02x.%u\n",
> 
> "masked all MSI-X" to make it slightly clearer that the global mask bit
> has been hit.

I don't think we need to make the message any longer / more
verbose. For MSI-X it's pretty clear that by not naming a particular
entry, it concerns all of them.

>> @@ -401,12 +463,14 @@ static int msi_get_mask_bit(const struct
>>  
>>  void mask_msi_irq(struct irq_desc *desc)
>>  {
>> -    msi_set_mask_bit(desc, 1);
>> +    if ( unlikely(!msi_set_mask_bit(desc, 1)) )
>> +        BUG();
> 
> Previously, BUG() was only on a codepath controlled by the
> msi_attrib.type, whereas now it includes failures based on failure to
> mask an issue.
> 
> Is this wise, as it can be guest-influenced?

Can it really be?

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/4] x86/MSI-X: access MSI‑X table only after having enabled MSI‑X
  2015-03-13 19:18   ` Andrew Cooper
@ 2015-03-16 11:13     ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-16 11:13 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 13.03.15 at 20:18, <andrew.cooper3@citrix.com> wrote:
> On 10/03/15 16:28, Jan Beulich wrote:
>> As done in Linux by f598282f51 ("PCI: Fix the NIU MSI-X problem in a
>> better way") and its broken predecessor, make sure we don't access the
>> MSI-X table without having enabled MSI-X first, using the mask-all flag
>> instead to prevent interrupts from occurring.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> General quarms of this series aside, Reviewed-by: Andrew Cooper
> <andrew.cooper3@citrix.com>
> 
> There is a spurious whitespace change and a spurious comment change
> which you perhaps didn't mean to include.

I was able to find the former, but I don't see the latter. There are
two comment changes in total: in one case, a "fall through" one gets
intentionally dropped, and in the other the comment gets extended
quite a bit, again intentionally.

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/4] x86/MSI-X: XSA-120 follow-up
  2015-03-13 20:16 ` [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Andrew Cooper
@ 2015-03-16 15:38   ` Jan Beulich
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2015-03-16 15:38 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser

>>> On 13.03.15 at 21:16, <andrew.cooper3@citrix.com> wrote:
> Given these extra config accesses, I am seriously wondering whether it
> would be more efficient overall to trap&emulate dom0 completely and
> allow Xen to cache things like the current control register state,
> locations of capability structures, etc.  I genuinely don't know the
> answer, but I suspect the comparative expense of config accesses means
> that we don't need to replace many of the lookups for a net benefit. 

I would assume this depends on the access method: If Dom0 uses
port I/O (via ports CF8 and CFC), then the overhead needed for
caching would be quite low, and the original access is comparably
slow already anyway. If however we talk of MMCFG accesses, then
I'd expect these to be faster, and Dom0 right now has direct access.
I.e. emulation would first of all involve adding #PF interception and
instruction decoding, and hence the net loss in performance would
be quite a bit higher. Plus we'd all of a sudden make it a
requirement that Dom0 let us know about the usability of MMCFG
regions - right now this is optional, i.e. Dom0 might use MMCFG
while Xen doesn't and hence also would have no handle for
intercepting writes.
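
For reference, a minimal sketch of the legacy mechanism being referred
to (function name hypothetical):

    /* Config mechanism #1: address to port 0xCF8, data via port 0xCFC.
     * Every access is two port operations, so it is slow to begin with
     * and cheap to intercept. */
    static u32 cf8_cfc_read(u8 bus, u8 slot, u8 func, u8 reg)
    {
        outl(0x80000000u | (bus << 16) | (slot << 11) | (func << 8) |
             (reg & 0xfc), 0xcf8);
        return inl(0xcfc);
    }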

> (Of course, it doesn't protect against backdoor access, but all bets are
> already off at that point.)

As long as we read config space rather than cache its supposed
state, there's no problem with backdoor accesses. And we should
already be using cached r/o items like capability positions, at least
on code paths used not only for setup/teardown.
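
As a small example of the latter point (in the spirit of the patch
itself), re-using the position recorded at setup time avoids a
capability walk on every access:

    /* Use the MSI-X capability offset cached in msi_attrib.pos rather
     * than re-running pci_find_cap_offset() on a hot path. */
    control = pci_conf_read16(seg, bus, slot, func,
                              msix_control_reg(entry->msi_attrib.pos));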

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread

Thread overview: 15+ messages
2015-03-10 16:16 [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Jan Beulich
2015-03-10 16:27 ` [PATCH 1/4] x86/MSI-X: be more careful during teardown Jan Beulich
2015-03-10 21:05   ` Andrew Cooper
2015-03-11  8:22     ` Jan Beulich
2015-03-13 18:10   ` Andrew Cooper
2015-03-16 11:03     ` Jan Beulich
2015-03-10 16:28 ` [PATCH 2/4] x86/MSI-X: access MSI‑X table only after having enabled MSI‑X Jan Beulich
2015-03-13 19:18   ` Andrew Cooper
2015-03-16 11:13     ` Jan Beulich
2015-03-10 16:29 ` [PATCH 3/4] x86/MSI-X: reduce fiddling with control register during restore Jan Beulich
2015-03-13 19:38   ` Andrew Cooper
2015-03-10 16:29 ` [PATCH 4/4] x86/MSI-X: cleanup Jan Beulich
2015-03-13 19:56   ` Andrew Cooper
2015-03-13 20:16 ` [PATCH 0/4] x86/MSI-X: XSA-120 follow-up Andrew Cooper
2015-03-16 15:38   ` Jan Beulich
