All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
@ 2013-02-06 16:50 Jan Beulich
  2013-02-07  8:44 ` Jan Beulich
  2013-02-25  5:17 ` Zhang, Xiantao
  0 siblings, 2 replies; 9+ messages in thread
From: Jan Beulich @ 2013-02-06 16:50 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 12534 bytes --]

This adds two new physdev operations for Dom0 to invoke when resource
allocation for devices is known to be complete, so that the hypervisor
can arrange for the respective MMIO ranges to be marked read-only
before an eventual guest getting such a device assigned even gets
started, such that it won't be able to set up writable mappings for
these MMIO ranges before Xen has a chance to protect them.

This also addresses another issue with the code being modified here,
in that so far write protection for the address ranges in question got
set up only once during the lifetime of a device (i.e. until either
system shutdown or device hot removal), while teardown happened when
the last interrupt was disposed of by the guest (which at least allowed
the tables to be writable when the device got assigned to a second
guest [instance] after the first terminated).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -656,8 +656,8 @@ static u64 read_pci_mem_bar(u16 seg, u8 
  * @entries: pointer to an array of struct msix_entry entries
  * @nvec: number of @entries
  *
- * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
+ * Setup the MSI-X capability structure of device function with the requested
+ * number MSI-X irqs. A return of zero indicates the successful setup of
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
@@ -665,86 +665,69 @@ static int msix_capability_init(struct p
                                 struct msi_desc **desc,
                                 unsigned int nr_entries)
 {
-    struct msi_desc *entry;
-    int pos;
+    struct msi_desc *entry = NULL;
+    int pos, vf;
     u16 control;
-    u64 table_paddr, entry_paddr;
-    u32 table_offset, entry_offset;
-    u8 bir;
-    void __iomem *base;
-    int idx;
+    u64 table_paddr;
+    u32 table_offset;
+    u8 bir, pbus, pslot, pfunc;
     u16 seg = dev->seg;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
 
     ASSERT(spin_is_locked(&pcidevs_lock));
-    ASSERT(desc);
 
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
     msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
 
-    /* MSI-X Table Initialization */
-    entry = alloc_msi_entry();
-    if ( !entry )
-        return -ENOMEM;
+    if ( desc )
+    {
+        entry = alloc_msi_entry();
+        if ( !entry )
+            return -ENOMEM;
+        ASSERT(msi);
+    }
 
-    /* Request & Map MSI-X table region */
+    /* Locate MSI-X table region */
     table_offset = pci_conf_read32(seg, bus, slot, func,
                                    msix_table_offset_reg(pos));
     bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
     table_offset &= ~PCI_MSIX_BIRMASK;
-    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
 
-    table_paddr = msi->table_base + table_offset;
-    entry_paddr = table_paddr + entry_offset;
-    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
-    if ( idx < 0 )
-    {
-        xfree(entry);
-        return idx;
-    }
-    base = (void *)(fix_to_virt(idx) +
-        ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
-
-    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
-    entry->msi_attrib.is_64 = 1;
-    entry->msi_attrib.entry_nr = msi->entry_nr;
-    entry->msi_attrib.maskbit = 1;
-    entry->msi_attrib.masked = 1;
-    entry->msi_attrib.pos = pos;
-    entry->irq = msi->irq;
-    entry->dev = dev;
-    entry->mask_base = base;
-
-    list_add_tail(&entry->list, &dev->msi_list);
-
-    if ( !dev->msix_nr_entries )
+    if ( !dev->info.is_virtfn )
     {
-        u8 pbus, pslot, pfunc;
-        int vf;
-        u64 pba_paddr;
-        u32 pba_offset;
+        pbus = bus;
+        pslot = slot;
+        pfunc = func;
+        vf = -1;
+    }
+    else
+    {
+        pbus = dev->info.physfn.bus;
+        pslot = PCI_SLOT(dev->info.physfn.devfn);
+        pfunc = PCI_FUNC(dev->info.physfn.devfn);
+        vf = PCI_BDF2(dev->bus, dev->devfn);
+    }
 
-        if ( !dev->info.is_virtfn )
-        {
-            pbus = bus;
-            pslot = slot;
-            pfunc = func;
-            vf = -1;
-        }
-        else
+    table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
+    WARN_ON(msi && msi->table_base != table_paddr);
+    if ( !table_paddr )
+    {
+        if ( !msi || !msi->table_base )
         {
-            pbus = dev->info.physfn.bus;
-            pslot = PCI_SLOT(dev->info.physfn.devfn);
-            pfunc = PCI_FUNC(dev->info.physfn.devfn);
-            vf = PCI_BDF2(dev->bus, dev->devfn);
+            xfree(entry);
+            return -ENXIO;
         }
+        table_paddr = msi->table_base;
+    }
+    table_paddr += table_offset;
 
-        ASSERT(!dev->msix_used_entries);
-        WARN_ON(msi->table_base !=
-                read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf));
+    if ( !dev->msix_used_entries )
+    {
+        u64 pba_paddr;
+        u32 pba_offset;
 
         dev->msix_nr_entries = nr_entries;
         dev->msix_table.first = PFN_DOWN(table_paddr);
@@ -765,7 +748,42 @@ static int msix_capability_init(struct p
                                       BITS_TO_LONGS(nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
                                         dev->msix_pba.last));
+    }
 
+    if ( entry )
+    {
+        /* Map MSI-X table region */
+        u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+        int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
+        void __iomem *base;
+
+        if ( idx < 0 )
+        {
+            xfree(entry);
+            return idx;
+        }
+        base = (void *)(fix_to_virt(idx) +
+                        ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
+
+        /* Mask interrupt here */
+        writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+        entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+        entry->msi_attrib.is_64 = 1;
+        entry->msi_attrib.entry_nr = msi->entry_nr;
+        entry->msi_attrib.maskbit = 1;
+        entry->msi_attrib.masked = 1;
+        entry->msi_attrib.pos = pos;
+        entry->irq = msi->irq;
+        entry->dev = dev;
+        entry->mask_base = base;
+
+        list_add_tail(&entry->list, &dev->msi_list);
+        *desc = entry;
+    }
+
+    if ( !dev->msix_used_entries )
+    {
         if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
                                 dev->msix_table.last) )
             WARN();
@@ -776,7 +794,7 @@ static int msix_capability_init(struct p
         if ( dev->domain )
             p2m_change_entry_type_global(dev->domain,
                                          p2m_mmio_direct, p2m_mmio_direct);
-        if ( !dev->domain || !paging_mode_translate(dev->domain) )
+        if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
         {
             struct domain *d = dev->domain;
 
@@ -790,6 +808,13 @@ static int msix_capability_init(struct p
                         break;
             if ( d )
             {
+                if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
+                {
+                    dev->msix_warned = d->domain_id;
+                    printk(XENLOG_ERR
+                           "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by Dom%d\n",
+                           seg, bus, slot, func, d->domain_id);
+                }
                 /* XXX How to deal with existing mappings? */
             }
         }
@@ -798,10 +823,6 @@ static int msix_capability_init(struct p
     WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
     ++dev->msix_used_entries;
 
-    /* Mask interrupt here */
-    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
-    *desc = entry;
     /* Restore MSI-X enabled bits */
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
@@ -926,6 +947,19 @@ static int __pci_enable_msix(struct msi_
     return status;
 }
 
+static void _pci_cleanup_msix(struct pci_dev *dev)
+{
+    if ( !--dev->msix_used_entries )
+    {
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+                                   dev->msix_table.last) )
+            WARN();
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+                                   dev->msix_pba.last) )
+            WARN();
+    }
+}
+
 static void __pci_disable_msix(struct msi_desc *entry)
 {
     struct pci_dev *dev;
@@ -949,15 +983,42 @@ static void __pci_disable_msix(struct ms
 
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
-    if ( !--dev->msix_used_entries )
+    _pci_cleanup_msix(dev);
+}
+
+int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
+{
+    int rc;
+    struct pci_dev *pdev;
+    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
+
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev(seg, bus, devfn);
+    if ( !pdev )
+        rc = -ENODEV;
+    else if ( pdev->msix_used_entries != !!off )
+        rc = -EBUSY;
+    else if ( off )
     {
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
-                                  dev->msix_table.last) )
-            WARN();
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
-                                   dev->msix_pba.last) )
-            WARN();
+        _pci_cleanup_msix(pdev);
+        rc = 0;
     }
+    else
+    {
+        u16 control = pci_conf_read16(seg, bus, slot, func,
+                                      msix_control_reg(pos));
+
+        rc = msix_capability_init(pdev, NULL, NULL,
+                                  multi_msix_capable(control));
+    }
+    spin_unlock(&pcidevs_lock);
+
+    return rc;
 }
 
 /*
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -575,6 +575,18 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         break;
     }
 
+    case PHYSDEVOP_prepare_msix:
+    case PHYSDEVOP_release_msix: {
+        struct physdev_pci_device dev;
+
+        if ( copy_from_guest(&dev, arg, 1) )
+            ret = -EFAULT;
+        else
+            ret = pci_prepare_msix(dev.seg, dev.bus, dev.devfn,
+                                   cmd != PHYSDEVOP_prepare_msix);
+        break;
+    }
+
     case PHYSDEVOP_pci_mmcfg_reserved: {
         struct physdev_pci_mmcfg_reserved info;
 
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -76,6 +76,7 @@ struct msi_desc;
 /* Helper functions */
 extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
 extern void pci_disable_msi(struct msi_desc *desc);
+extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off);
 extern void pci_cleanup_msi(struct pci_dev *pdev);
 extern void setup_msi_handler(struct irq_desc *, struct msi_desc *);
 extern void setup_msi_irq(struct irq_desc *);
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -303,6 +303,12 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi
 
 #define PHYSDEVOP_pci_device_remove     26
 #define PHYSDEVOP_restore_msi_ext       27
+/*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix          30
+#define PHYSDEVOP_release_msix          31
 struct physdev_pci_device {
     /* IN */
     uint16_t seg;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -57,6 +57,7 @@ struct pci_dev {
     int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
     int msix_table_idx[MAX_MSIX_TABLE_PAGES];
     spinlock_t msix_table_lock;
+    domid_t msix_warned;
 
     struct domain *domain;
     const u16 seg;



[-- Attachment #2: x86-MSI-X-fully-hide-table.patch --]
[-- Type: text/plain, Size: 12602 bytes --]

x86/MSI: add mechanism to protect MSI-X table from PV guest accesses

This adds two new physdev operations for Dom0 to invoke when resource
allocation for devices is known to be complete, so that the hypervisor
can arrange for the respective MMIO ranges to be marked read-only
before an eventual guest getting such a device assigned even gets
started, such that it won't be able to set up writable mappings for
these MMIO ranges before Xen has a chance to protect them.

This also addresses another issue with the code being modified here,
in that so far write protection for the address ranges in question got
set up only once during the lifetime of a device (i.e. until either
system shutdown or device hot removal), while teardown happened when
the last interrupt was disposed of by the guest (which at least allowed
the tables to be writable when the device got assigned to a second
guest [instance] after the first terminated).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -656,8 +656,8 @@ static u64 read_pci_mem_bar(u16 seg, u8 
  * @entries: pointer to an array of struct msix_entry entries
  * @nvec: number of @entries
  *
- * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
+ * Setup the MSI-X capability structure of device function with the requested
+ * number MSI-X irqs. A return of zero indicates the successful setup of
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
@@ -665,86 +665,69 @@ static int msix_capability_init(struct p
                                 struct msi_desc **desc,
                                 unsigned int nr_entries)
 {
-    struct msi_desc *entry;
-    int pos;
+    struct msi_desc *entry = NULL;
+    int pos, vf;
     u16 control;
-    u64 table_paddr, entry_paddr;
-    u32 table_offset, entry_offset;
-    u8 bir;
-    void __iomem *base;
-    int idx;
+    u64 table_paddr;
+    u32 table_offset;
+    u8 bir, pbus, pslot, pfunc;
     u16 seg = dev->seg;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
 
     ASSERT(spin_is_locked(&pcidevs_lock));
-    ASSERT(desc);
 
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
     msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
 
-    /* MSI-X Table Initialization */
-    entry = alloc_msi_entry();
-    if ( !entry )
-        return -ENOMEM;
+    if ( desc )
+    {
+        entry = alloc_msi_entry();
+        if ( !entry )
+            return -ENOMEM;
+        ASSERT(msi);
+    }
 
-    /* Request & Map MSI-X table region */
+    /* Locate MSI-X table region */
     table_offset = pci_conf_read32(seg, bus, slot, func,
                                    msix_table_offset_reg(pos));
     bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
     table_offset &= ~PCI_MSIX_BIRMASK;
-    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
 
-    table_paddr = msi->table_base + table_offset;
-    entry_paddr = table_paddr + entry_offset;
-    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
-    if ( idx < 0 )
-    {
-        xfree(entry);
-        return idx;
-    }
-    base = (void *)(fix_to_virt(idx) +
-        ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
-
-    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
-    entry->msi_attrib.is_64 = 1;
-    entry->msi_attrib.entry_nr = msi->entry_nr;
-    entry->msi_attrib.maskbit = 1;
-    entry->msi_attrib.masked = 1;
-    entry->msi_attrib.pos = pos;
-    entry->irq = msi->irq;
-    entry->dev = dev;
-    entry->mask_base = base;
-
-    list_add_tail(&entry->list, &dev->msi_list);
-
-    if ( !dev->msix_nr_entries )
+    if ( !dev->info.is_virtfn )
     {
-        u8 pbus, pslot, pfunc;
-        int vf;
-        u64 pba_paddr;
-        u32 pba_offset;
+        pbus = bus;
+        pslot = slot;
+        pfunc = func;
+        vf = -1;
+    }
+    else
+    {
+        pbus = dev->info.physfn.bus;
+        pslot = PCI_SLOT(dev->info.physfn.devfn);
+        pfunc = PCI_FUNC(dev->info.physfn.devfn);
+        vf = PCI_BDF2(dev->bus, dev->devfn);
+    }
 
-        if ( !dev->info.is_virtfn )
-        {
-            pbus = bus;
-            pslot = slot;
-            pfunc = func;
-            vf = -1;
-        }
-        else
+    table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
+    WARN_ON(msi && msi->table_base != table_paddr);
+    if ( !table_paddr )
+    {
+        if ( !msi || !msi->table_base )
         {
-            pbus = dev->info.physfn.bus;
-            pslot = PCI_SLOT(dev->info.physfn.devfn);
-            pfunc = PCI_FUNC(dev->info.physfn.devfn);
-            vf = PCI_BDF2(dev->bus, dev->devfn);
+            xfree(entry);
+            return -ENXIO;
         }
+        table_paddr = msi->table_base;
+    }
+    table_paddr += table_offset;
 
-        ASSERT(!dev->msix_used_entries);
-        WARN_ON(msi->table_base !=
-                read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf));
+    if ( !dev->msix_used_entries )
+    {
+        u64 pba_paddr;
+        u32 pba_offset;
 
         dev->msix_nr_entries = nr_entries;
         dev->msix_table.first = PFN_DOWN(table_paddr);
@@ -765,7 +748,42 @@ static int msix_capability_init(struct p
                                       BITS_TO_LONGS(nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
                                         dev->msix_pba.last));
+    }
 
+    if ( entry )
+    {
+        /* Map MSI-X table region */
+        u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+        int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
+        void __iomem *base;
+
+        if ( idx < 0 )
+        {
+            xfree(entry);
+            return idx;
+        }
+        base = (void *)(fix_to_virt(idx) +
+                        ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
+
+        /* Mask interrupt here */
+        writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+        entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+        entry->msi_attrib.is_64 = 1;
+        entry->msi_attrib.entry_nr = msi->entry_nr;
+        entry->msi_attrib.maskbit = 1;
+        entry->msi_attrib.masked = 1;
+        entry->msi_attrib.pos = pos;
+        entry->irq = msi->irq;
+        entry->dev = dev;
+        entry->mask_base = base;
+
+        list_add_tail(&entry->list, &dev->msi_list);
+        *desc = entry;
+    }
+
+    if ( !dev->msix_used_entries )
+    {
         if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
                                 dev->msix_table.last) )
             WARN();
@@ -776,7 +794,7 @@ static int msix_capability_init(struct p
         if ( dev->domain )
             p2m_change_entry_type_global(dev->domain,
                                          p2m_mmio_direct, p2m_mmio_direct);
-        if ( !dev->domain || !paging_mode_translate(dev->domain) )
+        if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
         {
             struct domain *d = dev->domain;
 
@@ -790,6 +808,13 @@ static int msix_capability_init(struct p
                         break;
             if ( d )
             {
+                if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
+                {
+                    dev->msix_warned = d->domain_id;
+                    printk(XENLOG_ERR
+                           "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by Dom%d\n",
+                           seg, bus, slot, func, d->domain_id);
+                }
                 /* XXX How to deal with existing mappings? */
             }
         }
@@ -798,10 +823,6 @@ static int msix_capability_init(struct p
     WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
     ++dev->msix_used_entries;
 
-    /* Mask interrupt here */
-    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
-    *desc = entry;
     /* Restore MSI-X enabled bits */
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
@@ -926,6 +947,19 @@ static int __pci_enable_msix(struct msi_
     return status;
 }
 
+static void _pci_cleanup_msix(struct pci_dev *dev)
+{
+    if ( !--dev->msix_used_entries )
+    {
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+                                   dev->msix_table.last) )
+            WARN();
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+                                   dev->msix_pba.last) )
+            WARN();
+    }
+}
+
 static void __pci_disable_msix(struct msi_desc *entry)
 {
     struct pci_dev *dev;
@@ -949,15 +983,42 @@ static void __pci_disable_msix(struct ms
 
     pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
 
-    if ( !--dev->msix_used_entries )
+    _pci_cleanup_msix(dev);
+}
+
+int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
+{
+    int rc;
+    struct pci_dev *pdev;
+    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
+    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
+
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev(seg, bus, devfn);
+    if ( !pdev )
+        rc = -ENODEV;
+    else if ( pdev->msix_used_entries != !!off )
+        rc = -EBUSY;
+    else if ( off )
     {
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
-                                  dev->msix_table.last) )
-            WARN();
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
-                                   dev->msix_pba.last) )
-            WARN();
+        _pci_cleanup_msix(pdev);
+        rc = 0;
     }
+    else
+    {
+        u16 control = pci_conf_read16(seg, bus, slot, func,
+                                      msix_control_reg(pos));
+
+        rc = msix_capability_init(pdev, NULL, NULL,
+                                  multi_msix_capable(control));
+    }
+    spin_unlock(&pcidevs_lock);
+
+    return rc;
 }
 
 /*
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -575,6 +575,18 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         break;
     }
 
+    case PHYSDEVOP_prepare_msix:
+    case PHYSDEVOP_release_msix: {
+        struct physdev_pci_device dev;
+
+        if ( copy_from_guest(&dev, arg, 1) )
+            ret = -EFAULT;
+        else
+            ret = pci_prepare_msix(dev.seg, dev.bus, dev.devfn,
+                                   cmd != PHYSDEVOP_prepare_msix);
+        break;
+    }
+
     case PHYSDEVOP_pci_mmcfg_reserved: {
         struct physdev_pci_mmcfg_reserved info;
 
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -76,6 +76,7 @@ struct msi_desc;
 /* Helper functions */
 extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
 extern void pci_disable_msi(struct msi_desc *desc);
+extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off);
 extern void pci_cleanup_msi(struct pci_dev *pdev);
 extern void setup_msi_handler(struct irq_desc *, struct msi_desc *);
 extern void setup_msi_irq(struct irq_desc *);
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -303,6 +303,12 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi
 
 #define PHYSDEVOP_pci_device_remove     26
 #define PHYSDEVOP_restore_msi_ext       27
+/*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix          30
+#define PHYSDEVOP_release_msix          31
 struct physdev_pci_device {
     /* IN */
     uint16_t seg;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -57,6 +57,7 @@ struct pci_dev {
     int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
     int msix_table_idx[MAX_MSIX_TABLE_PAGES];
     spinlock_t msix_table_lock;
+    domid_t msix_warned;
 
     struct domain *domain;
     const u16 seg;

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-06 16:50 [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses Jan Beulich
@ 2013-02-07  8:44 ` Jan Beulich
  2013-02-14 10:00   ` Ian Campbell
  2013-02-28  9:22   ` Ping: " Jan Beulich
  2013-02-25  5:17 ` Zhang, Xiantao
  1 sibling, 2 replies; 9+ messages in thread
From: Jan Beulich @ 2013-02-07  8:44 UTC (permalink / raw)
  To: xen-devel

>>> On 06.02.13 at 17:50, "Jan Beulich" <JBeulich@suse.com> wrote:
> This adds two new physdev operations for Dom0 to invoke when resource
> allocation for devices is known to be complete, so that the hypervisor
> can arrange for the respective MMIO ranges to be marked read-only
> before an eventual guest getting such a device assigned even gets
> started, such that it won't be able to set up writable mappings for
> these MMIO ranges before Xen has a chance to protect them.

I should probably mention the alternatives:

1) Brute force scan of the (PV) guest's L1 page tables, locating
eventual mappings of the questionable MMIO pages, and
converting those mappings to R/O ones.

2) Snoop BAR modifications (via xen/arch/x86/traps.c:
guest_io_write(), taking note of which BAR(s) are relevant at the
point where the device gets first detected/reported), perhaps
along with snoops of the PCI_COMMAND_MEMORY bit.

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-07  8:44 ` Jan Beulich
@ 2013-02-14 10:00   ` Ian Campbell
  2013-02-14 10:49     ` Jan Beulich
  2013-02-28  9:22   ` Ping: " Jan Beulich
  1 sibling, 1 reply; 9+ messages in thread
From: Ian Campbell @ 2013-02-14 10:00 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xen-devel

On Thu, 2013-02-07 at 08:44 +0000, Jan Beulich wrote:
> >>> On 06.02.13 at 17:50, "Jan Beulich" <JBeulich@suse.com> wrote:
> > This adds two new physdev operations for Dom0 to invoke when resource
> > allocation for devices is known to be complete, so that the hypervisor
> > can arrange for the respective MMIO ranges to be marked read-only
> > before an eventual guest getting such a device assigned even gets
> > started, such that it won't be able to set up writable mappings for
> > these MMIO ranges before Xen has a chance to protect them.

This sounds reasonable, is the "when resource allocation ... complete"
difficult to assess or likely to be fragile going forward?

> I should probably mention the alternatives:
> 
> 1) Brute force scan of the (PV) guest's L1 page tables, locating
> eventual mappings of the questionable MMIO pages, and
> converting those mappings to R/O ones.

When would this occur? Just on setup (might be ok) or on any MMU update
(clearly not) or some middle ground (like only DOMID_IO mappings
perhaps)?

> 2) Snoop BAR modifications (via xen/arch/x86/traps.c:
> guest_io_write(), taking note of which BAR(s) are relevant at the
> point where the device gets first detected/reported), perhaps
> along with snoops of the PCI_COMMAND_MEMORY bit.

Do these modifications come straight from the guest, or do they come
from dom0 via pciback and/or dom0's own PCI subsystem setup? If these
are coming via pciback then it seems reasonable to me to use physdev
operations, which takes us back to the patch you've implemented I think.

Trapping and emulating them sounds like something to avoid even if these
are not especially hot paths.

Ian.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-14 10:00   ` Ian Campbell
@ 2013-02-14 10:49     ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2013-02-14 10:49 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Keir Fraser, xen-devel

>>> On 14.02.13 at 11:00, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Thu, 2013-02-07 at 08:44 +0000, Jan Beulich wrote:
>> >>> On 06.02.13 at 17:50, "Jan Beulich" <JBeulich@suse.com> wrote:
>> > This adds two new physdev operations for Dom0 to invoke when resource
>> > allocation for devices is known to be complete, so that the hypervisor
>> > can arrange for the respective MMIO ranges to be marked read-only
>> > before an eventual guest getting such a device assigned even gets
>> > started, such that it won't be able to set up writable mappings for
>> > these MMIO ranges before Xen has a chance to protect them.
> 
> This sounds reasonable, is the "when resource allocation ... complete"
> difficult to assess or likely to be fragile going forward?

I went through that logic on the Linux side a couple of times, but
couldn't really convince myself of when resource re-allocation might
happen, or would be guaranteed not to happen. I may have to try
to ping the PCI subsytem maintainer(s) to (hopefully) get a firm
statement...

>> I should probably mention the alternatives:
>> 
>> 1) Brute force scan of the (PV) guest's L1 page tables, locating
>> eventual mappings of the questionable MMIO pages, and
>> converting those mappings to R/O ones.
> 
> When would this occur? Just on setup (might be ok) or on any MMU update
> (clearly not) or some middle ground (like only DOMID_IO mappings
> perhaps)?

Since we carry out the page table modifications for the guest
anyway, the only point this is needed is when the first MSI-X
interrupt gets set up. But the guest can do this setup at any rate,
and hence it might get us (maliciously) very busy in this scan, and
handling preemption here might be tricky.

>> 2) Snoop BAR modifications (via xen/arch/x86/traps.c:
>> guest_io_write(), taking note of which BAR(s) are relevant at the
>> point where the device gets first detected/reported), perhaps
>> along with snoops of the PCI_COMMAND_MEMORY bit.

Actually, the I/O port based CFG accesses would even be the more
straightforward part. The trickier thing would be the MMCFG based
alternative.

> Do these modifications come straight from the guest, or do they come
> from dom0 via pciback and/or dom0's own PCI subsystem setup? If these
> are coming via pciback then it seems reasonable to me to use physdev
> operations, which takes us back to the patch you've implemented I think.

All CFG accesses go through pciback. The resource assignment (and
hence BAR modifications) are done exclusively by Dom0 (anything
else would be a security hole).

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-06 16:50 [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses Jan Beulich
  2013-02-07  8:44 ` Jan Beulich
@ 2013-02-25  5:17 ` Zhang, Xiantao
  2013-02-25  9:38   ` Jan Beulich
  1 sibling, 1 reply; 9+ messages in thread
From: Zhang, Xiantao @ 2013-02-25  5:17 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

Hi, Jan
I think this fix also covers HVM side, right ?   For PV side,  I recalled there was already a fix in Xend to protect PV guest from accessing the related MMIO range. Also for HVM guest, last year you proposed and submitted a patch to Xen-Qemu for blocking guest's accesses to the related MMIO range.    Looks like the previous fixes are not needed any more if this fix gets checked in, right ?  
Thanks! 
Xiantao

> -----Original Message-----
> From: xen-devel-bounces@lists.xen.org [mailto:xen-devel-
> bounces@lists.xen.org] On Behalf Of Jan Beulich
> Sent: Thursday, February 07, 2013 12:51 AM
> To: xen-devel
> Subject: [Xen-devel] [PATCH] x86/MSI: add mechanism to protect MSI-X
> table from PV guest accesses
> 
> This adds two new physdev operations for Dom0 to invoke when resource
> allocation for devices is known to be complete, so that the hypervisor
> can arrange for the respective MMIO ranges to be marked read-only
> before an eventual guest getting such a device assigned even gets
> started, such that it won't be able to set up writable mappings for
> these MMIO ranges before Xen has a chance to protect them.
> 
> This also addresses another issue with the code being modified here,
> in that so far write protection for the address ranges in question got
> set up only once during the lifetime of a device (i.e. until either
> system shutdown or device hot removal), while teardown happened when
> the last interrupt was disposed of by the guest (which at least allowed
> the tables to be writable when the device got assigned to a second
> guest [instance] after the first terminated).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -656,8 +656,8 @@ static u64 read_pci_mem_bar(u16 seg, u8
>   * @entries: pointer to an array of struct msix_entry entries
>   * @nvec: number of @entries
>   *
> - * Setup the MSI-X capability structure of device function with a
> - * single MSI-X irq. A return of zero indicates the successful setup of
> + * Setup the MSI-X capability structure of device function with the
> requested
> + * number MSI-X irqs. A return of zero indicates the successful setup of
>   * requested MSI-X entries with allocated irqs or non-zero for otherwise.
>   **/
>  static int msix_capability_init(struct pci_dev *dev,
> @@ -665,86 +665,69 @@ static int msix_capability_init(struct p
>                                  struct msi_desc **desc,
>                                  unsigned int nr_entries)
>  {
> -    struct msi_desc *entry;
> -    int pos;
> +    struct msi_desc *entry = NULL;
> +    int pos, vf;
>      u16 control;
> -    u64 table_paddr, entry_paddr;
> -    u32 table_offset, entry_offset;
> -    u8 bir;
> -    void __iomem *base;
> -    int idx;
> +    u64 table_paddr;
> +    u32 table_offset;
> +    u8 bir, pbus, pslot, pfunc;
>      u16 seg = dev->seg;
>      u8 bus = dev->bus;
>      u8 slot = PCI_SLOT(dev->devfn);
>      u8 func = PCI_FUNC(dev->devfn);
> 
>      ASSERT(spin_is_locked(&pcidevs_lock));
> -    ASSERT(desc);
> 
>      pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>      control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos));
>      msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
> 
> -    /* MSI-X Table Initialization */
> -    entry = alloc_msi_entry();
> -    if ( !entry )
> -        return -ENOMEM;
> +    if ( desc )
> +    {
> +        entry = alloc_msi_entry();
> +        if ( !entry )
> +            return -ENOMEM;
> +        ASSERT(msi);
> +    }
> 
> -    /* Request & Map MSI-X table region */
> +    /* Locate MSI-X table region */
>      table_offset = pci_conf_read32(seg, bus, slot, func,
>                                     msix_table_offset_reg(pos));
>      bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
>      table_offset &= ~PCI_MSIX_BIRMASK;
> -    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
> 
> -    table_paddr = msi->table_base + table_offset;
> -    entry_paddr = table_paddr + entry_offset;
> -    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
> -    if ( idx < 0 )
> -    {
> -        xfree(entry);
> -        return idx;
> -    }
> -    base = (void *)(fix_to_virt(idx) +
> -        ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
> -
> -    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> -    entry->msi_attrib.is_64 = 1;
> -    entry->msi_attrib.entry_nr = msi->entry_nr;
> -    entry->msi_attrib.maskbit = 1;
> -    entry->msi_attrib.masked = 1;
> -    entry->msi_attrib.pos = pos;
> -    entry->irq = msi->irq;
> -    entry->dev = dev;
> -    entry->mask_base = base;
> -
> -    list_add_tail(&entry->list, &dev->msi_list);
> -
> -    if ( !dev->msix_nr_entries )
> +    if ( !dev->info.is_virtfn )
>      {
> -        u8 pbus, pslot, pfunc;
> -        int vf;
> -        u64 pba_paddr;
> -        u32 pba_offset;
> +        pbus = bus;
> +        pslot = slot;
> +        pfunc = func;
> +        vf = -1;
> +    }
> +    else
> +    {
> +        pbus = dev->info.physfn.bus;
> +        pslot = PCI_SLOT(dev->info.physfn.devfn);
> +        pfunc = PCI_FUNC(dev->info.physfn.devfn);
> +        vf = PCI_BDF2(dev->bus, dev->devfn);
> +    }
> 
> -        if ( !dev->info.is_virtfn )
> -        {
> -            pbus = bus;
> -            pslot = slot;
> -            pfunc = func;
> -            vf = -1;
> -        }
> -        else
> +    table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf);
> +    WARN_ON(msi && msi->table_base != table_paddr);
> +    if ( !table_paddr )
> +    {
> +        if ( !msi || !msi->table_base )
>          {
> -            pbus = dev->info.physfn.bus;
> -            pslot = PCI_SLOT(dev->info.physfn.devfn);
> -            pfunc = PCI_FUNC(dev->info.physfn.devfn);
> -            vf = PCI_BDF2(dev->bus, dev->devfn);
> +            xfree(entry);
> +            return -ENXIO;
>          }
> +        table_paddr = msi->table_base;
> +    }
> +    table_paddr += table_offset;
> 
> -        ASSERT(!dev->msix_used_entries);
> -        WARN_ON(msi->table_base !=
> -                read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf));
> +    if ( !dev->msix_used_entries )
> +    {
> +        u64 pba_paddr;
> +        u32 pba_offset;
> 
>          dev->msix_nr_entries = nr_entries;
>          dev->msix_table.first = PFN_DOWN(table_paddr);
> @@ -765,7 +748,42 @@ static int msix_capability_init(struct p
>                                        BITS_TO_LONGS(nr_entries) - 1);
>          WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev-
> >msix_pba.first,
>                                          dev->msix_pba.last));
> +    }
> 
> +    if ( entry )
> +    {
> +        /* Map MSI-X table region */
> +        u64 entry_paddr = table_paddr + msi->entry_nr *
> PCI_MSIX_ENTRY_SIZE;
> +        int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
> +        void __iomem *base;
> +
> +        if ( idx < 0 )
> +        {
> +            xfree(entry);
> +            return idx;
> +        }
> +        base = (void *)(fix_to_virt(idx) +
> +                        ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
> +
> +        /* Mask interrupt here */
> +        writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> +
> +        entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> +        entry->msi_attrib.is_64 = 1;
> +        entry->msi_attrib.entry_nr = msi->entry_nr;
> +        entry->msi_attrib.maskbit = 1;
> +        entry->msi_attrib.masked = 1;
> +        entry->msi_attrib.pos = pos;
> +        entry->irq = msi->irq;
> +        entry->dev = dev;
> +        entry->mask_base = base;
> +
> +        list_add_tail(&entry->list, &dev->msi_list);
> +        *desc = entry;
> +    }
> +
> +    if ( !dev->msix_used_entries )
> +    {
>          if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
>                                  dev->msix_table.last) )
>              WARN();
> @@ -776,7 +794,7 @@ static int msix_capability_init(struct p
>          if ( dev->domain )
>              p2m_change_entry_type_global(dev->domain,
>                                           p2m_mmio_direct, p2m_mmio_direct);
> -        if ( !dev->domain || !paging_mode_translate(dev->domain) )
> +        if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
>          {
>              struct domain *d = dev->domain;
> 
> @@ -790,6 +808,13 @@ static int msix_capability_init(struct p
>                          break;
>              if ( d )
>              {
> +                if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
> +                {
> +                    dev->msix_warned = d->domain_id;
> +                    printk(XENLOG_ERR
> +                           "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by
> Dom%d\n",
> +                           seg, bus, slot, func, d->domain_id);
> +                }
>                  /* XXX How to deal with existing mappings? */
>              }
>          }
> @@ -798,10 +823,6 @@ static int msix_capability_init(struct p
>      WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
>      ++dev->msix_used_entries;
> 
> -    /* Mask interrupt here */
> -    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> -
> -    *desc = entry;
>      /* Restore MSI-X enabled bits */
>      pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> 
> @@ -926,6 +947,19 @@ static int __pci_enable_msix(struct msi_
>      return status;
>  }
> 
> +static void _pci_cleanup_msix(struct pci_dev *dev)
> +{
> +    if ( !--dev->msix_used_entries )
> +    {
> +        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
> +                                   dev->msix_table.last) )
> +            WARN();
> +        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
> +                                   dev->msix_pba.last) )
> +            WARN();
> +    }
> +}
> +
>  static void __pci_disable_msix(struct msi_desc *entry)
>  {
>      struct pci_dev *dev;
> @@ -949,15 +983,42 @@ static void __pci_disable_msix(struct ms
> 
>      pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control);
> 
> -    if ( !--dev->msix_used_entries )
> +    _pci_cleanup_msix(dev);
> +}
> +
> +int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off)
> +{
> +    int rc;
> +    struct pci_dev *pdev;
> +    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
> +    unsigned int pos = pci_find_cap_offset(seg, bus, slot, func,
> +                                           PCI_CAP_ID_MSIX);
> +
> +    if ( !pos )
> +        return -ENODEV;
> +
> +    spin_lock(&pcidevs_lock);
> +    pdev = pci_get_pdev(seg, bus, devfn);
> +    if ( !pdev )
> +        rc = -ENODEV;
> +    else if ( pdev->msix_used_entries != !!off )
> +        rc = -EBUSY;
> +    else if ( off )
>      {
> -        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
> -                                  dev->msix_table.last) )
> -            WARN();
> -        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
> -                                   dev->msix_pba.last) )
> -            WARN();
> +        _pci_cleanup_msix(pdev);
> +        rc = 0;
>      }
> +    else
> +    {
> +        u16 control = pci_conf_read16(seg, bus, slot, func,
> +                                      msix_control_reg(pos));
> +
> +        rc = msix_capability_init(pdev, NULL, NULL,
> +                                  multi_msix_capable(control));
> +    }
> +    spin_unlock(&pcidevs_lock);
> +
> +    return rc;
>  }
> 
>  /*
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -575,6 +575,18 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          break;
>      }
> 
> +    case PHYSDEVOP_prepare_msix:
> +    case PHYSDEVOP_release_msix: {
> +        struct physdev_pci_device dev;
> +
> +        if ( copy_from_guest(&dev, arg, 1) )
> +            ret = -EFAULT;
> +        else
> +            ret = pci_prepare_msix(dev.seg, dev.bus, dev.devfn,
> +                                   cmd != PHYSDEVOP_prepare_msix);
> +        break;
> +    }
> +
>      case PHYSDEVOP_pci_mmcfg_reserved: {
>          struct physdev_pci_mmcfg_reserved info;
> 
> --- a/xen/include/asm-x86/msi.h
> +++ b/xen/include/asm-x86/msi.h
> @@ -76,6 +76,7 @@ struct msi_desc;
>  /* Helper functions */
>  extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
>  extern void pci_disable_msi(struct msi_desc *desc);
> +extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off);
>  extern void pci_cleanup_msi(struct pci_dev *pdev);
>  extern void setup_msi_handler(struct irq_desc *, struct msi_desc *);
>  extern void setup_msi_irq(struct irq_desc *);
> --- a/xen/include/public/physdev.h
> +++ b/xen/include/public/physdev.h
> @@ -303,6 +303,12 @@ DEFINE_XEN_GUEST_HANDLE(physdev_pci_devi
> 
>  #define PHYSDEVOP_pci_device_remove     26
>  #define PHYSDEVOP_restore_msi_ext       27
> +/*
> + * Dom0 should use these two to announce MMIO resources assigned to
> + * MSI-X capable devices won't (prepare) or may (release) change.
> + */
> +#define PHYSDEVOP_prepare_msix          30
> +#define PHYSDEVOP_release_msix          31
>  struct physdev_pci_device {
>      /* IN */
>      uint16_t seg;
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -57,6 +57,7 @@ struct pci_dev {
>      int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
>      int msix_table_idx[MAX_MSIX_TABLE_PAGES];
>      spinlock_t msix_table_lock;
> +    domid_t msix_warned;
> 
>      struct domain *domain;
>      const u16 seg;
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-25  5:17 ` Zhang, Xiantao
@ 2013-02-25  9:38   ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2013-02-25  9:38 UTC (permalink / raw)
  To: Xiantao Zhang; +Cc: xen-devel

>>> On 25.02.13 at 06:17, "Zhang, Xiantao" <xiantao.zhang@intel.com> wrote:
> I think this fix also covers HVM side, right ?

The HVM case was already taken care of without this change.

>   For PV side,  I recalled 
> there was already a fix in Xend to protect PV guest from accessing the 
> related MMIO range.

There once was such a change, but iirc it got reverted quickly, and
no-one ever cared to restore it in a way that wouldn't break things.

Also, that wouldn't help xl in any way.

> Also for HVM guest, last year you proposed and submitted 
> a patch to Xen-Qemu for blocking guest's accesses to the related MMIO range.   

Not quite - that change was a pre-req to make it possible to also
deny Dom0 access to that range, because qemu was incorrectly
mapping the range in question.

>  Looks like the previous fixes are not needed any more if this fix got 
> check-in, right ?  

They all will still be necessary. This change (or one of the described
alternatives) is only to close the remaining hole for PV guests.

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Ping: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-07  8:44 ` Jan Beulich
  2013-02-14 10:00   ` Ian Campbell
@ 2013-02-28  9:22   ` Jan Beulich
  2013-02-28 12:30     ` Andrew Cooper
  1 sibling, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2013-02-28  9:22 UTC (permalink / raw)
  To: Haitao Shan, xiantao.zhang, Keir Fraser
  Cc: Ian Campbell, Donald D Dugger, xen-devel

>>> On 07.02.13 at 09:44, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>> On 06.02.13 at 17:50, "Jan Beulich" <JBeulich@suse.com> wrote:
>> This adds two new physdev operations for Dom0 to invoke when resource
>> allocation for devices is known to be complete, so that the hypervisor
>> can arrange for the respective MMIO ranges to be marked read-only
>> before an eventual guest getting such a device assigned even gets
>> started, such that it won't be able to set up writable mappings for
>> these MMIO ranges before Xen has a chance to protect them.
> 
> I should probably mention the alternatives:
> 
> 1) Brute force scan of the (PV) guest's L1 page tables, locating
> eventual mappings of the questionable MMIO pages, and
> converting those mappings to R/O ones.
> 
> 2) Snoop BAR modifications (via xen/arch/x86/traps.c:
> guest_io_write(), taking note of which BAR(s) are relevant at the
> point where the device gets first detected/reported), perhaps
> along with snoops of the PCI_COMMAND_MEMORY bit.

So I just now sent a mail to Linux'es PCI subsystem maintainer,
assuming that we'll go with the approach the patch presented at
the start of this thread. The outcome of this, however, is not
really relevant for the hypervisor side change to go in, as it
would only affect the placement of the new hypercall within
pciback or the core PCI subsystem.

That assumption is largely because no-one really voiced an opinion
towards a preference of one of the alternatives described above.

Unless I'll receive an explicit NAK or further comments moving this
discussion forward, I'm intending to commit the patch without
anyone's ACK in a few days time _and_ backport it to the stable
trees (4.2 at least, not sure how well it will backport to 4.1).

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Ping: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-28  9:22   ` Ping: " Jan Beulich
@ 2013-02-28 12:30     ` Andrew Cooper
  2013-02-28 13:46       ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Cooper @ 2013-02-28 12:30 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir (Xen.org),
	Ian Campbell, Haitao Shan, Donald D Dugger, xen-devel,
	xiantao.zhang

On 28/02/13 09:22, Jan Beulich wrote:
>>>> On 07.02.13 at 09:44, "Jan Beulich" <JBeulich@suse.com> wrote:
>>>>> On 06.02.13 at 17:50, "Jan Beulich" <JBeulich@suse.com> wrote:
>>> This adds two new physdev operations for Dom0 to invoke when resource
>>> allocation for devices is known to be complete, so that the hypervisor
>>> can arrange for the respective MMIO ranges to be marked read-only
>>> before an eventual guest getting such a device assigned even gets
>>> started, such that it won't be able to set up writable mappings for
>>> these MMIO ranges before Xen has a chance to protect them.
>> I should probably mention the alternatives:
>>
>> 1) Brute force scan of the (PV) guest's L1 page tables, locating
>> eventual mappings of the questionable MMIO pages, and
>> converting those mappings to R/O ones.
>>
>> 2) Snoop BAR modifications (via xen/arch/x86/traps.c:
>> guest_io_write(), taking note of which BAR(s) are relevant at the
>> point where the device gets first detected/reported), perhaps
>> along with snoops of the PCI_COMMAND_MEMORY bit.
> So I just now sent a mail to Linux'es PCI subsystem maintainer,
> assuming that we'll go with the approach the patch presented at
> the start of this thread. The outcome of this, however, is not
> really relevant for the hypervisor side change to go in, as it
> would only affect the placement of the new hypercall within
> pciback or the core PCI subsystem.
>
> That assumption is largely because no-one really voiced an opinion
> towards a preference of one of the alternatives described above.
>
> Unless I'll receive an explicit NAK or further comments moving this
> discussion forward, I'm intending to commit the patch without
> anyone's ACK in a few days time _and_ backport it to the stable
> trees (4.2 at least, not sure how well it will backport to 4.1).
>
> Jan

For what it is worth, I think the principle is good.  One query I have
is whether it is sensible to restrict this to dom0, as the comments
indicate, or whether it should be permitted to be used by any domain
with appropriate permissions to manage PCI passthrough.

How do you see dom0 attempting to use these hypercalls in an example of
passing a PCI device through to an untrusted domain?

~Andrew

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Ping: [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses
  2013-02-28 12:30     ` Andrew Cooper
@ 2013-02-28 13:46       ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2013-02-28 13:46 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Keir (Xen.org),
	Ian Campbell, Haitao Shan, Donald D Dugger, xen-devel,
	xiantao.zhang

>>> On 28.02.13 at 13:30, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> For what it is worth, I think the principle is good.  One query I have
> is whether it is sensible to restrict this to dom0, as the comments
> indicate, or whether it should be permitted to be used by any domain
> with appropriate permissions to manage PCI passthrough.

No, I think this indeed ought to be restricted to Dom0 as the
original owner of all devices. If Dom0 decides to hand some
devices for management to a second domain, the resource
assignment nevertheless needs to be coordinated by Dom0,
and hence the notification should also come from there.

> How do you see dom0 attempting to use these hypercalls in an example of
> passing a PCI device through to an untrusted domain?

Right now my plan is to have pciback issue the hypercall right
after having called pci_enable_device(), pending confirmation
that resources won't change after that point anymore (see
the mail I sent to Bjorn Helgaas earlier today, xen-devel Cc-ed).

Jan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2013-02-28 13:46 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-02-06 16:50 [PATCH] x86/MSI: add mechanism to protect MSI-X table from PV guest accesses Jan Beulich
2013-02-07  8:44 ` Jan Beulich
2013-02-14 10:00   ` Ian Campbell
2013-02-14 10:49     ` Jan Beulich
2013-02-28  9:22   ` Ping: " Jan Beulich
2013-02-28 12:30     ` Andrew Cooper
2013-02-28 13:46       ` Jan Beulich
2013-02-25  5:17 ` Zhang, Xiantao
2013-02-25  9:38   ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.