All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH resend 0/3] x86/IOMMU: multi-vector MSI
@ 2013-07-16 10:00 Jan Beulich
  2013-07-16 10:13 ` [PATCH resend 1/3] VT-d: enable for " Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 10:00 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser, xiantao.zhang

These are the patches from an earlier submitted series that haven't
been acked by anyone so far.

1: VT-d: enable for multi-vector MSI
2: x86: enable multi-vector MSI
3: pciif: add multi-vector-MSI command

Signed-off-by: Jan Beulich <jbeulich@suse.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH resend 1/3]  VT-d: enable for multi-vector MSI
  2013-07-16 10:00 [PATCH resend 0/3] x86/IOMMU: multi-vector MSI Jan Beulich
@ 2013-07-16 10:13 ` Jan Beulich
  2013-07-16 11:15   ` Andrew Cooper
  2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
  2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
  2 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 10:13 UTC (permalink / raw)
  To: xen-devel, Jan Beulich; +Cc: Keir Fraser, xiantao.zhang

[-- Attachment #1: Type: text/plain, Size: 6549 bytes --]

The main change being to make alloc_remap_entry() capable of allocating
a block of entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
 }
 
 /*
- * Look for a free intr remap entry.
+ * Look for a free intr remap entry (or a contiguous set thereof).
  * Need hold iremap_lock, and setup returned entry before releasing lock.
  */
-static int alloc_remap_entry(struct iommu *iommu)
+static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr)
 {
     struct iremap_entry *iremap_entries = NULL;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
-    int i;
+    unsigned int i, found;
 
     ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) );
 
-    for ( i = 0; i < IREMAP_ENTRY_NR; i++ )
+    for ( found = i = 0; i < IREMAP_ENTRY_NR; i++ )
     {
         struct iremap_entry *p;
         if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 )
@@ -220,7 +220,9 @@ static int alloc_remap_entry(struct iomm
         else
             p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)];
 
-        if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */
+        if ( p->lo_val || p->hi_val ) /* not a free entry */
+            found = 0;
+        else if ( ++found == nr )
             break;
     }
 
@@ -228,7 +230,7 @@ static int alloc_remap_entry(struct iomm
         unmap_vtd_domain_page(iremap_entries);
 
     if ( i < IREMAP_ENTRY_NR ) 
-        ir_ctrl->iremap_num++;
+        ir_ctrl->iremap_num += nr;
     return i;
 }
 
@@ -293,7 +295,7 @@ static int ioapic_rte_to_remap_entry(str
     index = apic_pin_2_ir_idx[apic][ioapic_pin];
     if ( index < 0 )
     {
-        index = alloc_remap_entry(iommu);
+        index = alloc_remap_entry(iommu, 1);
         if ( index < IREMAP_ENTRY_NR )
             apic_pin_2_ir_idx[apic][ioapic_pin] = index;
     }
@@ -485,19 +487,18 @@ static void set_msi_source_id(struct pci
 }
 
 static int remap_entry_to_msi_msg(
-    struct iommu *iommu, struct msi_msg *msg)
+    struct iommu *iommu, struct msi_msg *msg, unsigned int index)
 {
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
     struct msi_msg_remap_entry *remap_rte;
-    int index;
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
     remap_rte = (struct msi_msg_remap_entry *) msg;
-    index = (remap_rte->address_lo.index_15 << 15) |
+    index += (remap_rte->address_lo.index_15 << 15) |
              remap_rte->address_lo.index_0_14;
 
-    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
+    if ( index >= IREMAP_ENTRY_NR )
     {
         dprintk(XENLOG_ERR VTDPREFIX,
                 "%s: index (%d) for remap table is invalid !\n",
@@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
     struct iremap_entry new_ire;
     struct msi_msg_remap_entry *remap_rte;
-    int index;
+    unsigned int index, i, nr = 1;
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    remap_rte = (struct msi_msg_remap_entry *) msg;
+    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+        nr = msi_desc->msi.nvec;
+
     spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
 
     if ( msg == NULL )
     {
-        /* Free specified unused IRTE */
-        free_remap_entry(iommu, msi_desc->remap_index);
+        /* Free specified unused IRTEs */
+        for ( i = 0; i < nr; ++i )
+            free_remap_entry(iommu, msi_desc->remap_index + i);
         spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
         return 0;
     }
 
     if ( msi_desc->remap_index < 0 )
     {
-        /*
-         * TODO: Multiple-vector MSI requires allocating multiple continuous
-         * entries and configuring addr/data of msi_msg in different way. So
-         * alloca_remap_entry will be changed if enabling multiple-vector MSI
-         * in future.
-         */
-        index = alloc_remap_entry(iommu);
-        msi_desc->remap_index = index;
+        index = alloc_remap_entry(iommu, nr);
+        for ( i = 0; i < nr; ++i )
+            msi_desc[i].remap_index = index + i;
     }
     else
         index = msi_desc->remap_index;
@@ -590,7 +589,8 @@ static int msi_msg_to_remap_entry(
                 "%s: intremap index (%d) is larger than"
                 " the maximum index (%d)!\n",
                 __func__, index, IREMAP_ENTRY_NR - 1);
-        msi_desc->remap_index = -1;
+        for ( i = 0; i < nr; ++i )
+            msi_desc[i].remap_index = -1;
         spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
         return -EFAULT;
     }
@@ -626,14 +626,18 @@ static int msi_msg_to_remap_entry(
     new_ire.lo.p = 1;    /* finally, set present bit */
 
     /* now construct new MSI/MSI-X rte entry */
+    remap_rte = (struct msi_msg_remap_entry *)msg;
     remap_rte->address_lo.dontcare = 0;
-    remap_rte->address_lo.index_15 = (index >> 15) & 0x1;
-    remap_rte->address_lo.index_0_14 = index & 0x7fff;
+    i = index;
+    if ( !nr )
+        i -= msi_desc->msi_attrib.entry_nr;
+    remap_rte->address_lo.index_15 = (i >> 15) & 0x1;
+    remap_rte->address_lo.index_0_14 = i & 0x7fff;
     remap_rte->address_lo.SHV = 1;
     remap_rte->address_lo.format = 1;
 
     remap_rte->address_hi = 0;
-    remap_rte->data = 0;
+    remap_rte->data = index - i;
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
     iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry));
@@ -654,7 +658,9 @@ void msi_msg_read_remap_rte(
     drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
                 : hpet_to_drhd(msi_desc->hpet_id);
     if ( drhd )
-        remap_entry_to_msi_msg(drhd->iommu, msg);
+        remap_entry_to_msi_msg(drhd->iommu, msg,
+                               msi_desc->msi_attrib.type == PCI_CAP_ID_MSI
+                               ? msi_desc->msi_attrib.entry_nr : 0);
 }
 
 int msi_msg_write_remap_rte(
@@ -680,7 +686,7 @@ int __init intel_setup_hpet_msi(struct m
         return 0;
 
     spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
-    msi_desc->remap_index = alloc_remap_entry(iommu);
+    msi_desc->remap_index = alloc_remap_entry(iommu, 1);
     if ( msi_desc->remap_index >= IREMAP_ENTRY_NR )
     {
         dprintk(XENLOG_ERR VTDPREFIX,



[-- Attachment #2: VT-d-multi-vector-MSI.patch --]
[-- Type: text/plain, Size: 6582 bytes --]

VT-d: enable for multi-vector MSI

The main change being to make alloc_remap_entry() capable of allocating
a block of entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
 }
 
 /*
- * Look for a free intr remap entry.
+ * Look for a free intr remap entry (or a contiguous set thereof).
  * Need hold iremap_lock, and setup returned entry before releasing lock.
  */
-static int alloc_remap_entry(struct iommu *iommu)
+static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr)
 {
     struct iremap_entry *iremap_entries = NULL;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
-    int i;
+    unsigned int i, found;
 
     ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) );
 
-    for ( i = 0; i < IREMAP_ENTRY_NR; i++ )
+    for ( found = i = 0; i < IREMAP_ENTRY_NR; i++ )
     {
         struct iremap_entry *p;
         if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 )
@@ -220,7 +220,9 @@ static int alloc_remap_entry(struct iomm
         else
             p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)];
 
-        if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */
+        if ( p->lo_val || p->hi_val ) /* not a free entry */
+            found = 0;
+        else if ( ++found == nr )
             break;
     }
 
@@ -228,7 +230,7 @@ static int alloc_remap_entry(struct iomm
         unmap_vtd_domain_page(iremap_entries);
 
     if ( i < IREMAP_ENTRY_NR ) 
-        ir_ctrl->iremap_num++;
+        ir_ctrl->iremap_num += nr;
     return i;
 }
 
@@ -293,7 +295,7 @@ static int ioapic_rte_to_remap_entry(str
     index = apic_pin_2_ir_idx[apic][ioapic_pin];
     if ( index < 0 )
     {
-        index = alloc_remap_entry(iommu);
+        index = alloc_remap_entry(iommu, 1);
         if ( index < IREMAP_ENTRY_NR )
             apic_pin_2_ir_idx[apic][ioapic_pin] = index;
     }
@@ -485,19 +487,18 @@ static void set_msi_source_id(struct pci
 }
 
 static int remap_entry_to_msi_msg(
-    struct iommu *iommu, struct msi_msg *msg)
+    struct iommu *iommu, struct msi_msg *msg, unsigned int index)
 {
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
     struct msi_msg_remap_entry *remap_rte;
-    int index;
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
     remap_rte = (struct msi_msg_remap_entry *) msg;
-    index = (remap_rte->address_lo.index_15 << 15) |
+    index += (remap_rte->address_lo.index_15 << 15) |
              remap_rte->address_lo.index_0_14;
 
-    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
+    if ( index >= IREMAP_ENTRY_NR )
     {
         dprintk(XENLOG_ERR VTDPREFIX,
                 "%s: index (%d) for remap table is invalid !\n",
@@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
     struct iremap_entry *iremap_entry = NULL, *iremap_entries;
     struct iremap_entry new_ire;
     struct msi_msg_remap_entry *remap_rte;
-    int index;
+    unsigned int index, i, nr = 1;
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    remap_rte = (struct msi_msg_remap_entry *) msg;
+    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+        nr = msi_desc->msi.nvec;
+
     spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
 
     if ( msg == NULL )
     {
-        /* Free specified unused IRTE */
-        free_remap_entry(iommu, msi_desc->remap_index);
+        /* Free specified unused IRTEs */
+        for ( i = 0; i < nr; ++i )
+            free_remap_entry(iommu, msi_desc->remap_index + i);
         spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
         return 0;
     }
 
     if ( msi_desc->remap_index < 0 )
     {
-        /*
-         * TODO: Multiple-vector MSI requires allocating multiple continuous
-         * entries and configuring addr/data of msi_msg in different way. So
-         * alloca_remap_entry will be changed if enabling multiple-vector MSI
-         * in future.
-         */
-        index = alloc_remap_entry(iommu);
-        msi_desc->remap_index = index;
+        index = alloc_remap_entry(iommu, nr);
+        for ( i = 0; i < nr; ++i )
+            msi_desc[i].remap_index = index + i;
     }
     else
         index = msi_desc->remap_index;
@@ -590,7 +589,8 @@ static int msi_msg_to_remap_entry(
                 "%s: intremap index (%d) is larger than"
                 " the maximum index (%d)!\n",
                 __func__, index, IREMAP_ENTRY_NR - 1);
-        msi_desc->remap_index = -1;
+        for ( i = 0; i < nr; ++i )
+            msi_desc[i].remap_index = -1;
         spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
         return -EFAULT;
     }
@@ -626,14 +626,18 @@ static int msi_msg_to_remap_entry(
     new_ire.lo.p = 1;    /* finally, set present bit */
 
     /* now construct new MSI/MSI-X rte entry */
+    remap_rte = (struct msi_msg_remap_entry *)msg;
     remap_rte->address_lo.dontcare = 0;
-    remap_rte->address_lo.index_15 = (index >> 15) & 0x1;
-    remap_rte->address_lo.index_0_14 = index & 0x7fff;
+    i = index;
+    if ( !nr )
+        i -= msi_desc->msi_attrib.entry_nr;
+    remap_rte->address_lo.index_15 = (i >> 15) & 0x1;
+    remap_rte->address_lo.index_0_14 = i & 0x7fff;
     remap_rte->address_lo.SHV = 1;
     remap_rte->address_lo.format = 1;
 
     remap_rte->address_hi = 0;
-    remap_rte->data = 0;
+    remap_rte->data = index - i;
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
     iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry));
@@ -654,7 +658,9 @@ void msi_msg_read_remap_rte(
     drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
                 : hpet_to_drhd(msi_desc->hpet_id);
     if ( drhd )
-        remap_entry_to_msi_msg(drhd->iommu, msg);
+        remap_entry_to_msi_msg(drhd->iommu, msg,
+                               msi_desc->msi_attrib.type == PCI_CAP_ID_MSI
+                               ? msi_desc->msi_attrib.entry_nr : 0);
 }
 
 int msi_msg_write_remap_rte(
@@ -680,7 +686,7 @@ int __init intel_setup_hpet_msi(struct m
         return 0;
 
     spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
-    msi_desc->remap_index = alloc_remap_entry(iommu);
+    msi_desc->remap_index = alloc_remap_entry(iommu, 1);
     if ( msi_desc->remap_index >= IREMAP_ENTRY_NR )
     {
         dprintk(XENLOG_ERR VTDPREFIX,

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 10:00 [PATCH resend 0/3] x86/IOMMU: multi-vector MSI Jan Beulich
  2013-07-16 10:13 ` [PATCH resend 1/3] VT-d: enable for " Jan Beulich
@ 2013-07-16 10:14 ` Jan Beulich
  2013-07-16 11:36   ` Andrew Cooper
                     ` (2 more replies)
  2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
  2 siblings, 3 replies; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 10:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser, xiantao.zhang

[-- Attachment #1: Type: text/plain, Size: 23243 bytes --]

This implies
- extending the public interface to have a way to request a block of
  MSIs
- allocating a block of contiguous pIRQ-s for the target domain (but
  note that the Xen IRQs allocated have no need of being contiguous)
- repeating certain operations for all involved IRQs
- fixing multi_msi_enable()
- adjusting the mask bit accesses for maskable MSIs

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1863,6 +1863,25 @@ int get_free_pirq(struct domain *d, int 
     return -ENOSPC;
 }
 
+int get_free_pirqs(struct domain *d, unsigned int nr)
+{
+    unsigned int i, found = 0;
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
+        if ( is_free_pirq(d, pirq_info(d, i)) )
+        {
+            pirq_get_info(d, i);
+            if ( ++found == nr )
+                return i;
+        }
+        else
+            found = 0;
+
+    return -ENOSPC;
+}
+
 int map_domain_pirq(
     struct domain *d, int pirq, int irq, int type, void *data)
 {
@@ -1918,11 +1937,12 @@ int map_domain_pirq(
 
     desc = irq_to_desc(irq);
 
-    if ( type == MAP_PIRQ_TYPE_MSI )
+    if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI )
     {
         struct msi_info *msi = (struct msi_info *)data;
         struct msi_desc *msi_desc;
         struct pci_dev *pdev;
+        unsigned int nr = 0;
 
         ASSERT(spin_is_locked(&pcidevs_lock));
 
@@ -1933,7 +1953,14 @@ int map_domain_pirq(
         pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
         ret = pci_enable_msi(msi, &msi_desc);
         if ( ret )
+        {
+            if ( ret > 0 )
+            {
+                msi->entry_nr = ret;
+                ret = -ENFILE;
+            }
             goto done;
+        }
 
         spin_lock_irqsave(&desc->lock, flags);
 
@@ -1947,25 +1974,73 @@ int map_domain_pirq(
             goto done;
         }
 
-        ret = setup_msi_irq(desc, msi_desc);
-        if ( ret )
+        while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) )
         {
+            if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV &&
+                 !desc->arch.used_vectors )
+            {
+                desc->arch.used_vectors = &pdev->arch.used_vectors;
+                if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
+                {
+                    int vector = desc->arch.vector;
+
+                    ASSERT(!test_bit(vector, desc->arch.used_vectors));
+                    set_bit(vector, desc->arch.used_vectors);
+                }
+            }
+            if ( type == MAP_PIRQ_TYPE_MSI ||
+                 msi_desc->msi_attrib.type != PCI_CAP_ID_MSI ||
+                 ++nr == msi->entry_nr )
+                break;
+
+            set_domain_irq_pirq(d, irq, info);
             spin_unlock_irqrestore(&desc->lock, flags);
-            pci_disable_msi(msi_desc);
-            goto done;
+
+            info = NULL;
+            irq = create_irq(NUMA_NO_NODE);
+            ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info)
+                           : irq;
+            if ( ret )
+                break;
+            msi_desc[nr].irq = irq;
+
+            if ( irq_permit_access(d, irq) != 0 )
+                printk(XENLOG_G_WARNING
+                       "dom%d: could not permit access to IRQ%d (pirq %d)\n",
+                       d->domain_id, irq, pirq);
+
+            desc = irq_to_desc(irq);
+            spin_lock_irqsave(&desc->lock, flags);
+
+            if ( desc->handler != &no_irq_type )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n",
+                        d->domain_id, irq, pirq + nr, desc->handler->typename);
+                ret = -EBUSY;
+                break;
+            }
         }
 
-        if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV
-             && !desc->arch.used_vectors )
+        if ( ret )
         {
-            desc->arch.used_vectors = &pdev->arch.used_vectors;
-            if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
+            spin_unlock_irqrestore(&desc->lock, flags);
+            while ( nr-- )
             {
-                int vector = desc->arch.vector;
-                ASSERT(!test_bit(vector, desc->arch.used_vectors));
-
-                set_bit(vector, desc->arch.used_vectors);
+                if ( irq >= 0 )
+                {
+                    if ( irq_deny_access(d, irq) )
+                        printk(XENLOG_G_ERR
+                               "dom%d: could not revoke access to IRQ%d (pirq %d)\n",
+                               d->domain_id, irq, pirq);
+                    destroy_irq(irq);
+                }
+                if ( info )
+                    cleanup_domain_irq_pirq(d, irq, info);
+                info = pirq_info(d, pirq + nr);
+                irq = info->arch.irq;
             }
+            pci_disable_msi(msi_desc);
+            goto done;
         }
 
         set_domain_irq_pirq(d, irq, info);
@@ -1996,7 +2071,8 @@ int unmap_domain_pirq(struct domain *d, 
 {
     unsigned long flags;
     struct irq_desc *desc;
-    int irq, ret = 0;
+    int irq, ret = 0, rc;
+    unsigned int i, nr = 1;
     bool_t forced_unbind;
     struct pirq *info;
     struct msi_desc *msi_desc = NULL;
@@ -2018,6 +2094,18 @@ int unmap_domain_pirq(struct domain *d, 
 
     desc = irq_to_desc(irq);
     msi_desc = desc->msi_desc;
+    if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+    {
+        if ( msi_desc->msi_attrib.entry_nr )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: trying to unmap secondary MSI pirq %d\n",
+                   d->domain_id, pirq);
+            ret = -EBUSY;
+            goto done;
+        }
+        nr = msi_desc->msi.nvec;
+    }
 
     ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
     if ( ret )
@@ -2033,37 +2121,83 @@ int unmap_domain_pirq(struct domain *d, 
 
     spin_lock_irqsave(&desc->lock, flags);
 
-    BUG_ON(irq != domain_pirq_to_irq(d, pirq));
-
-    if ( !forced_unbind )
-        clear_domain_irq_pirq(d, irq, info);
-    else
+    for ( i = 0; ; )
     {
-        info->arch.irq = -irq;
-        radix_tree_replace_slot(
-            radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
-            radix_tree_int_to_ptr(-pirq));
+        BUG_ON(irq != domain_pirq_to_irq(d, pirq + i));
+
+        if ( !forced_unbind )
+            clear_domain_irq_pirq(d, irq, info);
+        else
+        {
+            info->arch.irq = -irq;
+            radix_tree_replace_slot(
+                radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
+                radix_tree_int_to_ptr(-pirq));
+        }
+
+        if ( msi_desc )
+        {
+            desc->handler = &no_irq_type;
+            desc->msi_desc = NULL;
+        }
+
+        if ( ++i == nr )
+            break;
+
+        spin_unlock_irqrestore(&desc->lock, flags);
+
+        if ( !forced_unbind )
+           cleanup_domain_irq_pirq(d, irq, info);
+
+        rc = irq_deny_access(d, irq);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
+                   d->domain_id, irq, pirq + i);
+            ret = rc;
+        }
+
+        do {
+            info = pirq_info(d, pirq + i);
+            if ( info && (irq = info->arch.irq) > 0 )
+                break;
+            printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n",
+                   d->domain_id, pirq + i);
+        } while ( ++i < nr );
+
+        if ( i == nr )
+        {
+            desc = NULL;
+            break;
+        }
+
+        desc = irq_to_desc(irq);
+        BUG_ON(desc->msi_desc != msi_desc + i);
+
+        spin_lock_irqsave(&desc->lock, flags);
     }
 
-    if ( msi_desc )
+    if ( desc )
     {
-        desc->handler = &no_irq_type;
-        desc->msi_desc = NULL;
+        spin_unlock_irqrestore(&desc->lock, flags);
+
+        if ( !forced_unbind )
+            cleanup_domain_irq_pirq(d, irq, info);
+
+        rc = irq_deny_access(d, irq);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
+                   d->domain_id, irq, pirq + nr - 1);
+            ret = rc;
+        }
     }
 
-    spin_unlock_irqrestore(&desc->lock, flags);
     if (msi_desc)
         msi_free_irq(msi_desc);
 
-    if ( !forced_unbind )
-        cleanup_domain_irq_pirq(d, irq, info);
-
-    ret = irq_deny_access(d, irq);
-    if ( ret )
-        printk(XENLOG_G_ERR
-               "dom%d: could not deny access to IRQ%d (pirq %d)\n",
-               d->domain_id, irq, pirq);
-
  done:
     return ret;
 }
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -236,6 +236,11 @@ static int write_msi_msg(struct msi_desc
         u8 bus = dev->bus;
         u8 slot = PCI_SLOT(dev->devfn);
         u8 func = PCI_FUNC(dev->devfn);
+        int nr = entry->msi_attrib.entry_nr;
+
+        ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
+        if ( nr )
+            return 0;
 
         pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos),
                          msg->address_lo);
@@ -359,8 +364,8 @@ static void msi_set_mask_bit(struct irq_
             u8 func = PCI_FUNC(entry->dev->devfn);
 
             mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
-            mask_bits &= ~(1);
-            mask_bits |= flag;
+            mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
+            mask_bits |= (u32)flag << entry->msi_attrib.entry_nr;
             pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits);
         }
         break;
@@ -384,10 +389,11 @@ static int msi_get_mask_bit(const struct
     case PCI_CAP_ID_MSI:
         if (!entry->dev || !entry->msi_attrib.maskbit)
             break;
-        return pci_conf_read32(entry->dev->seg, entry->dev->bus,
-                               PCI_SLOT(entry->dev->devfn),
-                               PCI_FUNC(entry->dev->devfn),
-                               entry->msi.mpos) & 1;
+        return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
+                                PCI_SLOT(entry->dev->devfn),
+                                PCI_FUNC(entry->dev->devfn),
+                                entry->msi.mpos) >>
+                entry->msi_attrib.entry_nr) & 1;
     case PCI_CAP_ID_MSIX:
         return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
     }
@@ -453,17 +459,20 @@ static hw_irq_controller pci_msi_nonmask
     .set_affinity = set_msi_affinity
 };
 
-static struct msi_desc* alloc_msi_entry(void)
+static struct msi_desc *alloc_msi_entry(unsigned int nr)
 {
     struct msi_desc *entry;
 
-    entry = xmalloc(struct msi_desc);
+    entry = xmalloc_array(struct msi_desc, nr);
     if ( !entry )
         return NULL;
 
     INIT_LIST_HEAD(&entry->list);
-    entry->dev = NULL;
-    entry->remap_index = -1;
+    while ( nr-- )
+    {
+        entry[nr].dev = NULL;
+        entry[nr].remap_index = -1;
+    }
 
     return entry;
 }
@@ -488,17 +497,24 @@ int __setup_msi_irq(struct irq_desc *des
 
 int msi_free_irq(struct msi_desc *entry)
 {
-    destroy_irq(entry->irq);
+    unsigned int nr = entry->msi.nvec;
+
     if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
     {
         unsigned long start;
         start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
         msix_put_fixmap(entry->dev, virt_to_fix(start));
+        nr = 1;
     }
 
-    /* Free the unused IRTE if intr remap enabled */
-    if ( iommu_intremap )
-        iommu_update_ire_from_msi(entry, NULL);
+    while ( nr-- )
+    {
+        destroy_irq(entry[nr].irq);
+
+        /* Free the unused IRTE if intr remap enabled */
+        if ( iommu_intremap )
+            iommu_update_ire_from_msi(entry + nr, NULL);
+    }
 
     list_del(&entry->list);
     xfree(entry);
@@ -531,11 +547,12 @@ static struct msi_desc *find_msi_entry(s
  **/
 static int msi_capability_init(struct pci_dev *dev,
                                int irq,
-                               struct msi_desc **desc)
+                               struct msi_desc **desc,
+                               unsigned int nvec)
 {
     struct msi_desc *entry;
     int pos;
-    unsigned int maxvec, mpos;
+    unsigned int i, maxvec, mpos;
     u16 control, seg = dev->seg;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
@@ -545,27 +562,34 @@ static int msi_capability_init(struct pc
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
     control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
     maxvec = multi_msi_capable(control);
+    if ( nvec > maxvec )
+        return maxvec;
     control &= ~PCI_MSI_FLAGS_QSIZE;
+    multi_msi_enable(control, nvec);
 
     /* MSI Entry Initialization */
     msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
 
-    entry = alloc_msi_entry();
+    entry = alloc_msi_entry(nvec);
     if ( !entry )
         return -ENOMEM;
 
-    entry->msi_attrib.type = PCI_CAP_ID_MSI;
-    entry->msi_attrib.is_64 = is_64bit_address(control);
-    entry->msi_attrib.entry_nr = 0;
-    entry->msi_attrib.maskbit = is_mask_bit_support(control);
-    entry->msi_attrib.masked = 1;
-    entry->msi_attrib.pos = pos;
     mpos = msi_mask_bits_reg(pos, is_64bit_address(control));
-    entry->msi.nvec = 1;
+    for ( i = 0; i < nvec; ++i )
+    {
+        entry[i].msi_attrib.type = PCI_CAP_ID_MSI;
+        entry[i].msi_attrib.is_64 = is_64bit_address(control);
+        entry[i].msi_attrib.entry_nr = i;
+        entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
+        entry[i].msi_attrib.masked = 1;
+        entry[i].msi_attrib.pos = pos;
+        if ( entry[i].msi_attrib.maskbit )
+            entry[i].msi.mpos = mpos;
+        entry[i].msi.nvec = 0;
+        entry[i].dev = dev;
+    }
+    entry->msi.nvec = nvec;
     entry->irq = irq;
-    if ( is_mask_bit_support(control) )
-        entry->msi.mpos = mpos;
-    entry->dev = dev;
     if ( entry->msi_attrib.maskbit )
     {
         u32 maskbits;
@@ -693,7 +717,7 @@ static int msix_capability_init(struct p
 
     if ( desc )
     {
-        entry = alloc_msi_entry();
+        entry = alloc_msi_entry(1);
         if ( !entry )
             return -ENOMEM;
         ASSERT(msi);
@@ -851,7 +875,6 @@ static int msix_capability_init(struct p
 
 static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
 {
-    int status;
     struct pci_dev *pdev;
     struct msi_desc *old_desc;
 
@@ -880,8 +903,7 @@ static int __pci_enable_msi(struct msi_i
         pci_disable_msi(old_desc);
     }
 
-    status = msi_capability_init(pdev, msi->irq, desc);
-    return status;
+    return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
 }
 
 static void __pci_disable_msi(struct msi_desc *entry)
@@ -1101,6 +1123,8 @@ int pci_restore_msi_state(struct pci_dev
 
     list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
     {
+        unsigned int i = 0, nr = 1;
+
         irq = entry->irq;
         desc = &irq_desc[irq];
 
@@ -1110,30 +1134,58 @@ int pci_restore_msi_state(struct pci_dev
 
         if (desc->msi_desc != entry)
         {
+    bogus:
             dprintk(XENLOG_ERR,
-                    "Restore MSI for dev %04x:%02x:%02x:%x not set before?\n",
+                    "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n",
                     pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
-                    PCI_FUNC(pdev->devfn));
+                    PCI_FUNC(pdev->devfn), i);
             spin_unlock_irqrestore(&desc->lock, flags);
             return -EINVAL;
         }
 
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+        {
             msi_set_enable(pdev, 0);
+            nr = entry->msi.nvec;
+        }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
             msix_set_enable(pdev, 0);
 
         msg = entry->msg;
         write_msi_msg(entry, &msg);
 
-        msi_set_mask_bit(desc, entry->msi_attrib.masked);
+        for ( i = 0; ; )
+        {
+            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
+            spin_unlock_irqrestore(&desc->lock, flags);
+
+            if ( !--nr )
+                break;
+
+            desc = &irq_desc[entry[++i].irq];
+            spin_lock_irqsave(&desc->lock, flags);
+            if ( desc->msi_desc != entry + i )
+                goto bogus;
+        }
+
+        spin_unlock_irqrestore(&desc->lock, flags);
 
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+        {
+            unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
+            u16 control = pci_conf_read16(pdev->seg, pdev->bus,
+                                          PCI_SLOT(pdev->devfn),
+                                          PCI_FUNC(pdev->devfn), cpos);
+
+            control &= ~PCI_MSI_FLAGS_QSIZE;
+            multi_msi_enable(control, entry->msi.nvec);
+            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                             PCI_FUNC(pdev->devfn), cpos, control);
+
             msi_set_enable(pdev, 1);
+        }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
             msix_set_enable(pdev, 1);
-
-        spin_unlock_irqrestore(&desc->lock, flags);
     }
 
     return 0;
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int 
         break;
 
     case MAP_PIRQ_TYPE_MSI:
+        if ( !msi->table_base )
+            msi->entry_nr = 1;
         irq = *index;
         if ( irq == -1 )
+    case MAP_PIRQ_TYPE_MULTI_MSI:
             irq = create_irq(NUMA_NO_NODE);
 
         if ( irq < nr_irqs_gsi || irq >= nr_irqs )
@@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int 
                 goto done;
             }
         }
+        else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
+        {
+            if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
+                ret = -EDOM;
+            else if ( msi->entry_nr != 1 && !iommu_intremap )
+                ret = -EOPNOTSUPP;
+            else
+            {
+                while ( msi->entry_nr & (msi->entry_nr - 1) )
+                    msi->entry_nr += msi->entry_nr & -msi->entry_nr;
+                pirq = get_free_pirqs(d, msi->entry_nr);
+                if ( pirq < 0 )
+                {
+                    while ( (msi->entry_nr >>= 1) > 1 )
+                        if ( get_free_pirqs(d, msi->entry_nr) > 0 )
+                            break;
+                    dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n",
+                            d->domain_id, msi->entry_nr << 1);
+                    ret = pirq;
+                }
+            }
+            if ( ret < 0 )
+                goto done;
+        }
         else
         {
             pirq = get_free_pirq(d, type);
@@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
  done:
     spin_unlock(&d->event_lock);
     spin_unlock(&pcidevs_lock);
-    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
-        destroy_irq(irq);
+    if ( ret != 0 )
+        switch ( type )
+        {
+        case MAP_PIRQ_TYPE_MSI:
+            if ( *index == -1 )
+        case MAP_PIRQ_TYPE_MULTI_MSI:
+                destroy_irq(irq);
+            break;
+        }
  free_domain:
     rcu_unlock_domain(d);
     return ret;
@@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( copy_from_guest(&map, arg, 1) != 0 )
             break;
 
-        if ( map.type == MAP_PIRQ_TYPE_MSI_SEG )
+        switch ( map.type )
         {
+        case MAP_PIRQ_TYPE_MSI_SEG:
             map.type = MAP_PIRQ_TYPE_MSI;
             msi.seg = map.bus >> 16;
-        }
-        else
-        {
+            break;
+
+        case MAP_PIRQ_TYPE_MULTI_MSI:
+            if ( map.table_base )
+                return -EINVAL;
+            msi.seg = map.bus >> 16;
+            break;
+
+        default:
             msi.seg = 0;
+            break;
         }
         msi.bus = map.bus;
         msi.devfn = map.devfn;
@@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq,
                                &msi);
 
+        if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI )
+            map.entry_nr = msi.entry_nr;
         if ( __copy_to_guest(arg, &map, 1) )
             ret = -EFAULT;
         break;
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in
                            void *data);
 int unmap_domain_pirq(struct domain *d, int pirq);
 int get_free_pirq(struct domain *d, int type);
+int get_free_pirqs(struct domain *, unsigned int nr);
 void free_domain_pirqs(struct domain *d);
 int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq);
 int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -148,7 +148,7 @@ int msi_free_irq(struct msi_desc *entry)
 #define multi_msi_capable(control) \
 	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
 #define multi_msi_enable(control, num) \
-	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
 #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
 #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 #define msi_enable(control, num) multi_msi_enable(control, num); \
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
 #define MAP_PIRQ_TYPE_GSI               0x1
 #define MAP_PIRQ_TYPE_UNKNOWN           0x2
 #define MAP_PIRQ_TYPE_MSI_SEG           0x3
+#define MAP_PIRQ_TYPE_MULTI_MSI         0x4
 
 #define PHYSDEVOP_map_pirq               13
 struct physdev_map_pirq {
     domid_t domid;
     /* IN */
     int type;
-    /* IN */
+    /* IN (ignored for ..._MULTI_MSI) */
     int index;
     /* IN or OUT */
     int pirq;
-    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
+    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
     int bus;
     /* IN */
     int devfn;
-    /* IN */
+    /* IN (also OUT for ..._MULTI_MSI) */
     int entry_nr;
     /* IN */
     uint64_t table_base;



[-- Attachment #2: x86-multi-vector-MSI.patch --]
[-- Type: text/plain, Size: 23271 bytes --]

x86: enable multi-vector MSI

This implies
- extending the public interface to have a way to request a block of
  MSIs
- allocating a block of contiguous pIRQ-s for the target domain (but
  note that the Xen IRQs allocated have no need of being contiguous)
- repeating certain operations for all involved IRQs
- fixing multi_msi_enable()
- adjusting the mask bit accesses for maskable MSIs

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1863,6 +1863,25 @@ int get_free_pirq(struct domain *d, int 
     return -ENOSPC;
 }
 
+int get_free_pirqs(struct domain *d, unsigned int nr)
+{
+    unsigned int i, found = 0;
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
+        if ( is_free_pirq(d, pirq_info(d, i)) )
+        {
+            pirq_get_info(d, i);
+            if ( ++found == nr )
+                return i;
+        }
+        else
+            found = 0;
+
+    return -ENOSPC;
+}
+
 int map_domain_pirq(
     struct domain *d, int pirq, int irq, int type, void *data)
 {
@@ -1918,11 +1937,12 @@ int map_domain_pirq(
 
     desc = irq_to_desc(irq);
 
-    if ( type == MAP_PIRQ_TYPE_MSI )
+    if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI )
     {
         struct msi_info *msi = (struct msi_info *)data;
         struct msi_desc *msi_desc;
         struct pci_dev *pdev;
+        unsigned int nr = 0;
 
         ASSERT(spin_is_locked(&pcidevs_lock));
 
@@ -1933,7 +1953,14 @@ int map_domain_pirq(
         pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
         ret = pci_enable_msi(msi, &msi_desc);
         if ( ret )
+        {
+            if ( ret > 0 )
+            {
+                msi->entry_nr = ret;
+                ret = -ENFILE;
+            }
             goto done;
+        }
 
         spin_lock_irqsave(&desc->lock, flags);
 
@@ -1947,25 +1974,73 @@ int map_domain_pirq(
             goto done;
         }
 
-        ret = setup_msi_irq(desc, msi_desc);
-        if ( ret )
+        while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) )
         {
+            if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV &&
+                 !desc->arch.used_vectors )
+            {
+                desc->arch.used_vectors = &pdev->arch.used_vectors;
+                if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
+                {
+                    int vector = desc->arch.vector;
+
+                    ASSERT(!test_bit(vector, desc->arch.used_vectors));
+                    set_bit(vector, desc->arch.used_vectors);
+                }
+            }
+            if ( type == MAP_PIRQ_TYPE_MSI ||
+                 msi_desc->msi_attrib.type != PCI_CAP_ID_MSI ||
+                 ++nr == msi->entry_nr )
+                break;
+
+            set_domain_irq_pirq(d, irq, info);
             spin_unlock_irqrestore(&desc->lock, flags);
-            pci_disable_msi(msi_desc);
-            goto done;
+
+            info = NULL;
+            irq = create_irq(NUMA_NO_NODE);
+            ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info)
+                           : irq;
+            if ( ret )
+                break;
+            msi_desc[nr].irq = irq;
+
+            if ( irq_permit_access(d, irq) != 0 )
+                printk(XENLOG_G_WARNING
+                       "dom%d: could not permit access to IRQ%d (pirq %d)\n",
+                       d->domain_id, irq, pirq);
+
+            desc = irq_to_desc(irq);
+            spin_lock_irqsave(&desc->lock, flags);
+
+            if ( desc->handler != &no_irq_type )
+            {
+                dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n",
+                        d->domain_id, irq, pirq + nr, desc->handler->typename);
+                ret = -EBUSY;
+                break;
+            }
         }
 
-        if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV
-             && !desc->arch.used_vectors )
+        if ( ret )
         {
-            desc->arch.used_vectors = &pdev->arch.used_vectors;
-            if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
+            spin_unlock_irqrestore(&desc->lock, flags);
+            while ( nr-- )
             {
-                int vector = desc->arch.vector;
-                ASSERT(!test_bit(vector, desc->arch.used_vectors));
-
-                set_bit(vector, desc->arch.used_vectors);
+                if ( irq >= 0 )
+                {
+                    if ( irq_deny_access(d, irq) )
+                        printk(XENLOG_G_ERR
+                               "dom%d: could not revoke access to IRQ%d (pirq %d)\n",
+                               d->domain_id, irq, pirq);
+                    destroy_irq(irq);
+                }
+                if ( info )
+                    cleanup_domain_irq_pirq(d, irq, info);
+                info = pirq_info(d, pirq + nr);
+                irq = info->arch.irq;
             }
+            pci_disable_msi(msi_desc);
+            goto done;
         }
 
         set_domain_irq_pirq(d, irq, info);
@@ -1996,7 +2071,8 @@ int unmap_domain_pirq(struct domain *d, 
 {
     unsigned long flags;
     struct irq_desc *desc;
-    int irq, ret = 0;
+    int irq, ret = 0, rc;
+    unsigned int i, nr = 1;
     bool_t forced_unbind;
     struct pirq *info;
     struct msi_desc *msi_desc = NULL;
@@ -2018,6 +2094,18 @@ int unmap_domain_pirq(struct domain *d, 
 
     desc = irq_to_desc(irq);
     msi_desc = desc->msi_desc;
+    if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
+    {
+        if ( msi_desc->msi_attrib.entry_nr )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: trying to unmap secondary MSI pirq %d\n",
+                   d->domain_id, pirq);
+            ret = -EBUSY;
+            goto done;
+        }
+        nr = msi_desc->msi.nvec;
+    }
 
     ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
     if ( ret )
@@ -2033,37 +2121,83 @@ int unmap_domain_pirq(struct domain *d, 
 
     spin_lock_irqsave(&desc->lock, flags);
 
-    BUG_ON(irq != domain_pirq_to_irq(d, pirq));
-
-    if ( !forced_unbind )
-        clear_domain_irq_pirq(d, irq, info);
-    else
+    for ( i = 0; ; )
     {
-        info->arch.irq = -irq;
-        radix_tree_replace_slot(
-            radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
-            radix_tree_int_to_ptr(-pirq));
+        BUG_ON(irq != domain_pirq_to_irq(d, pirq + i));
+
+        if ( !forced_unbind )
+            clear_domain_irq_pirq(d, irq, info);
+        else
+        {
+            info->arch.irq = -irq;
+            radix_tree_replace_slot(
+                radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
+                radix_tree_int_to_ptr(-pirq));
+        }
+
+        if ( msi_desc )
+        {
+            desc->handler = &no_irq_type;
+            desc->msi_desc = NULL;
+        }
+
+        if ( ++i == nr )
+            break;
+
+        spin_unlock_irqrestore(&desc->lock, flags);
+
+        if ( !forced_unbind )
+           cleanup_domain_irq_pirq(d, irq, info);
+
+        rc = irq_deny_access(d, irq);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
+                   d->domain_id, irq, pirq + i);
+            ret = rc;
+        }
+
+        do {
+            info = pirq_info(d, pirq + i);
+            if ( info && (irq = info->arch.irq) > 0 )
+                break;
+            printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n",
+                   d->domain_id, pirq + i);
+        } while ( ++i < nr );
+
+        if ( i == nr )
+        {
+            desc = NULL;
+            break;
+        }
+
+        desc = irq_to_desc(irq);
+        BUG_ON(desc->msi_desc != msi_desc + i);
+
+        spin_lock_irqsave(&desc->lock, flags);
     }
 
-    if ( msi_desc )
+    if ( desc )
     {
-        desc->handler = &no_irq_type;
-        desc->msi_desc = NULL;
+        spin_unlock_irqrestore(&desc->lock, flags);
+
+        if ( !forced_unbind )
+            cleanup_domain_irq_pirq(d, irq, info);
+
+        rc = irq_deny_access(d, irq);
+        if ( rc )
+        {
+            printk(XENLOG_G_ERR
+                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
+                   d->domain_id, irq, pirq + nr - 1);
+            ret = rc;
+        }
     }
 
-    spin_unlock_irqrestore(&desc->lock, flags);
     if (msi_desc)
         msi_free_irq(msi_desc);
 
-    if ( !forced_unbind )
-        cleanup_domain_irq_pirq(d, irq, info);
-
-    ret = irq_deny_access(d, irq);
-    if ( ret )
-        printk(XENLOG_G_ERR
-               "dom%d: could not deny access to IRQ%d (pirq %d)\n",
-               d->domain_id, irq, pirq);
-
  done:
     return ret;
 }
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -236,6 +236,11 @@ static int write_msi_msg(struct msi_desc
         u8 bus = dev->bus;
         u8 slot = PCI_SLOT(dev->devfn);
         u8 func = PCI_FUNC(dev->devfn);
+        int nr = entry->msi_attrib.entry_nr;
+
+        ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
+        if ( nr )
+            return 0;
 
         pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos),
                          msg->address_lo);
@@ -359,8 +364,8 @@ static void msi_set_mask_bit(struct irq_
             u8 func = PCI_FUNC(entry->dev->devfn);
 
             mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
-            mask_bits &= ~(1);
-            mask_bits |= flag;
+            mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
+            mask_bits |= (u32)flag << entry->msi_attrib.entry_nr;
             pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits);
         }
         break;
@@ -384,10 +389,11 @@ static int msi_get_mask_bit(const struct
     case PCI_CAP_ID_MSI:
         if (!entry->dev || !entry->msi_attrib.maskbit)
             break;
-        return pci_conf_read32(entry->dev->seg, entry->dev->bus,
-                               PCI_SLOT(entry->dev->devfn),
-                               PCI_FUNC(entry->dev->devfn),
-                               entry->msi.mpos) & 1;
+        return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
+                                PCI_SLOT(entry->dev->devfn),
+                                PCI_FUNC(entry->dev->devfn),
+                                entry->msi.mpos) >>
+                entry->msi_attrib.entry_nr) & 1;
     case PCI_CAP_ID_MSIX:
         return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
     }
@@ -453,17 +459,20 @@ static hw_irq_controller pci_msi_nonmask
     .set_affinity = set_msi_affinity
 };
 
-static struct msi_desc* alloc_msi_entry(void)
+static struct msi_desc *alloc_msi_entry(unsigned int nr)
 {
     struct msi_desc *entry;
 
-    entry = xmalloc(struct msi_desc);
+    entry = xmalloc_array(struct msi_desc, nr);
     if ( !entry )
         return NULL;
 
     INIT_LIST_HEAD(&entry->list);
-    entry->dev = NULL;
-    entry->remap_index = -1;
+    while ( nr-- )
+    {
+        entry[nr].dev = NULL;
+        entry[nr].remap_index = -1;
+    }
 
     return entry;
 }
@@ -488,17 +497,24 @@ int __setup_msi_irq(struct irq_desc *des
 
 int msi_free_irq(struct msi_desc *entry)
 {
-    destroy_irq(entry->irq);
+    unsigned int nr = entry->msi.nvec;
+
     if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
     {
         unsigned long start;
         start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
         msix_put_fixmap(entry->dev, virt_to_fix(start));
+        nr = 1;
     }
 
-    /* Free the unused IRTE if intr remap enabled */
-    if ( iommu_intremap )
-        iommu_update_ire_from_msi(entry, NULL);
+    while ( nr-- )
+    {
+        destroy_irq(entry[nr].irq);
+
+        /* Free the unused IRTE if intr remap enabled */
+        if ( iommu_intremap )
+            iommu_update_ire_from_msi(entry + nr, NULL);
+    }
 
     list_del(&entry->list);
     xfree(entry);
@@ -531,11 +547,12 @@ static struct msi_desc *find_msi_entry(s
  **/
 static int msi_capability_init(struct pci_dev *dev,
                                int irq,
-                               struct msi_desc **desc)
+                               struct msi_desc **desc,
+                               unsigned int nvec)
 {
     struct msi_desc *entry;
     int pos;
-    unsigned int maxvec, mpos;
+    unsigned int i, maxvec, mpos;
     u16 control, seg = dev->seg;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
@@ -545,27 +562,34 @@ static int msi_capability_init(struct pc
     pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
     control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
     maxvec = multi_msi_capable(control);
+    if ( nvec > maxvec )
+        return maxvec;
     control &= ~PCI_MSI_FLAGS_QSIZE;
+    multi_msi_enable(control, nvec);
 
     /* MSI Entry Initialization */
     msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
 
-    entry = alloc_msi_entry();
+    entry = alloc_msi_entry(nvec);
     if ( !entry )
         return -ENOMEM;
 
-    entry->msi_attrib.type = PCI_CAP_ID_MSI;
-    entry->msi_attrib.is_64 = is_64bit_address(control);
-    entry->msi_attrib.entry_nr = 0;
-    entry->msi_attrib.maskbit = is_mask_bit_support(control);
-    entry->msi_attrib.masked = 1;
-    entry->msi_attrib.pos = pos;
     mpos = msi_mask_bits_reg(pos, is_64bit_address(control));
-    entry->msi.nvec = 1;
+    for ( i = 0; i < nvec; ++i )
+    {
+        entry[i].msi_attrib.type = PCI_CAP_ID_MSI;
+        entry[i].msi_attrib.is_64 = is_64bit_address(control);
+        entry[i].msi_attrib.entry_nr = i;
+        entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
+        entry[i].msi_attrib.masked = 1;
+        entry[i].msi_attrib.pos = pos;
+        if ( entry[i].msi_attrib.maskbit )
+            entry[i].msi.mpos = mpos;
+        entry[i].msi.nvec = 0;
+        entry[i].dev = dev;
+    }
+    entry->msi.nvec = nvec;
     entry->irq = irq;
-    if ( is_mask_bit_support(control) )
-        entry->msi.mpos = mpos;
-    entry->dev = dev;
     if ( entry->msi_attrib.maskbit )
     {
         u32 maskbits;
@@ -693,7 +717,7 @@ static int msix_capability_init(struct p
 
     if ( desc )
     {
-        entry = alloc_msi_entry();
+        entry = alloc_msi_entry(1);
         if ( !entry )
             return -ENOMEM;
         ASSERT(msi);
@@ -851,7 +875,6 @@ static int msix_capability_init(struct p
 
 static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
 {
-    int status;
     struct pci_dev *pdev;
     struct msi_desc *old_desc;
 
@@ -880,8 +903,7 @@ static int __pci_enable_msi(struct msi_i
         pci_disable_msi(old_desc);
     }
 
-    status = msi_capability_init(pdev, msi->irq, desc);
-    return status;
+    return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
 }
 
 static void __pci_disable_msi(struct msi_desc *entry)
@@ -1101,6 +1123,8 @@ int pci_restore_msi_state(struct pci_dev
 
     list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
     {
+        unsigned int i = 0, nr = 1;
+
         irq = entry->irq;
         desc = &irq_desc[irq];
 
@@ -1110,30 +1134,58 @@ int pci_restore_msi_state(struct pci_dev
 
         if (desc->msi_desc != entry)
         {
+    bogus:
             dprintk(XENLOG_ERR,
-                    "Restore MSI for dev %04x:%02x:%02x:%x not set before?\n",
+                    "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n",
                     pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
-                    PCI_FUNC(pdev->devfn));
+                    PCI_FUNC(pdev->devfn), i);
             spin_unlock_irqrestore(&desc->lock, flags);
             return -EINVAL;
         }
 
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+        {
             msi_set_enable(pdev, 0);
+            nr = entry->msi.nvec;
+        }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
             msix_set_enable(pdev, 0);
 
         msg = entry->msg;
         write_msi_msg(entry, &msg);
 
-        msi_set_mask_bit(desc, entry->msi_attrib.masked);
+        for ( i = 0; ; )
+        {
+            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
+            spin_unlock_irqrestore(&desc->lock, flags);
+
+            if ( !--nr )
+                break;
+
+            desc = &irq_desc[entry[++i].irq];
+            spin_lock_irqsave(&desc->lock, flags);
+            if ( desc->msi_desc != entry + i )
+                goto bogus;
+        }
+
+        spin_unlock_irqrestore(&desc->lock, flags);
 
         if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
+        {
+            unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
+            u16 control = pci_conf_read16(pdev->seg, pdev->bus,
+                                          PCI_SLOT(pdev->devfn),
+                                          PCI_FUNC(pdev->devfn), cpos);
+
+            control &= ~PCI_MSI_FLAGS_QSIZE;
+            multi_msi_enable(control, entry->msi.nvec);
+            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                             PCI_FUNC(pdev->devfn), cpos, control);
+
             msi_set_enable(pdev, 1);
+        }
         else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
             msix_set_enable(pdev, 1);
-
-        spin_unlock_irqrestore(&desc->lock, flags);
     }
 
     return 0;
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int 
         break;
 
     case MAP_PIRQ_TYPE_MSI:
+        if ( !msi->table_base )
+            msi->entry_nr = 1;
         irq = *index;
         if ( irq == -1 )
+    case MAP_PIRQ_TYPE_MULTI_MSI:
             irq = create_irq(NUMA_NO_NODE);
 
         if ( irq < nr_irqs_gsi || irq >= nr_irqs )
@@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int 
                 goto done;
             }
         }
+        else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
+        {
+            if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
+                ret = -EDOM;
+            else if ( msi->entry_nr != 1 && !iommu_intremap )
+                ret = -EOPNOTSUPP;
+            else
+            {
+                while ( msi->entry_nr & (msi->entry_nr - 1) )
+                    msi->entry_nr += msi->entry_nr & -msi->entry_nr;
+                pirq = get_free_pirqs(d, msi->entry_nr);
+                if ( pirq < 0 )
+                {
+                    while ( (msi->entry_nr >>= 1) > 1 )
+                        if ( get_free_pirqs(d, msi->entry_nr) > 0 )
+                            break;
+                    dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n",
+                            d->domain_id, msi->entry_nr << 1);
+                    ret = pirq;
+                }
+            }
+            if ( ret < 0 )
+                goto done;
+        }
         else
         {
             pirq = get_free_pirq(d, type);
@@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
  done:
     spin_unlock(&d->event_lock);
     spin_unlock(&pcidevs_lock);
-    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
-        destroy_irq(irq);
+    if ( ret != 0 )
+        switch ( type )
+        {
+        case MAP_PIRQ_TYPE_MSI:
+            if ( *index == -1 )
+        case MAP_PIRQ_TYPE_MULTI_MSI:
+                destroy_irq(irq);
+            break;
+        }
  free_domain:
     rcu_unlock_domain(d);
     return ret;
@@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( copy_from_guest(&map, arg, 1) != 0 )
             break;
 
-        if ( map.type == MAP_PIRQ_TYPE_MSI_SEG )
+        switch ( map.type )
         {
+        case MAP_PIRQ_TYPE_MSI_SEG:
             map.type = MAP_PIRQ_TYPE_MSI;
             msi.seg = map.bus >> 16;
-        }
-        else
-        {
+            break;
+
+        case MAP_PIRQ_TYPE_MULTI_MSI:
+            if ( map.table_base )
+                return -EINVAL;
+            msi.seg = map.bus >> 16;
+            break;
+
+        default:
             msi.seg = 0;
+            break;
         }
         msi.bus = map.bus;
         msi.devfn = map.devfn;
@@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq,
                                &msi);
 
+        if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI )
+            map.entry_nr = msi.entry_nr;
         if ( __copy_to_guest(arg, &map, 1) )
             ret = -EFAULT;
         break;
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in
                            void *data);
 int unmap_domain_pirq(struct domain *d, int pirq);
 int get_free_pirq(struct domain *d, int type);
+int get_free_pirqs(struct domain *, unsigned int nr);
 void free_domain_pirqs(struct domain *d);
 int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq);
 int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -148,7 +148,7 @@ int msi_free_irq(struct msi_desc *entry)
 #define multi_msi_capable(control) \
 	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
 #define multi_msi_enable(control, num) \
-	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
 #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
 #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 #define msi_enable(control, num) multi_msi_enable(control, num); \
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
 #define MAP_PIRQ_TYPE_GSI               0x1
 #define MAP_PIRQ_TYPE_UNKNOWN           0x2
 #define MAP_PIRQ_TYPE_MSI_SEG           0x3
+#define MAP_PIRQ_TYPE_MULTI_MSI         0x4
 
 #define PHYSDEVOP_map_pirq               13
 struct physdev_map_pirq {
     domid_t domid;
     /* IN */
     int type;
-    /* IN */
+    /* IN (ignored for ..._MULTI_MSI) */
     int index;
     /* IN or OUT */
     int pirq;
-    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
+    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
     int bus;
     /* IN */
     int devfn;
-    /* IN */
+    /* IN (also OUT for ..._MULTI_MSI) */
     int entry_nr;
     /* IN */
     uint64_t table_base;

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 10:00 [PATCH resend 0/3] x86/IOMMU: multi-vector MSI Jan Beulich
  2013-07-16 10:13 ` [PATCH resend 1/3] VT-d: enable for " Jan Beulich
  2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
@ 2013-07-16 10:15 ` Jan Beulich
  2013-07-16 11:19   ` Andrew Cooper
                     ` (2 more replies)
  2 siblings, 3 replies; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 10:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser, xiantao.zhang

[-- Attachment #1: Type: text/plain, Size: 640 bytes --]

The requested vector count is to be passed in struct xen_pci_op's info
field. Upon failure, if a smaller vector count might work, the backend
will pass that smaller count in the value field (which so far has always
been set to zero in the error path).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/include/public/io/pciif.h
+++ b/xen/include/public/io/pciif.h
@@ -46,6 +46,7 @@
 #define XEN_PCI_OP_aer_resume		(7)
 #define XEN_PCI_OP_aer_mmio		(8)
 #define XEN_PCI_OP_aer_slotreset	(9)
+#define XEN_PCI_OP_enable_multi_msi	(10)
 
 /* xen_pci_op error numbers */
 #define XEN_PCI_ERR_success          (0)




[-- Attachment #2: pciif-multi-vector-MSI.patch --]
[-- Type: text/plain, Size: 673 bytes --]

pciif: add multi-vector-MSI command

The requested vector count is to be passed in struct xen_pci_op's info
field. Upon failure, if a smaller vector count might work, the backend
will pass that smaller count in the value field (which so far has always
been set to zero in the error path).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/include/public/io/pciif.h
+++ b/xen/include/public/io/pciif.h
@@ -46,6 +46,7 @@
 #define XEN_PCI_OP_aer_resume		(7)
 #define XEN_PCI_OP_aer_mmio		(8)
 #define XEN_PCI_OP_aer_slotreset	(9)
+#define XEN_PCI_OP_enable_multi_msi	(10)
 
 /* xen_pci_op error numbers */
 #define XEN_PCI_ERR_success          (0)

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 1/3] VT-d: enable for multi-vector MSI
  2013-07-16 10:13 ` [PATCH resend 1/3] VT-d: enable for " Jan Beulich
@ 2013-07-16 11:15   ` Andrew Cooper
  2013-07-16 11:32     ` Jan Beulich
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2013-07-16 11:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 7097 bytes --]

On 16/07/13 11:13, Jan Beulich wrote:
> The main change being to make alloc_remap_entry() capable of allocating
> a block of entries.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/drivers/passthrough/vtd/intremap.c
> +++ b/xen/drivers/passthrough/vtd/intremap.c
> @@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
>  }
>  
>  /*
> - * Look for a free intr remap entry.
> + * Look for a free intr remap entry (or a contiguous set thereof).
>   * Need hold iremap_lock, and setup returned entry before releasing lock.
>   */
> -static int alloc_remap_entry(struct iommu *iommu)
> +static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr)

alloc_remap_entries() now that it unconditionally takes a count (and you
already have to patch all callsites)



>  {
>      struct iremap_entry *iremap_entries = NULL;
>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
> -    int i;
> +    unsigned int i, found;
>  
>      ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) );
>  
> -    for ( i = 0; i < IREMAP_ENTRY_NR; i++ )
> +    for ( found = i = 0; i < IREMAP_ENTRY_NR; i++ )
>      {
>          struct iremap_entry *p;
>          if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 )
> @@ -220,7 +220,9 @@ static int alloc_remap_entry(struct iomm
>          else
>              p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)];
>  
> -        if ( p->lo_val == 0 && p->hi_val == 0 ) /* a free entry */
> +        if ( p->lo_val || p->hi_val ) /* not a free entry */
> +            found = 0;
> +        else if ( ++found == nr )
>              break;
>      }
>  
> @@ -228,7 +230,7 @@ static int alloc_remap_entry(struct iomm
>          unmap_vtd_domain_page(iremap_entries);
>  
>      if ( i < IREMAP_ENTRY_NR ) 
> -        ir_ctrl->iremap_num++;
> +        ir_ctrl->iremap_num += nr;
>      return i;
>  }
>  
> @@ -293,7 +295,7 @@ static int ioapic_rte_to_remap_entry(str
>      index = apic_pin_2_ir_idx[apic][ioapic_pin];
>      if ( index < 0 )
>      {
> -        index = alloc_remap_entry(iommu);
> +        index = alloc_remap_entry(iommu, 1);
>          if ( index < IREMAP_ENTRY_NR )
>              apic_pin_2_ir_idx[apic][ioapic_pin] = index;
>      }
> @@ -485,19 +487,18 @@ static void set_msi_source_id(struct pci
>  }
>  
>  static int remap_entry_to_msi_msg(
> -    struct iommu *iommu, struct msi_msg *msg)
> +    struct iommu *iommu, struct msi_msg *msg, unsigned int index)
>  {
>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
>      struct msi_msg_remap_entry *remap_rte;
> -    int index;
>      unsigned long flags;
>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
>  
>      remap_rte = (struct msi_msg_remap_entry *) msg;
> -    index = (remap_rte->address_lo.index_15 << 15) |
> +    index += (remap_rte->address_lo.index_15 << 15) |
>               remap_rte->address_lo.index_0_14;
>  
> -    if ( index < 0 || index > IREMAP_ENTRY_NR - 1 )
> +    if ( index >= IREMAP_ENTRY_NR )
>      {
>          dprintk(XENLOG_ERR VTDPREFIX,
>                  "%s: index (%d) for remap table is invalid !\n",
> @@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
>      struct iremap_entry new_ire;
>      struct msi_msg_remap_entry *remap_rte;
> -    int index;
> +    unsigned int index, i, nr = 1;

Does this hardcoding of nr=1 defeat the purpose of the following logic?

~Andrew

>      unsigned long flags;
>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
>  
> -    remap_rte = (struct msi_msg_remap_entry *) msg;
> +    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
> +        nr = msi_desc->msi.nvec;
> +
>      spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
>  
>      if ( msg == NULL )
>      {
> -        /* Free specified unused IRTE */
> -        free_remap_entry(iommu, msi_desc->remap_index);
> +        /* Free specified unused IRTEs */
> +        for ( i = 0; i < nr; ++i )
> +            free_remap_entry(iommu, msi_desc->remap_index + i);
>          spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
>          return 0;
>      }
>  
>      if ( msi_desc->remap_index < 0 )
>      {
> -        /*
> -         * TODO: Multiple-vector MSI requires allocating multiple continuous
> -         * entries and configuring addr/data of msi_msg in different way. So
> -         * alloca_remap_entry will be changed if enabling multiple-vector MSI
> -         * in future.
> -         */
> -        index = alloc_remap_entry(iommu);
> -        msi_desc->remap_index = index;
> +        index = alloc_remap_entry(iommu, nr);
> +        for ( i = 0; i < nr; ++i )
> +            msi_desc[i].remap_index = index + i;
>      }
>      else
>          index = msi_desc->remap_index;
> @@ -590,7 +589,8 @@ static int msi_msg_to_remap_entry(
>                  "%s: intremap index (%d) is larger than"
>                  " the maximum index (%d)!\n",
>                  __func__, index, IREMAP_ENTRY_NR - 1);
> -        msi_desc->remap_index = -1;
> +        for ( i = 0; i < nr; ++i )
> +            msi_desc[i].remap_index = -1;
>          spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags);
>          return -EFAULT;
>      }
> @@ -626,14 +626,18 @@ static int msi_msg_to_remap_entry(
>      new_ire.lo.p = 1;    /* finally, set present bit */
>  
>      /* now construct new MSI/MSI-X rte entry */
> +    remap_rte = (struct msi_msg_remap_entry *)msg;
>      remap_rte->address_lo.dontcare = 0;
> -    remap_rte->address_lo.index_15 = (index >> 15) & 0x1;
> -    remap_rte->address_lo.index_0_14 = index & 0x7fff;
> +    i = index;
> +    if ( !nr )
> +        i -= msi_desc->msi_attrib.entry_nr;
> +    remap_rte->address_lo.index_15 = (i >> 15) & 0x1;
> +    remap_rte->address_lo.index_0_14 = i & 0x7fff;
>      remap_rte->address_lo.SHV = 1;
>      remap_rte->address_lo.format = 1;
>  
>      remap_rte->address_hi = 0;
> -    remap_rte->data = 0;
> +    remap_rte->data = index - i;
>  
>      memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
>      iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry));
> @@ -654,7 +658,9 @@ void msi_msg_read_remap_rte(
>      drhd = pdev ? acpi_find_matched_drhd_unit(pdev)
>                  : hpet_to_drhd(msi_desc->hpet_id);
>      if ( drhd )
> -        remap_entry_to_msi_msg(drhd->iommu, msg);
> +        remap_entry_to_msi_msg(drhd->iommu, msg,
> +                               msi_desc->msi_attrib.type == PCI_CAP_ID_MSI
> +                               ? msi_desc->msi_attrib.entry_nr : 0);
>  }
>  
>  int msi_msg_write_remap_rte(
> @@ -680,7 +686,7 @@ int __init intel_setup_hpet_msi(struct m
>          return 0;
>  
>      spin_lock_irqsave(&ir_ctrl->iremap_lock, flags);
> -    msi_desc->remap_index = alloc_remap_entry(iommu);
> +    msi_desc->remap_index = alloc_remap_entry(iommu, 1);
>      if ( msi_desc->remap_index >= IREMAP_ENTRY_NR )
>      {
>          dprintk(XENLOG_ERR VTDPREFIX,
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 8037 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
@ 2013-07-16 11:19   ` Andrew Cooper
  2013-07-16 11:35     ` Jan Beulich
  2013-08-05 13:12   ` Ping: " Jan Beulich
  2013-08-08  9:02   ` Keir Fraser
  2 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2013-07-16 11:19 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 1074 bytes --]

On 16/07/13 11:15, Jan Beulich wrote:
> The requested vector count is to be passed in struct xen_pci_op's info
> field. Upon failure, if a smaller vector count might work, the backend
> will pass that smaller count in the value field (which so far is always
> being set to zero in the error path).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/include/public/io/pciif.h
> +++ b/xen/include/public/io/pciif.h
> @@ -46,6 +46,7 @@
>  #define XEN_PCI_OP_aer_resume		(7)
>  #define XEN_PCI_OP_aer_mmio		(8)
>  #define XEN_PCI_OP_aer_slotreset	(9)
> +#define XEN_PCI_OP_enable_multi_msi	(10)

/* Be sure to bump this number if you change this file */
#define XEN_PCI_MAGIC "7"

Should you bump this version, or is the comment stale?  The only in-tree
consumer I can find is MiniOS's pcifront, which writes it into xenstore.

~Andrew

>  
>  /* xen_pci_op error numbers */
>  #define XEN_PCI_ERR_success          (0)
>
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 1934 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 1/3] VT-d: enable for multi-vector MSI
  2013-07-16 11:15   ` Andrew Cooper
@ 2013-07-16 11:32     ` Jan Beulich
  2013-07-17  9:50       ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 11:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Keir Fraser, xiantao.zhang, xen-devel

>>> On 16.07.13 at 13:15, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 16/07/13 11:13, Jan Beulich wrote:
>> The main change being to make alloc_remap_entry() capable of allocating
>> a block of entries.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/drivers/passthrough/vtd/intremap.c
>> +++ b/xen/drivers/passthrough/vtd/intremap.c
>> @@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
>>  }
>>  
>>  /*
>> - * Look for a free intr remap entry.
>> + * Look for a free intr remap entry (or a contiguous set thereof).
>>   * Need hold iremap_lock, and setup returned entry before releasing lock.
>>   */
>> -static int alloc_remap_entry(struct iommu *iommu)
>> +static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr)
> 
> alloc_remap_entries() now that it unconditionally takes a count (and you
> already have to patch all callsites)

Actually I checked with Linux, and they use the singular in the
function name too (albeit the name isn't identical).

>> @@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
>>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
>>      struct iremap_entry new_ire;
>>      struct msi_msg_remap_entry *remap_rte;
>> -    int index;
>> +    unsigned int index, i, nr = 1;
> 
> Does this hardcoding of nr=1 defeat the purpose of the following logic?

In what way?

>>      unsigned long flags;
>>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
>>  
>> -    remap_rte = (struct msi_msg_remap_entry *) msg;
>> +    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
>> +        nr = msi_desc->msi.nvec;

The logic here makes the vector count 1 for MSI-X and msi.nvec
for MSI.

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 11:19   ` Andrew Cooper
@ 2013-07-16 11:35     ` Jan Beulich
  2013-07-17  9:02       ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 11:35 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Keir Fraser, xiantao.zhang, xen-devel

>>> On 16.07.13 at 13:19, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 16/07/13 11:15, Jan Beulich wrote:
>> The requested vector count is to be passed in struct xen_pci_op's info
>> field. Upon failure, if a smaller vector count might work, the backend
>> will pass that smaller count in the value field (which so far is always
>> being set to zero in the error path).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/include/public/io/pciif.h
>> +++ b/xen/include/public/io/pciif.h
>> @@ -46,6 +46,7 @@
>>  #define XEN_PCI_OP_aer_resume		(7)
>>  #define XEN_PCI_OP_aer_mmio		(8)
>>  #define XEN_PCI_OP_aer_slotreset	(9)
>> +#define XEN_PCI_OP_enable_multi_msi	(10)
> 
> /* Be sure to bump this number if you change this file */
> #define XEN_PCI_MAGIC "7"
> 
> Should you bump this version, or is the comment stale?  The only in-tree
> consumer I can find is MiniOS's pcifront, which writes it into xenstore.

Whether it's stale I don't know (likely it is considering that you
found just a single consumer), but bumping a revision just
because of the (backwards compatible) addition seems
superfluous to me. This would be different if I changed the
existing enable_msi...

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
@ 2013-07-16 11:36   ` Andrew Cooper
  2013-07-16 11:48     ` Jan Beulich
  2013-08-05 13:11   ` Ping: " Jan Beulich
  2013-08-08  9:02   ` Keir Fraser
  2 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2013-07-16 11:36 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 24573 bytes --]

On 16/07/13 11:14, Jan Beulich wrote:
> This implies
> - extending the public interface to have a way to request a block of
>   MSIs
> - allocating a block of contiguous pIRQ-s for the target domain (but
>   note that the Xen IRQs allocated have no need of being contiguous)
> - repeating certain operations for all involved IRQs
> - fixing multi_msi_enable()
> - adjusting the mask bit accesses for maskable MSIs
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -1863,6 +1863,25 @@ int get_free_pirq(struct domain *d, int 
>      return -ENOSPC;
>  }
>  
> +int get_free_pirqs(struct domain *d, unsigned int nr)
> +{
> +    unsigned int i, found = 0;
> +
> +    ASSERT(spin_is_locked(&d->event_lock));
> +
> +    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
> +        if ( is_free_pirq(d, pirq_info(d, i)) )
> +        {
> +            pirq_get_info(d, i);
> +            if ( ++found == nr )
> +                return i;
> +        }
> +        else
> +            found = 0;
> +
> +    return -ENOSPC;
> +}
> +

Is there any reason why this loop is backwards? Unless I am mistaken,
guests can choose their own pirqs when binding them, reducing the
likelihood that the top of the available space will be free.

>  int map_domain_pirq(
>      struct domain *d, int pirq, int irq, int type, void *data)
>  {
> @@ -1918,11 +1937,12 @@ int map_domain_pirq(
>  
>      desc = irq_to_desc(irq);
>  
> -    if ( type == MAP_PIRQ_TYPE_MSI )
> +    if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI )
>      {
>          struct msi_info *msi = (struct msi_info *)data;
>          struct msi_desc *msi_desc;
>          struct pci_dev *pdev;
> +        unsigned int nr = 0;
>  
>          ASSERT(spin_is_locked(&pcidevs_lock));
>  
> @@ -1933,7 +1953,14 @@ int map_domain_pirq(
>          pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
>          ret = pci_enable_msi(msi, &msi_desc);
>          if ( ret )
> +        {
> +            if ( ret > 0 )
> +            {
> +                msi->entry_nr = ret;
> +                ret = -ENFILE;
> +            }
>              goto done;
> +        }
>  
>          spin_lock_irqsave(&desc->lock, flags);
>  
> @@ -1947,25 +1974,73 @@ int map_domain_pirq(
>              goto done;
>          }
>  
> -        ret = setup_msi_irq(desc, msi_desc);
> -        if ( ret )
> +        while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) )
>          {
> +            if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV &&
> +                 !desc->arch.used_vectors )
> +            {
> +                desc->arch.used_vectors = &pdev->arch.used_vectors;
> +                if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
> +                {
> +                    int vector = desc->arch.vector;
> +
> +                    ASSERT(!test_bit(vector, desc->arch.used_vectors));
> +                    set_bit(vector, desc->arch.used_vectors);
> +                }
> +            }
> +            if ( type == MAP_PIRQ_TYPE_MSI ||
> +                 msi_desc->msi_attrib.type != PCI_CAP_ID_MSI ||
> +                 ++nr == msi->entry_nr )
> +                break;
> +
> +            set_domain_irq_pirq(d, irq, info);
>              spin_unlock_irqrestore(&desc->lock, flags);
> -            pci_disable_msi(msi_desc);
> -            goto done;
> +
> +            info = NULL;
> +            irq = create_irq(NUMA_NO_NODE);
> +            ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info)
> +                           : irq;
> +            if ( ret )
> +                break;
> +            msi_desc[nr].irq = irq;
> +
> +            if ( irq_permit_access(d, irq) != 0 )
> +                printk(XENLOG_G_WARNING
> +                       "dom%d: could not permit access to IRQ%d (pirq %d)\n",
> +                       d->domain_id, irq, pirq);
> +
> +            desc = irq_to_desc(irq);
> +            spin_lock_irqsave(&desc->lock, flags);
> +
> +            if ( desc->handler != &no_irq_type )
> +            {
> +                dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n",
> +                        d->domain_id, irq, pirq + nr, desc->handler->typename);
> +                ret = -EBUSY;
> +                break;
> +            }
>          }
>  
> -        if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV
> -             && !desc->arch.used_vectors )
> +        if ( ret )
>          {
> -            desc->arch.used_vectors = &pdev->arch.used_vectors;
> -            if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
> +            spin_unlock_irqrestore(&desc->lock, flags);
> +            while ( nr-- )
>              {
> -                int vector = desc->arch.vector;
> -                ASSERT(!test_bit(vector, desc->arch.used_vectors));
> -
> -                set_bit(vector, desc->arch.used_vectors);
> +                if ( irq >= 0 )
> +                {
> +                    if ( irq_deny_access(d, irq) )
> +                        printk(XENLOG_G_ERR
> +                               "dom%d: could not revoke access to IRQ%d (pirq %d)\n",
> +                               d->domain_id, irq, pirq);
> +                    destroy_irq(irq);
> +                }
> +                if ( info )
> +                    cleanup_domain_irq_pirq(d, irq, info);
> +                info = pirq_info(d, pirq + nr);
> +                irq = info->arch.irq;
>              }
> +            pci_disable_msi(msi_desc);
> +            goto done;
>          }
>  
>          set_domain_irq_pirq(d, irq, info);
> @@ -1996,7 +2071,8 @@ int unmap_domain_pirq(struct domain *d, 
>  {
>      unsigned long flags;
>      struct irq_desc *desc;
> -    int irq, ret = 0;
> +    int irq, ret = 0, rc;
> +    unsigned int i, nr = 1;
>      bool_t forced_unbind;
>      struct pirq *info;
>      struct msi_desc *msi_desc = NULL;
> @@ -2018,6 +2094,18 @@ int unmap_domain_pirq(struct domain *d, 
>  
>      desc = irq_to_desc(irq);
>      msi_desc = desc->msi_desc;
> +    if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
> +    {
> +        if ( msi_desc->msi_attrib.entry_nr )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: trying to unmap secondary MSI pirq %d\n",
> +                   d->domain_id, pirq);
> +            ret = -EBUSY;
> +            goto done;
> +        }
> +        nr = msi_desc->msi.nvec;
> +    }
>  
>      ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
>      if ( ret )
> @@ -2033,37 +2121,83 @@ int unmap_domain_pirq(struct domain *d, 
>  
>      spin_lock_irqsave(&desc->lock, flags);
>  
> -    BUG_ON(irq != domain_pirq_to_irq(d, pirq));
> -
> -    if ( !forced_unbind )
> -        clear_domain_irq_pirq(d, irq, info);
> -    else
> +    for ( i = 0; ; )
>      {
> -        info->arch.irq = -irq;
> -        radix_tree_replace_slot(
> -            radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
> -            radix_tree_int_to_ptr(-pirq));
> +        BUG_ON(irq != domain_pirq_to_irq(d, pirq + i));
> +
> +        if ( !forced_unbind )
> +            clear_domain_irq_pirq(d, irq, info);
> +        else
> +        {
> +            info->arch.irq = -irq;
> +            radix_tree_replace_slot(
> +                radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
> +                radix_tree_int_to_ptr(-pirq));
> +        }
> +
> +        if ( msi_desc )
> +        {
> +            desc->handler = &no_irq_type;
> +            desc->msi_desc = NULL;
> +        }
> +
> +        if ( ++i == nr )
> +            break;
> +
> +        spin_unlock_irqrestore(&desc->lock, flags);
> +
> +        if ( !forced_unbind )
> +           cleanup_domain_irq_pirq(d, irq, info);
> +
> +        rc = irq_deny_access(d, irq);
> +        if ( rc )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> +                   d->domain_id, irq, pirq + i);
> +            ret = rc;
> +        }
> +
> +        do {
> +            info = pirq_info(d, pirq + i);
> +            if ( info && (irq = info->arch.irq) > 0 )
> +                break;
> +            printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n",
> +                   d->domain_id, pirq + i);
> +        } while ( ++i < nr );
> +
> +        if ( i == nr )
> +        {
> +            desc = NULL;
> +            break;
> +        }
> +
> +        desc = irq_to_desc(irq);
> +        BUG_ON(desc->msi_desc != msi_desc + i);
> +
> +        spin_lock_irqsave(&desc->lock, flags);
>      }
>  
> -    if ( msi_desc )
> +    if ( desc )
>      {
> -        desc->handler = &no_irq_type;
> -        desc->msi_desc = NULL;
> +        spin_unlock_irqrestore(&desc->lock, flags);
> +
> +        if ( !forced_unbind )
> +            cleanup_domain_irq_pirq(d, irq, info);
> +
> +        rc = irq_deny_access(d, irq);
> +        if ( rc )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> +                   d->domain_id, irq, pirq + nr - 1);
> +            ret = rc;
> +        }
>      }
>  
> -    spin_unlock_irqrestore(&desc->lock, flags);
>      if (msi_desc)
>          msi_free_irq(msi_desc);
>  
> -    if ( !forced_unbind )
> -        cleanup_domain_irq_pirq(d, irq, info);
> -
> -    ret = irq_deny_access(d, irq);
> -    if ( ret )
> -        printk(XENLOG_G_ERR
> -               "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> -               d->domain_id, irq, pirq);
> -
>   done:
>      return ret;
>  }
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -236,6 +236,11 @@ static int write_msi_msg(struct msi_desc
>          u8 bus = dev->bus;
>          u8 slot = PCI_SLOT(dev->devfn);
>          u8 func = PCI_FUNC(dev->devfn);
> +        int nr = entry->msi_attrib.entry_nr;
> +
> +        ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
> +        if ( nr )
> +            return 0;
>  
>          pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos),
>                           msg->address_lo);
> @@ -359,8 +364,8 @@ static void msi_set_mask_bit(struct irq_
>              u8 func = PCI_FUNC(entry->dev->devfn);
>  
>              mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos);
> -            mask_bits &= ~(1);
> -            mask_bits |= flag;
> +            mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
> +            mask_bits |= (u32)flag << entry->msi_attrib.entry_nr;
>              pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits);
>          }
>          break;
> @@ -384,10 +389,11 @@ static int msi_get_mask_bit(const struct
>      case PCI_CAP_ID_MSI:
>          if (!entry->dev || !entry->msi_attrib.maskbit)
>              break;
> -        return pci_conf_read32(entry->dev->seg, entry->dev->bus,
> -                               PCI_SLOT(entry->dev->devfn),
> -                               PCI_FUNC(entry->dev->devfn),
> -                               entry->msi.mpos) & 1;
> +        return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
> +                                PCI_SLOT(entry->dev->devfn),
> +                                PCI_FUNC(entry->dev->devfn),
> +                                entry->msi.mpos) >>
> +                entry->msi_attrib.entry_nr) & 1;
>      case PCI_CAP_ID_MSIX:
>          return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1;
>      }
> @@ -453,17 +459,20 @@ static hw_irq_controller pci_msi_nonmask
>      .set_affinity = set_msi_affinity
>  };
>  
> -static struct msi_desc* alloc_msi_entry(void)
> +static struct msi_desc *alloc_msi_entry(unsigned int nr)
>  {
>      struct msi_desc *entry;
>  
> -    entry = xmalloc(struct msi_desc);
> +    entry = xmalloc_array(struct msi_desc, nr);
>      if ( !entry )
>          return NULL;
>  
>      INIT_LIST_HEAD(&entry->list);
> -    entry->dev = NULL;
> -    entry->remap_index = -1;
> +    while ( nr-- )
> +    {
> +        entry[nr].dev = NULL;
> +        entry[nr].remap_index = -1;
> +    }
>  
>      return entry;
>  }
> @@ -488,17 +497,24 @@ int __setup_msi_irq(struct irq_desc *des
>  
>  int msi_free_irq(struct msi_desc *entry)
>  {
> -    destroy_irq(entry->irq);
> +    unsigned int nr = entry->msi.nvec;
> +
>      if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>      {
>          unsigned long start;
>          start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
>          msix_put_fixmap(entry->dev, virt_to_fix(start));
> +        nr = 1;
>      }
>  
> -    /* Free the unused IRTE if intr remap enabled */
> -    if ( iommu_intremap )
> -        iommu_update_ire_from_msi(entry, NULL);
> +    while ( nr-- )
> +    {
> +        destroy_irq(entry[nr].irq);
> +
> +        /* Free the unused IRTE if intr remap enabled */
> +        if ( iommu_intremap )
> +            iommu_update_ire_from_msi(entry + nr, NULL);
> +    }
>  
>      list_del(&entry->list);
>      xfree(entry);
> @@ -531,11 +547,12 @@ static struct msi_desc *find_msi_entry(s
>   **/
>  static int msi_capability_init(struct pci_dev *dev,
>                                 int irq,
> -                               struct msi_desc **desc)
> +                               struct msi_desc **desc,
> +                               unsigned int nvec)
>  {
>      struct msi_desc *entry;
>      int pos;
> -    unsigned int maxvec, mpos;
> +    unsigned int i, maxvec, mpos;
>      u16 control, seg = dev->seg;
>      u8 bus = dev->bus;
>      u8 slot = PCI_SLOT(dev->devfn);
> @@ -545,27 +562,34 @@ static int msi_capability_init(struct pc
>      pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
>      control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
>      maxvec = multi_msi_capable(control);
> +    if ( nvec > maxvec )
> +        return maxvec;
>      control &= ~PCI_MSI_FLAGS_QSIZE;
> +    multi_msi_enable(control, nvec);
>  
>      /* MSI Entry Initialization */
>      msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
>  
> -    entry = alloc_msi_entry();
> +    entry = alloc_msi_entry(nvec);
>      if ( !entry )
>          return -ENOMEM;
>  
> -    entry->msi_attrib.type = PCI_CAP_ID_MSI;
> -    entry->msi_attrib.is_64 = is_64bit_address(control);
> -    entry->msi_attrib.entry_nr = 0;
> -    entry->msi_attrib.maskbit = is_mask_bit_support(control);
> -    entry->msi_attrib.masked = 1;
> -    entry->msi_attrib.pos = pos;
>      mpos = msi_mask_bits_reg(pos, is_64bit_address(control));
> -    entry->msi.nvec = 1;
> +    for ( i = 0; i < nvec; ++i )
> +    {
> +        entry[i].msi_attrib.type = PCI_CAP_ID_MSI;
> +        entry[i].msi_attrib.is_64 = is_64bit_address(control);
> +        entry[i].msi_attrib.entry_nr = i;
> +        entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
> +        entry[i].msi_attrib.masked = 1;
> +        entry[i].msi_attrib.pos = pos;
> +        if ( entry[i].msi_attrib.maskbit )
> +            entry[i].msi.mpos = mpos;
> +        entry[i].msi.nvec = 0;
> +        entry[i].dev = dev;
> +    }
> +    entry->msi.nvec = nvec;
>      entry->irq = irq;
> -    if ( is_mask_bit_support(control) )
> -        entry->msi.mpos = mpos;
> -    entry->dev = dev;
>      if ( entry->msi_attrib.maskbit )
>      {
>          u32 maskbits;
> @@ -693,7 +717,7 @@ static int msix_capability_init(struct p
>  
>      if ( desc )
>      {
> -        entry = alloc_msi_entry();
> +        entry = alloc_msi_entry(1);
>          if ( !entry )
>              return -ENOMEM;
>          ASSERT(msi);
> @@ -851,7 +875,6 @@ static int msix_capability_init(struct p
>  
>  static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
>  {
> -    int status;
>      struct pci_dev *pdev;
>      struct msi_desc *old_desc;
>  
> @@ -880,8 +903,7 @@ static int __pci_enable_msi(struct msi_i
>          pci_disable_msi(old_desc);
>      }
>  
> -    status = msi_capability_init(pdev, msi->irq, desc);
> -    return status;
> +    return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
>  }
>  
>  static void __pci_disable_msi(struct msi_desc *entry)
> @@ -1101,6 +1123,8 @@ int pci_restore_msi_state(struct pci_dev
>  
>      list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
>      {
> +        unsigned int i = 0, nr = 1;
> +
>          irq = entry->irq;
>          desc = &irq_desc[irq];
>  
> @@ -1110,30 +1134,58 @@ int pci_restore_msi_state(struct pci_dev
>  
>          if (desc->msi_desc != entry)
>          {
> +    bogus:
>              dprintk(XENLOG_ERR,
> -                    "Restore MSI for dev %04x:%02x:%02x:%x not set before?\n",
> +                    "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n",
>                      pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> -                    PCI_FUNC(pdev->devfn));
> +                    PCI_FUNC(pdev->devfn), i);
>              spin_unlock_irqrestore(&desc->lock, flags);
>              return -EINVAL;
>          }
>  
>          if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
> +        {
>              msi_set_enable(pdev, 0);
> +            nr = entry->msi.nvec;
> +        }
>          else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>              msix_set_enable(pdev, 0);
>  
>          msg = entry->msg;
>          write_msi_msg(entry, &msg);
>  
> -        msi_set_mask_bit(desc, entry->msi_attrib.masked);
> +        for ( i = 0; ; )
> +        {
> +            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
> +            spin_unlock_irqrestore(&desc->lock, flags);
> +
> +            if ( !--nr )
> +                break;
> +
> +            desc = &irq_desc[entry[++i].irq];
> +            spin_lock_irqsave(&desc->lock, flags);
> +            if ( desc->msi_desc != entry + i )
> +                goto bogus;
> +        }
> +
> +        spin_unlock_irqrestore(&desc->lock, flags);
>  
>          if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
> +        {
> +            unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
> +            u16 control = pci_conf_read16(pdev->seg, pdev->bus,
> +                                          PCI_SLOT(pdev->devfn),
> +                                          PCI_FUNC(pdev->devfn), cpos);
> +
> +            control &= ~PCI_MSI_FLAGS_QSIZE;
> +            multi_msi_enable(control, entry->msi.nvec);
> +            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> +                             PCI_FUNC(pdev->devfn), cpos, control);
> +
>              msi_set_enable(pdev, 1);
> +        }
>          else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>              msix_set_enable(pdev, 1);
> -
> -        spin_unlock_irqrestore(&desc->lock, flags);
>      }
>  
>      return 0;
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int 
>          break;
>  
>      case MAP_PIRQ_TYPE_MSI:
> +        if ( !msi->table_base )
> +            msi->entry_nr = 1;
>          irq = *index;
>          if ( irq == -1 )
> +    case MAP_PIRQ_TYPE_MULTI_MSI:
>              irq = create_irq(NUMA_NO_NODE);
>  
>          if ( irq < nr_irqs_gsi || irq >= nr_irqs )
> @@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int 
>                  goto done;
>              }
>          }
> +        else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
> +        {
> +            if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
> +                ret = -EDOM;
> +            else if ( msi->entry_nr != 1 && !iommu_intremap )
> +                ret = -EOPNOTSUPP;
> +            else
> +            {
> +                while ( msi->entry_nr & (msi->entry_nr - 1) )
> +                    msi->entry_nr += msi->entry_nr & -msi->entry_nr;
> +                pirq = get_free_pirqs(d, msi->entry_nr);
> +                if ( pirq < 0 )
> +                {
> +                    while ( (msi->entry_nr >>= 1) > 1 )
> +                        if ( get_free_pirqs(d, msi->entry_nr) > 0 )
> +                            break;
> +                    dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n",
> +                            d->domain_id, msi->entry_nr << 1);
> +                    ret = pirq;
> +                }
> +            }
> +            if ( ret < 0 )
> +                goto done;
> +        }
>          else
>          {
>              pirq = get_free_pirq(d, type);
> @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
>   done:
>      spin_unlock(&d->event_lock);
>      spin_unlock(&pcidevs_lock);
> -    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
> -        destroy_irq(irq);
> +    if ( ret != 0 )
> +        switch ( type )
> +        {
> +        case MAP_PIRQ_TYPE_MSI:
> +            if ( *index == -1 )
> +        case MAP_PIRQ_TYPE_MULTI_MSI:
> +                destroy_irq(irq);
> +            break;
> +        }

Do we not need to create and destroy entry_nr irqs in this function, or
is a multi-vector-msi now considered as just a single irq?

I ask because this appears to lack the "repeating certain operations for
all involved IRQs" described in the comment.

~Andrew

>   free_domain:
>      rcu_unlock_domain(d);
>      return ret;
> @@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          if ( copy_from_guest(&map, arg, 1) != 0 )
>              break;
>  
> -        if ( map.type == MAP_PIRQ_TYPE_MSI_SEG )
> +        switch ( map.type )
>          {
> +        case MAP_PIRQ_TYPE_MSI_SEG:
>              map.type = MAP_PIRQ_TYPE_MSI;
>              msi.seg = map.bus >> 16;
> -        }
> -        else
> -        {
> +            break;
> +
> +        case MAP_PIRQ_TYPE_MULTI_MSI:
> +            if ( map.table_base )
> +                return -EINVAL;
> +            msi.seg = map.bus >> 16;
> +            break;
> +
> +        default:
>              msi.seg = 0;
> +            break;
>          }
>          msi.bus = map.bus;
>          msi.devfn = map.devfn;
> @@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq,
>                                 &msi);
>  
> +        if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI )
> +            map.entry_nr = msi.entry_nr;
>          if ( __copy_to_guest(arg, &map, 1) )
>              ret = -EFAULT;
>          break;
> --- a/xen/include/asm-x86/irq.h
> +++ b/xen/include/asm-x86/irq.h
> @@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in
>                             void *data);
>  int unmap_domain_pirq(struct domain *d, int pirq);
>  int get_free_pirq(struct domain *d, int type);
> +int get_free_pirqs(struct domain *, unsigned int nr);
>  void free_domain_pirqs(struct domain *d);
>  int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq);
>  int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
> --- a/xen/include/asm-x86/msi.h
> +++ b/xen/include/asm-x86/msi.h
> @@ -148,7 +148,7 @@ int msi_free_irq(struct msi_desc *entry)
>  #define multi_msi_capable(control) \
>  	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
>  #define multi_msi_enable(control, num) \
> -	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
> +	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
>  #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
>  #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
>  #define msi_enable(control, num) multi_msi_enable(control, num); \
> --- a/xen/include/public/physdev.h
> +++ b/xen/include/public/physdev.h
> @@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
>  #define MAP_PIRQ_TYPE_GSI               0x1
>  #define MAP_PIRQ_TYPE_UNKNOWN           0x2
>  #define MAP_PIRQ_TYPE_MSI_SEG           0x3
> +#define MAP_PIRQ_TYPE_MULTI_MSI         0x4
>  
>  #define PHYSDEVOP_map_pirq               13
>  struct physdev_map_pirq {
>      domid_t domid;
>      /* IN */
>      int type;
> -    /* IN */
> +    /* IN (ignored for ..._MULTI_MSI) */
>      int index;
>      /* IN or OUT */
>      int pirq;
> -    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
> +    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
>      int bus;
>      /* IN */
>      int devfn;
> -    /* IN */
> +    /* IN (also OUT for ..._MULTI_MSI) */
>      int entry_nr;
>      /* IN */
>      uint64_t table_base;
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


[-- Attachment #1.2: Type: text/html, Size: 25128 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 11:36   ` Andrew Cooper
@ 2013-07-16 11:48     ` Jan Beulich
  2013-07-17 10:02       ` Andrew Cooper
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Beulich @ 2013-07-16 11:48 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Keir Fraser, xiantao.zhang, xen-devel

>>> On 16.07.13 at 13:36, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 16/07/13 11:14, Jan Beulich wrote:
>> +int get_free_pirqs(struct domain *d, unsigned int nr)
>> +{
>> +    unsigned int i, found = 0;
>> +
>> +    ASSERT(spin_is_locked(&d->event_lock));
>> +
>> +    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
>> +        if ( is_free_pirq(d, pirq_info(d, i)) )
>> +        {
>> +            pirq_get_info(d, i);
>> +            if ( ++found == nr )
>> +                return i;
>> +        }
>> +        else
>> +            found = 0;
>> +
>> +    return -ENOSPC;
>> +}
>> +
> 
> Is there any reason why this loop is backwards? Unless I am mistaken,
> guests can choose their own pirqs when binding them, reducing the
> likelihood that the top of the available space will be free.

This just follows the behavior of get_free_pirq(). I'm not up to
having the two behave differently.

>> @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
>>   done:
>>      spin_unlock(&d->event_lock);
>>      spin_unlock(&pcidevs_lock);
>> -    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
>> -        destroy_irq(irq);
>> +    if ( ret != 0 )
>> +        switch ( type )
>> +        {
>> +        case MAP_PIRQ_TYPE_MSI:
>> +            if ( *index == -1 )
>> +        case MAP_PIRQ_TYPE_MULTI_MSI:
>> +                destroy_irq(irq);
>> +            break;
>> +        }
> 
> Do we not need to create and destroy entry_nr irqs in this function, or
> is a multi-vector-msi now considered as just a single irq?
> 
> I ask because this appears to lack the "repeating certain operations for
> all involved IRQs" described in the comment.

No, there's a single create_irq() in the function, and having a single
destroy_irq() here matches this. The remaining ones (both!) are in
map_domain_pirq().

Also, as a general remark, asking for changes in a series that was
posted 2.5 months ago (and deferred just because of the 4.3
release process) seems a little strange to me. I had to repost
merely to collect ack-s, and didn't really expect further requests
for adjustments as there was ample time before to do so.

Jan

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 11:35     ` Jan Beulich
@ 2013-07-17  9:02       ` Andrew Cooper
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Cooper @ 2013-07-17  9:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel

On 16/07/13 12:35, Jan Beulich wrote:
>>>> On 16.07.13 at 13:19, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 16/07/13 11:15, Jan Beulich wrote:
>>> The requested vector count is to be passed in struct xen_pci_op's info
>>> field. Upon failure, if a smaller vector count might work, the backend
>>> will pass that smaller count in the value field (which so far is always
>>> being set to zero in the error path).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> --- a/xen/include/public/io/pciif.h
>>> +++ b/xen/include/public/io/pciif.h
>>> @@ -46,6 +46,7 @@
>>>  #define XEN_PCI_OP_aer_resume		(7)
>>>  #define XEN_PCI_OP_aer_mmio		(8)
>>>  #define XEN_PCI_OP_aer_slotreset	(9)
>>> +#define XEN_PCI_OP_enable_multi_msi	(10)
>> /* Be sure to bump this number if you change this file */
>> #define XEN_PCI_MAGIC "7"
>>
>> Should you bump this version, or is the comment stale?  The only in-tree
>> consumer I can find is MiniOS's pcifront, which writes it into xenstore.
> Whether it's stale I don't know (likely it is considering that you
> found just a single consumer), but bumping a revision just
> because of the (backwards compatible) addition seems
> superfluous to me. This would be different if I changed the
> existing enable_msi...
>
> Jan
>

I suspected that was the answer, but just wanted to check that it hadn't
been overlooked,

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 1/3] VT-d: enable for multi-vector MSI
  2013-07-16 11:32     ` Jan Beulich
@ 2013-07-17  9:50       ` Andrew Cooper
  2013-07-18 10:48         ` Zhang, Xiantao
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Cooper @ 2013-07-17  9:50 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel

On 16/07/13 12:32, Jan Beulich wrote:
>>>> On 16.07.13 at 13:15, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 16/07/13 11:13, Jan Beulich wrote:
>>> The main change being to make alloc_remap_entry() capable of allocating
>>> a block of entries.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>
>>> --- a/xen/drivers/passthrough/vtd/intremap.c
>>> +++ b/xen/drivers/passthrough/vtd/intremap.c
>>> @@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
>>>  }
>>>  
>>>  /*
>>> - * Look for a free intr remap entry.
>>> + * Look for a free intr remap entry (or a contiguous set thereof).
>>>   * Need hold iremap_lock, and setup returned entry before releasing lock.
>>>   */
>>> -static int alloc_remap_entry(struct iommu *iommu)
>>> +static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr)
>> alloc_remap_entries() now that it unconditionally takes a count (and you
>> already have to patch all callsites)
> Actually I checked with Linux, and they use the singular in the function
> name too (albeit the name isn't identical).
>
>>> @@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
>>>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
>>>      struct iremap_entry new_ire;
>>>      struct msi_msg_remap_entry *remap_rte;
>>> -    int index;
>>> +    unsigned int index, i, nr = 1;
>> Does this hardcoding of nr=1 defeat the purpose of the following logic?
> In what way?
>
>>>      unsigned long flags;
>>>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
>>>  
>>> -    remap_rte = (struct msi_msg_remap_entry *) msg;
>>> +    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
>>> +        nr = msi_desc->msi.nvec;
> The logic here makes the vector count 1 for MSI-X and msi.nvec
> for MSI.
>
> Jan
>

Ah yes - I see now.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 11:48     ` Jan Beulich
@ 2013-07-17 10:02       ` Andrew Cooper
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Cooper @ 2013-07-17 10:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir Fraser, xiantao.zhang, xen-devel

On 16/07/13 12:48, Jan Beulich wrote:
>>>> On 16.07.13 at 13:36, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 16/07/13 11:14, Jan Beulich wrote:
>>> +int get_free_pirqs(struct domain *d, unsigned int nr)
>>> +{
>>> +    unsigned int i, found = 0;
>>> +
>>> +    ASSERT(spin_is_locked(&d->event_lock));
>>> +
>>> +    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
>>> +        if ( is_free_pirq(d, pirq_info(d, i)) )
>>> +        {
>>> +            pirq_get_info(d, i);
>>> +            if ( ++found == nr )
>>> +                return i;
>>> +        }
>>> +        else
>>> +            found = 0;
>>> +
>>> +    return -ENOSPC;
>>> +}
>>> +
>> Is there any reason why this loop is backwards? Unless I am mistaken,
>> guests can choose their own pirqs when binding them, reducing the
>> likelihood that the top of the available space will be free.
> This just follows the behavior of get_free_pirq(). I'm not up to
> having the two behave differently.
>
>>> @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
>>>   done:
>>>      spin_unlock(&d->event_lock);
>>>      spin_unlock(&pcidevs_lock);
>>> -    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
>>> -        destroy_irq(irq);
>>> +    if ( ret != 0 )
>>> +        switch ( type )
>>> +        {
>>> +        case MAP_PIRQ_TYPE_MSI:
>>> +            if ( *index == -1 )
>>> +        case MAP_PIRQ_TYPE_MULTI_MSI:
>>> +                destroy_irq(irq);
>>> +            break;
>>> +        }
>> Do we not need to create and destroy entry_nr irqs in this function, or
>> is a multi-vector-msi now considered as just a single irq?
>>
>> I ask because this appears to lack the "repeating certain operations for
>> all involved IRQs" described in the comment.
> No, there's a single create_irq() in the function, and having a single
> destroy_irq() here matches this. The remaining ones (both!) are in
> map_domain_pirq().

Ok.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> Also, as a general remark, asking for changes in a series that was
> posted 2.5 months ago (and deferred just because of the 4.3
> release process) seems a little strange to me. I had to repost
> merely to collect ack-s, and didn't really expect further requests
> for adjustments as there was ample time before to do so.
>
> Jan
>

2.5 months ago, I was very busy with XSAs, hotfixes and a release of
XenServer, so I apologise for not reviewing back then.

~Andrew

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 1/3] VT-d: enable for multi-vector MSI
  2013-07-17  9:50       ` Andrew Cooper
@ 2013-07-18 10:48         ` Zhang, Xiantao
  0 siblings, 0 replies; 18+ messages in thread
From: Zhang, Xiantao @ 2013-07-18 10:48 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich; +Cc: Keir Fraser, Zhang, Xiantao, xen-devel

Thanks, Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

> -----Original Message-----
> From: Andrew Cooper [mailto:andrew.cooper3@citrix.com]
> Sent: Wednesday, July 17, 2013 5:50 PM
> To: Jan Beulich
> Cc: Zhang, Xiantao; xen-devel; Keir Fraser
> Subject: Re: [Xen-devel] [PATCH resend 1/3] VT-d: enable for multi-vector
> MSI
> 
> On 16/07/13 12:32, Jan Beulich wrote:
> >>>> On 16.07.13 at 13:15, Andrew Cooper <andrew.cooper3@citrix.com>
> wrote:
> >> On 16/07/13 11:13, Jan Beulich wrote:
> >>> The main change being to make alloc_remap_entry() capable of
> allocating
> >>> a block of entries.
> >>>
> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >>>
> >>> --- a/xen/drivers/passthrough/vtd/intremap.c
> >>> +++ b/xen/drivers/passthrough/vtd/intremap.c
> >>> @@ -194,18 +194,18 @@ static void free_remap_entry(struct iomm
> >>>  }
> >>>
> >>>  /*
> >>> - * Look for a free intr remap entry.
> >>> + * Look for a free intr remap entry (or a contiguous set thereof).
> >>>   * Need hold iremap_lock, and setup returned entry before releasing
> lock.
> >>>   */
> >>> -static int alloc_remap_entry(struct iommu *iommu)
> >>> +static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned
> int nr)
> >> alloc_remap_entries() now that it unconditionally takes a count (and you
> >> already have to patch all callsites)
> > Actually I checked with Linux, and they use the singular in the function
> > name too (albeit the name isn't identical).
> >
> >>> @@ -555,31 +556,29 @@ static int msi_msg_to_remap_entry(
> >>>      struct iremap_entry *iremap_entry = NULL, *iremap_entries;
> >>>      struct iremap_entry new_ire;
> >>>      struct msi_msg_remap_entry *remap_rte;
> >>> -    int index;
> >>> +    unsigned int index, i, nr = 1;
> >> Does this hardcoding of nr=1 defeat the purpose of the following logic?
> > In what way?
> >
> >>>      unsigned long flags;
> >>>      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
> >>>
> >>> -    remap_rte = (struct msi_msg_remap_entry *) msg;
> >>> +    if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
> >>> +        nr = msi_desc->msi.nvec;
> > The logic here makes the vector count 1 for MSI-X and msi.nvec
> > for MSI.
> >
> > Jan
> >
> 
> Ah yes - I see now.
> 
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Ping: [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
  2013-07-16 11:36   ` Andrew Cooper
@ 2013-08-05 13:11   ` Jan Beulich
  2013-08-08  9:02   ` Keir Fraser
  2 siblings, 0 replies; 18+ messages in thread
From: Jan Beulich @ 2013-08-05 13:11 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, xiantao.zhang

Ping (public interface change)?

>>> On 16.07.13 at 12:14, "Jan Beulich" <JBeulich@suse.com> wrote:
> This implies
> - extending the public interface to have a way to request a block of
>   MSIs
> - allocating a block of contiguous pIRQ-s for the target domain (but
>   note that the Xen IRQs allocated have no need of being contiguous)
> - repeating certain operations for all involved IRQs
> - fixing multi_msi_enable()
> - adjusting the mask bit accesses for maskable MSIs
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -1863,6 +1863,25 @@ int get_free_pirq(struct domain *d, int 
>      return -ENOSPC;
>  }
>  
> +int get_free_pirqs(struct domain *d, unsigned int nr)
> +{
> +    unsigned int i, found = 0;
> +
> +    ASSERT(spin_is_locked(&d->event_lock));
> +
> +    for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
> +        if ( is_free_pirq(d, pirq_info(d, i)) )
> +        {
> +            pirq_get_info(d, i);
> +            if ( ++found == nr )
> +                return i;
> +        }
> +        else
> +            found = 0;
> +
> +    return -ENOSPC;
> +}
> +
>  int map_domain_pirq(
>      struct domain *d, int pirq, int irq, int type, void *data)
>  {
> @@ -1918,11 +1937,12 @@ int map_domain_pirq(
>  
>      desc = irq_to_desc(irq);
>  
> -    if ( type == MAP_PIRQ_TYPE_MSI )
> +    if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI )
>      {
>          struct msi_info *msi = (struct msi_info *)data;
>          struct msi_desc *msi_desc;
>          struct pci_dev *pdev;
> +        unsigned int nr = 0;
>  
>          ASSERT(spin_is_locked(&pcidevs_lock));
>  
> @@ -1933,7 +1953,14 @@ int map_domain_pirq(
>          pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn);
>          ret = pci_enable_msi(msi, &msi_desc);
>          if ( ret )
> +        {
> +            if ( ret > 0 )
> +            {
> +                msi->entry_nr = ret;
> +                ret = -ENFILE;
> +            }
>              goto done;
> +        }
>  
>          spin_lock_irqsave(&desc->lock, flags);
>  
> @@ -1947,25 +1974,73 @@ int map_domain_pirq(
>              goto done;
>          }
>  
> -        ret = setup_msi_irq(desc, msi_desc);
> -        if ( ret )
> +        while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) )
>          {
> +            if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV &&
> +                 !desc->arch.used_vectors )
> +            {
> +                desc->arch.used_vectors = &pdev->arch.used_vectors;
> +                if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
> +                {
> +                    int vector = desc->arch.vector;
> +
> +                    ASSERT(!test_bit(vector, desc->arch.used_vectors));
> +                    set_bit(vector, desc->arch.used_vectors);
> +                }
> +            }
> +            if ( type == MAP_PIRQ_TYPE_MSI ||
> +                 msi_desc->msi_attrib.type != PCI_CAP_ID_MSI ||
> +                 ++nr == msi->entry_nr )
> +                break;
> +
> +            set_domain_irq_pirq(d, irq, info);
>              spin_unlock_irqrestore(&desc->lock, flags);
> -            pci_disable_msi(msi_desc);
> -            goto done;
> +
> +            info = NULL;
> +            irq = create_irq(NUMA_NO_NODE);
> +            ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, 
> &info)
> +                           : irq;
> +            if ( ret )
> +                break;
> +            msi_desc[nr].irq = irq;
> +
> +            if ( irq_permit_access(d, irq) != 0 )
> +                printk(XENLOG_G_WARNING
> +                       "dom%d: could not permit access to IRQ%d (pirq 
> %d)\n",
> +                       d->domain_id, irq, pirq);
> +
> +            desc = irq_to_desc(irq);
> +            spin_lock_irqsave(&desc->lock, flags);
> +
> +            if ( desc->handler != &no_irq_type )
> +            {
> +                dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use 
> (%s)\n",
> +                        d->domain_id, irq, pirq + nr, desc->handler->typename);
> +                ret = -EBUSY;
> +                break;
> +            }
>          }
>  
> -        if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV
> -             && !desc->arch.used_vectors )
> +        if ( ret )
>          {
> -            desc->arch.used_vectors = &pdev->arch.used_vectors;
> -            if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
> +            spin_unlock_irqrestore(&desc->lock, flags);
> +            while ( nr-- )
>              {
> -                int vector = desc->arch.vector;
> -                ASSERT(!test_bit(vector, desc->arch.used_vectors));
> -
> -                set_bit(vector, desc->arch.used_vectors);
> +                if ( irq >= 0 )
> +                {
> +                    if ( irq_deny_access(d, irq) )
> +                        printk(XENLOG_G_ERR
> +                               "dom%d: could not revoke access to IRQ%d 
> (pirq %d)\n",
> +                               d->domain_id, irq, pirq);
> +                    destroy_irq(irq);
> +                }
> +                if ( info )
> +                    cleanup_domain_irq_pirq(d, irq, info);
> +                info = pirq_info(d, pirq + nr);
> +                irq = info->arch.irq;
>              }
> +            pci_disable_msi(msi_desc);
> +            goto done;
>          }
>  
>          set_domain_irq_pirq(d, irq, info);
> @@ -1996,7 +2071,8 @@ int unmap_domain_pirq(struct domain *d, 
>  {
>      unsigned long flags;
>      struct irq_desc *desc;
> -    int irq, ret = 0;
> +    int irq, ret = 0, rc;
> +    unsigned int i, nr = 1;
>      bool_t forced_unbind;
>      struct pirq *info;
>      struct msi_desc *msi_desc = NULL;
> @@ -2018,6 +2094,18 @@ int unmap_domain_pirq(struct domain *d, 
>  
>      desc = irq_to_desc(irq);
>      msi_desc = desc->msi_desc;
> +    if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
> +    {
> +        if ( msi_desc->msi_attrib.entry_nr )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: trying to unmap secondary MSI pirq %d\n",
> +                   d->domain_id, pirq);
> +            ret = -EBUSY;
> +            goto done;
> +        }
> +        nr = msi_desc->msi.nvec;
> +    }
>  
>      ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc);
>      if ( ret )
> @@ -2033,37 +2121,83 @@ int unmap_domain_pirq(struct domain *d, 
>  
>      spin_lock_irqsave(&desc->lock, flags);
>  
> -    BUG_ON(irq != domain_pirq_to_irq(d, pirq));
> -
> -    if ( !forced_unbind )
> -        clear_domain_irq_pirq(d, irq, info);
> -    else
> +    for ( i = 0; ; )
>      {
> -        info->arch.irq = -irq;
> -        radix_tree_replace_slot(
> -            radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
> -            radix_tree_int_to_ptr(-pirq));
> +        BUG_ON(irq != domain_pirq_to_irq(d, pirq + i));
> +
> +        if ( !forced_unbind )
> +            clear_domain_irq_pirq(d, irq, info);
> +        else
> +        {
> +            info->arch.irq = -irq;
> +            radix_tree_replace_slot(
> +                radix_tree_lookup_slot(&d->arch.irq_pirq, irq),
> +                radix_tree_int_to_ptr(-pirq));
> +        }
> +
> +        if ( msi_desc )
> +        {
> +            desc->handler = &no_irq_type;
> +            desc->msi_desc = NULL;
> +        }
> +
> +        if ( ++i == nr )
> +            break;
> +
> +        spin_unlock_irqrestore(&desc->lock, flags);
> +
> +        if ( !forced_unbind )
> +           cleanup_domain_irq_pirq(d, irq, info);
> +
> +        rc = irq_deny_access(d, irq);
> +        if ( rc )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> +                   d->domain_id, irq, pirq + i);
> +            ret = rc;
> +        }
> +
> +        do {
> +            info = pirq_info(d, pirq + i);
> +            if ( info && (irq = info->arch.irq) > 0 )
> +                break;
> +            printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n",
> +                   d->domain_id, pirq + i);
> +        } while ( ++i < nr );
> +
> +        if ( i == nr )
> +        {
> +            desc = NULL;
> +            break;
> +        }
> +
> +        desc = irq_to_desc(irq);
> +        BUG_ON(desc->msi_desc != msi_desc + i);
> +
> +        spin_lock_irqsave(&desc->lock, flags);
>      }
>  
> -    if ( msi_desc )
> +    if ( desc )
>      {
> -        desc->handler = &no_irq_type;
> -        desc->msi_desc = NULL;
> +        spin_unlock_irqrestore(&desc->lock, flags);
> +
> +        if ( !forced_unbind )
> +            cleanup_domain_irq_pirq(d, irq, info);
> +
> +        rc = irq_deny_access(d, irq);
> +        if ( rc )
> +        {
> +            printk(XENLOG_G_ERR
> +                   "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> +                   d->domain_id, irq, pirq + nr - 1);
> +            ret = rc;
> +        }
>      }
>  
> -    spin_unlock_irqrestore(&desc->lock, flags);
>      if (msi_desc)
>          msi_free_irq(msi_desc);
>  
> -    if ( !forced_unbind )
> -        cleanup_domain_irq_pirq(d, irq, info);
> -
> -    ret = irq_deny_access(d, irq);
> -    if ( ret )
> -        printk(XENLOG_G_ERR
> -               "dom%d: could not deny access to IRQ%d (pirq %d)\n",
> -               d->domain_id, irq, pirq);
> -
>   done:
>      return ret;
>  }
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -236,6 +236,11 @@ static int write_msi_msg(struct msi_desc
>          u8 bus = dev->bus;
>          u8 slot = PCI_SLOT(dev->devfn);
>          u8 func = PCI_FUNC(dev->devfn);
> +        int nr = entry->msi_attrib.entry_nr;
> +
> +        ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr);
> +        if ( nr )
> +            return 0;
>  
>          pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos),
>                           msg->address_lo);
> @@ -359,8 +364,8 @@ static void msi_set_mask_bit(struct irq_
>              u8 func = PCI_FUNC(entry->dev->devfn);
>  
>              mask_bits = pci_conf_read32(seg, bus, slot, func, 
> entry->msi.mpos);
> -            mask_bits &= ~(1);
> -            mask_bits |= flag;
> +            mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr);
> +            mask_bits |= (u32)flag << entry->msi_attrib.entry_nr;
>              pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, 
> mask_bits);
>          }
>          break;
> @@ -384,10 +389,11 @@ static int msi_get_mask_bit(const struct
>      case PCI_CAP_ID_MSI:
>          if (!entry->dev || !entry->msi_attrib.maskbit)
>              break;
> -        return pci_conf_read32(entry->dev->seg, entry->dev->bus,
> -                               PCI_SLOT(entry->dev->devfn),
> -                               PCI_FUNC(entry->dev->devfn),
> -                               entry->msi.mpos) & 1;
> +        return (pci_conf_read32(entry->dev->seg, entry->dev->bus,
> +                                PCI_SLOT(entry->dev->devfn),
> +                                PCI_FUNC(entry->dev->devfn),
> +                                entry->msi.mpos) >>
> +                entry->msi_attrib.entry_nr) & 1;
>      case PCI_CAP_ID_MSIX:
>          return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 
> 1;
>      }
> @@ -453,17 +459,20 @@ static hw_irq_controller pci_msi_nonmask
>      .set_affinity = set_msi_affinity
>  };
>  
> -static struct msi_desc* alloc_msi_entry(void)
> +static struct msi_desc *alloc_msi_entry(unsigned int nr)
>  {
>      struct msi_desc *entry;
>  
> -    entry = xmalloc(struct msi_desc);
> +    entry = xmalloc_array(struct msi_desc, nr);
>      if ( !entry )
>          return NULL;
>  
>      INIT_LIST_HEAD(&entry->list);
> -    entry->dev = NULL;
> -    entry->remap_index = -1;
> +    while ( nr-- )
> +    {
> +        entry[nr].dev = NULL;
> +        entry[nr].remap_index = -1;
> +    }
>  
>      return entry;
>  }
> @@ -488,17 +497,24 @@ int __setup_msi_irq(struct irq_desc *des
>  
>  int msi_free_irq(struct msi_desc *entry)
>  {
> -    destroy_irq(entry->irq);
> +    unsigned int nr = entry->msi.nvec;
> +
>      if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>      {
>          unsigned long start;
>          start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
>          msix_put_fixmap(entry->dev, virt_to_fix(start));
> +        nr = 1;
>      }
>  
> -    /* Free the unused IRTE if intr remap enabled */
> -    if ( iommu_intremap )
> -        iommu_update_ire_from_msi(entry, NULL);
> +    while ( nr-- )
> +    {
> +        destroy_irq(entry[nr].irq);
> +
> +        /* Free the unused IRTE if intr remap enabled */
> +        if ( iommu_intremap )
> +            iommu_update_ire_from_msi(entry + nr, NULL);
> +    }
>  
>      list_del(&entry->list);
>      xfree(entry);
> @@ -531,11 +547,12 @@ static struct msi_desc *find_msi_entry(s
>   **/
>  static int msi_capability_init(struct pci_dev *dev,
>                                 int irq,
> -                               struct msi_desc **desc)
> +                               struct msi_desc **desc,
> +                               unsigned int nvec)
>  {
>      struct msi_desc *entry;
>      int pos;
> -    unsigned int maxvec, mpos;
> +    unsigned int i, maxvec, mpos;
>      u16 control, seg = dev->seg;
>      u8 bus = dev->bus;
>      u8 slot = PCI_SLOT(dev->devfn);
> @@ -545,27 +562,34 @@ static int msi_capability_init(struct pc
>      pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI);
>      control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos));
>      maxvec = multi_msi_capable(control);
> +    if ( nvec > maxvec )
> +        return maxvec;
>      control &= ~PCI_MSI_FLAGS_QSIZE;
> +    multi_msi_enable(control, nvec);
>  
>      /* MSI Entry Initialization */
>      msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */
>  
> -    entry = alloc_msi_entry();
> +    entry = alloc_msi_entry(nvec);
>      if ( !entry )
>          return -ENOMEM;
>  
> -    entry->msi_attrib.type = PCI_CAP_ID_MSI;
> -    entry->msi_attrib.is_64 = is_64bit_address(control);
> -    entry->msi_attrib.entry_nr = 0;
> -    entry->msi_attrib.maskbit = is_mask_bit_support(control);
> -    entry->msi_attrib.masked = 1;
> -    entry->msi_attrib.pos = pos;
>      mpos = msi_mask_bits_reg(pos, is_64bit_address(control));
> -    entry->msi.nvec = 1;
> +    for ( i = 0; i < nvec; ++i )
> +    {
> +        entry[i].msi_attrib.type = PCI_CAP_ID_MSI;
> +        entry[i].msi_attrib.is_64 = is_64bit_address(control);
> +        entry[i].msi_attrib.entry_nr = i;
> +        entry[i].msi_attrib.maskbit = is_mask_bit_support(control);
> +        entry[i].msi_attrib.masked = 1;
> +        entry[i].msi_attrib.pos = pos;
> +        if ( entry[i].msi_attrib.maskbit )
> +            entry[i].msi.mpos = mpos;
> +        entry[i].msi.nvec = 0;
> +        entry[i].dev = dev;
> +    }
> +    entry->msi.nvec = nvec;
>      entry->irq = irq;
> -    if ( is_mask_bit_support(control) )
> -        entry->msi.mpos = mpos;
> -    entry->dev = dev;
>      if ( entry->msi_attrib.maskbit )
>      {
>          u32 maskbits;
> @@ -693,7 +717,7 @@ static int msix_capability_init(struct p
>  
>      if ( desc )
>      {
> -        entry = alloc_msi_entry();
> +        entry = alloc_msi_entry(1);
>          if ( !entry )
>              return -ENOMEM;
>          ASSERT(msi);
> @@ -851,7 +875,6 @@ static int msix_capability_init(struct p
>  
>  static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc)
>  {
> -    int status;
>      struct pci_dev *pdev;
>      struct msi_desc *old_desc;
>  
> @@ -880,8 +903,7 @@ static int __pci_enable_msi(struct msi_i
>          pci_disable_msi(old_desc);
>      }
>  
> -    status = msi_capability_init(pdev, msi->irq, desc);
> -    return status;
> +    return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr);
>  }
>  
>  static void __pci_disable_msi(struct msi_desc *entry)
> @@ -1101,6 +1123,8 @@ int pci_restore_msi_state(struct pci_dev
>  
>      list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list )
>      {
> +        unsigned int i = 0, nr = 1;
> +
>          irq = entry->irq;
>          desc = &irq_desc[irq];
>  
> @@ -1110,30 +1134,58 @@ int pci_restore_msi_state(struct pci_dev
>  
>          if (desc->msi_desc != entry)
>          {
> +    bogus:
>              dprintk(XENLOG_ERR,
> -                    "Restore MSI for dev %04x:%02x:%02x:%x not set 
> before?\n",
> +                    "Restore MSI for %04x:%02x:%02x:%u entry %u not 
> set?\n",
>                      pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> -                    PCI_FUNC(pdev->devfn));
> +                    PCI_FUNC(pdev->devfn), i);
>              spin_unlock_irqrestore(&desc->lock, flags);
>              return -EINVAL;
>          }
>  
>          if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
> +        {
>              msi_set_enable(pdev, 0);
> +            nr = entry->msi.nvec;
> +        }
>          else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>              msix_set_enable(pdev, 0);
>  
>          msg = entry->msg;
>          write_msi_msg(entry, &msg);
>  
> -        msi_set_mask_bit(desc, entry->msi_attrib.masked);
> +        for ( i = 0; ; )
> +        {
> +            msi_set_mask_bit(desc, entry[i].msi_attrib.masked);
> +            spin_unlock_irqrestore(&desc->lock, flags);
> +
> +            if ( !--nr )
> +                break;
> +
> +            desc = &irq_desc[entry[++i].irq];
> +            spin_lock_irqsave(&desc->lock, flags);
> +            if ( desc->msi_desc != entry + i )
> +                goto bogus;
> +        }
> +
> +        spin_unlock_irqrestore(&desc->lock, flags);
>  
>          if ( entry->msi_attrib.type == PCI_CAP_ID_MSI )
> +        {
> +            unsigned int cpos = msi_control_reg(entry->msi_attrib.pos);
> +            u16 control = pci_conf_read16(pdev->seg, pdev->bus,
> +                                          PCI_SLOT(pdev->devfn),
> +                                          PCI_FUNC(pdev->devfn), cpos);
> +
> +            control &= ~PCI_MSI_FLAGS_QSIZE;
> +            multi_msi_enable(control, entry->msi.nvec);
> +            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
> +                             PCI_FUNC(pdev->devfn), cpos, control);
> +
>              msi_set_enable(pdev, 1);
> +        }
>          else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX )
>              msix_set_enable(pdev, 1);
> -
> -        spin_unlock_irqrestore(&desc->lock, flags);
>      }
>  
>      return 0;
> --- a/xen/arch/x86/physdev.c
> +++ b/xen/arch/x86/physdev.c
> @@ -140,8 +140,11 @@ int physdev_map_pirq(domid_t domid, int 
>          break;
>  
>      case MAP_PIRQ_TYPE_MSI:
> +        if ( !msi->table_base )
> +            msi->entry_nr = 1;
>          irq = *index;
>          if ( irq == -1 )
> +    case MAP_PIRQ_TYPE_MULTI_MSI:
>              irq = create_irq(NUMA_NO_NODE);
>  
>          if ( irq < nr_irqs_gsi || irq >= nr_irqs )
> @@ -179,6 +182,30 @@ int physdev_map_pirq(domid_t domid, int 
>                  goto done;
>              }
>          }
> +        else if ( type == MAP_PIRQ_TYPE_MULTI_MSI )
> +        {
> +            if ( msi->entry_nr <= 0 || msi->entry_nr > 32 )
> +                ret = -EDOM;
> +            else if ( msi->entry_nr != 1 && !iommu_intremap )
> +                ret = -EOPNOTSUPP;
> +            else
> +            {
> +                while ( msi->entry_nr & (msi->entry_nr - 1) )
> +                    msi->entry_nr += msi->entry_nr & -msi->entry_nr;
> +                pirq = get_free_pirqs(d, msi->entry_nr);
> +                if ( pirq < 0 )
> +                {
> +                    while ( (msi->entry_nr >>= 1) > 1 )
> +                        if ( get_free_pirqs(d, msi->entry_nr) > 0 )
> +                            break;
> +                    dprintk(XENLOG_G_ERR, "dom%d: no block of %d free 
> pirqs\n",
> +                            d->domain_id, msi->entry_nr << 1);
> +                    ret = pirq;
> +                }
> +            }
> +            if ( ret < 0 )
> +                goto done;
> +        }
>          else
>          {
>              pirq = get_free_pirq(d, type);
> @@ -210,8 +237,15 @@ int physdev_map_pirq(domid_t domid, int 
>   done:
>      spin_unlock(&d->event_lock);
>      spin_unlock(&pcidevs_lock);
> -    if ( (ret != 0) && (type == MAP_PIRQ_TYPE_MSI) && (*index == -1) )
> -        destroy_irq(irq);
> +    if ( ret != 0 )
> +        switch ( type )
> +        {
> +        case MAP_PIRQ_TYPE_MSI:
> +            if ( *index == -1 )
> +        case MAP_PIRQ_TYPE_MULTI_MSI:
> +                destroy_irq(irq);
> +            break;
> +        }
>   free_domain:
>      rcu_unlock_domain(d);
>      return ret;
> @@ -390,14 +424,22 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          if ( copy_from_guest(&map, arg, 1) != 0 )
>              break;
>  
> -        if ( map.type == MAP_PIRQ_TYPE_MSI_SEG )
> +        switch ( map.type )
>          {
> +        case MAP_PIRQ_TYPE_MSI_SEG:
>              map.type = MAP_PIRQ_TYPE_MSI;
>              msi.seg = map.bus >> 16;
> -        }
> -        else
> -        {
> +            break;
> +
> +        case MAP_PIRQ_TYPE_MULTI_MSI:
> +            if ( map.table_base )
> +                return -EINVAL;
> +            msi.seg = map.bus >> 16;
> +            break;
> +
> +        default:
>              msi.seg = 0;
> +            break;
>          }
>          msi.bus = map.bus;
>          msi.devfn = map.devfn;
> @@ -406,6 +448,8 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
>          ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq,
>                                 &msi);
>  
> +        if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI )
> +            map.entry_nr = msi.entry_nr;
>          if ( __copy_to_guest(arg, &map, 1) )
>              ret = -EFAULT;
>          break;
> --- a/xen/include/asm-x86/irq.h
> +++ b/xen/include/asm-x86/irq.h
> @@ -141,6 +141,7 @@ int map_domain_pirq(struct domain *d, in
>                             void *data);
>  int unmap_domain_pirq(struct domain *d, int pirq);
>  int get_free_pirq(struct domain *d, int type);
> +int get_free_pirqs(struct domain *, unsigned int nr);
>  void free_domain_pirqs(struct domain *d);
>  int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq);
>  int unmap_domain_pirq_emuirq(struct domain *d, int pirq);
> --- a/xen/include/asm-x86/msi.h
> +++ b/xen/include/asm-x86/msi.h
> @@ -148,7 +148,7 @@ int msi_free_irq(struct msi_desc *entry)
>  #define multi_msi_capable(control) \
>  	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
>  #define multi_msi_enable(control, num) \
> -	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
> +	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
>  #define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
>  #define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
>  #define msi_enable(control, num) multi_msi_enable(control, num); \
> --- a/xen/include/public/physdev.h
> +++ b/xen/include/public/physdev.h
> @@ -151,21 +151,22 @@ DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
>  #define MAP_PIRQ_TYPE_GSI               0x1
>  #define MAP_PIRQ_TYPE_UNKNOWN           0x2
>  #define MAP_PIRQ_TYPE_MSI_SEG           0x3
> +#define MAP_PIRQ_TYPE_MULTI_MSI         0x4
>  
>  #define PHYSDEVOP_map_pirq               13
>  struct physdev_map_pirq {
>      domid_t domid;
>      /* IN */
>      int type;
> -    /* IN */
> +    /* IN (ignored for ..._MULTI_MSI) */
>      int index;
>      /* IN or OUT */
>      int pirq;
> -    /* IN - high 16 bits hold segment for MAP_PIRQ_TYPE_MSI_SEG */
> +    /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */
>      int bus;
>      /* IN */
>      int devfn;
> -    /* IN */
> +    /* IN (also OUT for ..._MULTI_MSI) */
>      int entry_nr;
>      /* IN */
>      uint64_t table_base;

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Ping: [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
  2013-07-16 11:19   ` Andrew Cooper
@ 2013-08-05 13:12   ` Jan Beulich
  2013-08-08  9:02   ` Keir Fraser
  2 siblings, 0 replies; 18+ messages in thread
From: Jan Beulich @ 2013-08-05 13:12 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

Ping (I/O interface change, however trivial it might be)?

>>> On 16.07.13 at 12:15, "Jan Beulich" <JBeulich@suse.com> wrote:
> The requested vector count is to be passed in struct xen_pci_op's info
> field. Upon failure, if a smaller vector count might work, the backend
> will pass that smaller count in the value field (which so far is always
> being set to zero in the error path).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/include/public/io/pciif.h
> +++ b/xen/include/public/io/pciif.h
> @@ -46,6 +46,7 @@
>  #define XEN_PCI_OP_aer_resume		(7)
>  #define XEN_PCI_OP_aer_mmio		(8)
>  #define XEN_PCI_OP_aer_slotreset	(9)
> +#define XEN_PCI_OP_enable_multi_msi	(10)
>  
>  /* xen_pci_op error numbers */
>  #define XEN_PCI_ERR_success          (0)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 2/3] x86: enable multi-vector MSI
  2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
  2013-07-16 11:36   ` Andrew Cooper
  2013-08-05 13:11   ` Ping: " Jan Beulich
@ 2013-08-08  9:02   ` Keir Fraser
  2 siblings, 0 replies; 18+ messages in thread
From: Keir Fraser @ 2013-08-08  9:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: xiantao.zhang

On 16/07/2013 11:14, "Jan Beulich" <JBeulich@suse.com> wrote:

> This implies
> - extending the public interface to have a way to request a block of
>   MSIs
> - allocating a block of contiguous pIRQ-s for the target domain (but
>   note that the Xen IRQs allocated have no need of being contiguous)
> - repeating certain operations for all involved IRQs
> - fixing multi_msi_enable()
> - adjusting the mask bit accesses for maskable MSIs
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Keir Fraser <keir@xen.org>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH resend 3/3] pciif: add multi-vector-MSI command
  2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
  2013-07-16 11:19   ` Andrew Cooper
  2013-08-05 13:12   ` Ping: " Jan Beulich
@ 2013-08-08  9:02   ` Keir Fraser
  2 siblings, 0 replies; 18+ messages in thread
From: Keir Fraser @ 2013-08-08  9:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: xiantao.zhang

On 16/07/2013 11:15, "Jan Beulich" <JBeulich@suse.com> wrote:

> The requested vector count is to be passed in struct xen_pci_op's info
> field. Upon failure, if a smaller vector count might work, the backend
> will pass that smaller count in the value field (which so far is always
> being set to zero in the error path).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Keir Fraser <keir@xen.org>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2013-08-08  9:02 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-16 10:00 [PATCH resend 0/3] x86/IOMMU: multi-vector MSI Jan Beulich
2013-07-16 10:13 ` [PATCH resend 1/3] VT-d: enable for " Jan Beulich
2013-07-16 11:15   ` Andrew Cooper
2013-07-16 11:32     ` Jan Beulich
2013-07-17  9:50       ` Andrew Cooper
2013-07-18 10:48         ` Zhang, Xiantao
2013-07-16 10:14 ` [PATCH resend 2/3] x86: enable " Jan Beulich
2013-07-16 11:36   ` Andrew Cooper
2013-07-16 11:48     ` Jan Beulich
2013-07-17 10:02       ` Andrew Cooper
2013-08-05 13:11   ` Ping: " Jan Beulich
2013-08-08  9:02   ` Keir Fraser
2013-07-16 10:15 ` [PATCH resend 3/3] pciif: add multi-vector-MSI command Jan Beulich
2013-07-16 11:19   ` Andrew Cooper
2013-07-16 11:35     ` Jan Beulich
2013-07-17  9:02       ` Andrew Cooper
2013-08-05 13:12   ` Ping: " Jan Beulich
2013-08-08  9:02   ` Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.