From: Micha Nelissen <micha@neli.hopto.org>
To: Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, "H. Peter Anvin" <hpa@zytor.com>,
	x86@kernel.org, "Venkatesh Pallipadi (Venki)" <venki@google.com>,
	Jesse Barnes <jbarnes@virtuousgeek.org>,
	linux-kernel@vger.kernel.org, linux-pci@vger.kernel.org,
	Matthew Wilcox <matthew@wil.cx>
Subject: [PATCH] Add support for multiple MSI on x86
Date: Sun, 13 Feb 2011 21:25:21 +0100
Message-ID: <4D583E31.4070507@neli.hopto.org>

[-- Attachment #1: Type: text/plain, Size: 53 bytes --]

This patch is based on an earlier patch from Matthew Wilcox.
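
Background for reviewers: a device using multiple MSI derives its vector
numbers by modifying the low-order bits of the message data, so the
vectors assigned to it must form a naturally aligned, power-of-two sized
block. In addition, older Intel CPUs queue at most two pending interrupts
per priority class (a class being the vector number shifted right by 4),
so the allocator spreads blocks apart rather than packing them. A minimal
sketch of the alignment rule (illustrative only, not code from the patch):

	/*
	 * A block of `count' vectors must start at a multiple of `count';
	 * the device then signals vector (base | i) for message i.
	 */
	static unsigned next_aligned_block(unsigned current, unsigned count)
	{
		return (current & ~(count - 1)) + count;
	}

This is the same stepping that __assign_irq_vector_block() below performs
when scanning for a free block of vectors.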

[-- Attachment #2: 0001-Add-support-for-multiple-MSI-on-x86.patch --]
[-- Type: text/plain, Size: 14423 bytes --]

---
 arch/x86/kernel/apic/io_apic.c |  310 ++++++++++++++++++++++++++++++++++------
 arch/x86/kernel/hpet.c         |    2 +-
 drivers/pci/htirq.c            |    2 +-
 include/linux/irq.h            |    3 +-
 4 files changed, 271 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index fadcd74..5e9decc 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -249,11 +249,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
 	return cfg;
 }
 
-static int alloc_irq_from(unsigned int from, int node)
-{
-	return irq_alloc_desc_from(from, node);
-}
-
 static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
 {
 	free_irq_cfg(at, cfg);
@@ -1037,6 +1032,39 @@ void unlock_vector_lock(void)
 	raw_spin_unlock(&vector_lock);
 }
 
+/*
+ * The P6 family and Pentium processors (presumably also earlier processors)
+ * can queue no more than two interrupts per priority level, and will ignore
+ * other interrupts that are received within the same priority level (the
+ * priority level is the vector number shifted right by 4), so we try to
+ * spread these out a bit to avoid this happening.
+ *
+ * Pentium 4, Xeon and later processors do not have this limitation.
+ * It is unknown what limitations AMD, Cyrix, Transmeta, VIA, IDT and
+ * other manufacturers have.
+ */
+static int many_vectors_per_prio(void)
+{
+	struct cpuinfo_x86 *c;
+	static char init, result;
+	if (init)
+		return result;
+
+	c = &boot_cpu_data;
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		if (c->x86 > 6 ||
+		    ((c->x86 == 6) && (c->x86_model >= 13)))
+			result = 1;
+		break;
+	default:
+		break;
+	}
+
+	init = 1;
+	return result;
+}
+
 static int
 __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
@@ -1117,13 +1145,110 @@ next:
 	return err;
 }
 
+static int __assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+	static int current_vector = FIRST_EXTERNAL_VECTOR;
+	unsigned int old_vector;
+	unsigned i, cpu;
+	int err;
+	struct irq_cfg *cfg;
+	cpumask_var_t tmp_mask;
+
+	BUG_ON(irq + count > NR_IRQS);
+	BUG_ON(count & (count - 1));
+
+	for (i = 0; i < count; i++) {
+		cfg = irq_cfg(irq + i);
+		if (cfg->move_in_progress)
+			return -EBUSY;
+	}
+
+	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	cfg = irq_cfg(irq);
+	old_vector = cfg->vector;
+	if (old_vector) {
+		err = 0;
+		cpumask_and(tmp_mask, mask, cpu_online_mask);
+		cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+		if (!cpumask_empty(tmp_mask))
+			goto out;
+	}
+
+	/* Only try and allocate irqs on cpus that are present */
+	err = -ENOSPC;
+	for_each_cpu_and(cpu, mask, cpu_online_mask) {
+		int new_cpu;
+		int vector;
+
+		apic->vector_allocation_domain(cpu, tmp_mask);
+
+		vector = current_vector & ~(count - 1);
+next:
+		vector += count;
+		if (vector + count >= first_system_vector) {
+			vector = FIRST_EXTERNAL_VECTOR & ~(count - 1);
+			if (vector < FIRST_EXTERNAL_VECTOR)
+				vector += count;
+		}
+		if (unlikely((current_vector & ~(count - 1)) == vector))
+			continue;
+
+		for (i = 0; i < count; i++)
+			if (test_bit(vector + i, used_vectors))
+				goto next;
+
+		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+			for (i = 0; i < count; i++) {
+				if (per_cpu(vector_irq, new_cpu)[vector + i] != -1)
+					goto next;
+			}
+		}
+		/* Found one! */
+		current_vector = vector + count - 1;
+		for (i = 0; i < count; i++) {
+			cfg = irq_cfg(irq + i);
+			if (old_vector) {
+				cfg->move_in_progress = 1;
+				cpumask_copy(cfg->old_domain, cfg->domain);
+			}
+			for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+				per_cpu(vector_irq, new_cpu)[vector + i] = irq + i;
+			cfg->vector = vector + i;
+			cpumask_copy(cfg->domain, tmp_mask);
+		}
+		err = 0;
+		break;
+	}
+out:
+	free_cpumask_var(tmp_mask);
+	return err;
+}
+
 int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, cfg, mask);
+	if (many_vectors_per_prio())
+		err = __assign_irq_vector_block(irq, 1, mask);
+	else
+		err = __assign_irq_vector(irq, cfg, mask);
+	raw_spin_unlock_irqrestore(&vector_lock, flags);
+	return err;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int
+assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+	int err;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&vector_lock, flags);
+	err = __assign_irq_vector_block(irq, count, mask);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
@@ -2200,14 +2325,34 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			  unsigned int *dest_id)
 {
 	struct irq_cfg *cfg = data->chip_data;
+	unsigned irq;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
 		return -1;
 
-	if (assign_irq_vector(data->irq, data->chip_data, mask))
-		return -1;
+	irq = data->irq;
+	cfg = data->chip_data;
 
-	cpumask_copy(data->affinity, mask);
+	if (many_vectors_per_prio()) {
+		struct msi_desc *msi_desc = data->msi_desc;
+		unsigned i, count = 1;
+
+		if (msi_desc)
+			count = 1 << msi_desc->msi_attrib.multiple;
+
+		/* Multiple MSIs all go to the same destination */
+		if (assign_irq_vector_block(irq, count, mask))
+			return -1;
+		for (i = 0; i < count; i++) {
+			data = &irq_to_desc(irq + i)->irq_data;
+			cpumask_copy(data->affinity, mask);
+		}
+	} else {
+		if (assign_irq_vector(irq, cfg, mask))
+			return BAD_APICID;
+
+		cpumask_copy(data->affinity, mask);
+	}
 
 	*dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
 	return 0;
@@ -3053,7 +3198,7 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-unsigned int create_irq_nr(unsigned int from, int node)
+unsigned int create_irq_nr(unsigned int from, unsigned count, int node)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
@@ -3063,25 +3208,31 @@ unsigned int create_irq_nr(unsigned int from, int node)
 	if (from < nr_irqs_gsi)
 		from = nr_irqs_gsi;
 
-	irq = alloc_irq_from(from, node);
+	irq = irq_alloc_descs(-1, from, count, node);
 	if (irq < 0)
 		return 0;
 	cfg = alloc_irq_cfg(irq, node);
 	if (!cfg) {
-		free_irq_at(irq, NULL);
+		irq_free_descs(irq, count);
 		return 0;
 	}
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
-		ret = irq;
+	if (many_vectors_per_prio()) {
+		if (!__assign_irq_vector_block(irq, count, apic->target_cpus()))
+			ret = irq;
+	} else {
+		if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+			ret = irq;
+	}
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	if (ret) {
 		set_irq_chip_data(irq, cfg);
 		irq_clear_status_flags(irq, IRQ_NOREQUEST);
 	} else {
-		free_irq_at(irq, cfg);
+		free_irq_cfg(irq, cfg);
+		irq_free_descs(irq, count);
 	}
 	return ret;
 }
@@ -3093,7 +3244,7 @@ int create_irq(void)
 	int irq;
 
 	irq_want = nr_irqs_gsi;
-	irq = create_irq_nr(irq_want, node);
+	irq = create_irq_nr(irq_want, 1, node);
 
 	if (irq == 0)
 		irq = -1;
@@ -3121,7 +3272,7 @@ void destroy_irq(unsigned int irq)
  */
 #ifdef CONFIG_PCI_MSI
 static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
-			   struct msi_msg *msg, u8 hpet_id)
+			   unsigned count, struct msi_msg *msg, u8 hpet_id)
 {
 	struct irq_cfg *cfg;
 	int err;
@@ -3131,7 +3282,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
 		return -ENXIO;
 
 	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
+	if (count == 1)
+		err = assign_irq_vector(irq, cfg, apic->target_cpus());
+	else
+		err = assign_irq_vector_block(irq, count, apic->target_cpus());
 	if (err)
 		return err;
 
@@ -3307,47 +3461,99 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 	return index;
 }
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+			 unsigned count, int base_irq)
 {
 	struct msi_msg msg;
+	unsigned irq;
 	int ret;
 
-	ret = msi_compose_msg(dev, irq, &msg, -1);
+	ret = msi_compose_msg(dev, base_irq, count, &msg, -1);
 	if (ret < 0)
 		return ret;
 
-	set_irq_msi(irq, msidesc);
-	write_msi_msg(irq, &msg);
+	msidesc->msi_attrib.multiple = order_base_2(count);
 
-	if (irq_remapped(get_irq_chip_data(irq))) {
-		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
-	} else
-		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+	/* loop backwards so that msidesc->irq ends up set to the base irq */
+	for (irq = base_irq + count - 1; irq >= base_irq; irq--) {
+		set_irq_msi(irq, msidesc);
+		if (irq_remapped(get_irq_chip_data(irq))) {
+			irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+			set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+		} else
+			set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+	}
 
-	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+	write_msi_msg(base_irq, &msg);
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d-%d for MSI/MSI-X\n",
+		base_irq, base_irq + count - 1);
 
 	return 0;
 }
 
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int setup_msi_irqs(struct pci_dev *dev, int nvec)
+{
+	unsigned base_irq, alloc, i;
+	int ret, node;
+	struct msi_desc *msidesc = list_first_entry(&dev->msi_list,
+							struct msi_desc, list);
+	struct intel_iommu *iommu = map_dev_to_ir(dev);
+
+	if (intr_remapping_enabled && !iommu)
+		return -ENOENT;
+	if (nvec > 1 && !many_vectors_per_prio())
+		return 1;
+
+	/*
+	 * MSI only lets you program the device with nvec that is a power
+	 * of two.  We could possibly trust the device driver that it'll
+	 * only use the number it asked for, but to be safe, let's reserve
+	 * all the interrupts we're telling the device it can use.
+	 */
+	alloc = roundup_pow_of_two(nvec);
+	node = dev_to_node(&dev->dev);
+	base_irq = create_irq_nr(nr_irqs_gsi, alloc, node);
+	if (base_irq == 0)
+		return (alloc > 1) ? alloc / 2 : -ENOSPC;
+
+	if (intr_remapping_enabled) {
+		ret = msi_alloc_irte(dev, base_irq, alloc);
+		if (ret < 0)
+			goto error;
+
+		for (i = 1; i < alloc; i++)
+			set_irte_irq(base_irq + i, iommu, ret, i);
+	}
+
+	ret = setup_msi_irq(dev, msidesc, alloc, base_irq);
+	if (ret < 0)
+		goto error;
+
+	return 0;
+
+error:
+	for (i = 0; i < alloc; i++)
+		destroy_irq(base_irq + i);
+	return ret;
+}
+
+static int setup_msix_irqs(struct pci_dev *dev, int nvec)
 {
 	int node, ret, sub_handle, index = 0;
+	struct intel_iommu *iommu = map_dev_to_ir(dev);
 	unsigned int irq, irq_want;
 	struct msi_desc *msidesc;
-	struct intel_iommu *iommu = NULL;
 
-	/* x86 doesn't support multiple MSI yet */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
+	if (intr_remapping_enabled && !iommu)
+		return -ENOENT;
 
 	node = dev_to_node(&dev->dev);
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = create_irq_nr(irq_want, node);
+		irq = create_irq_nr(irq_want, 1, node);
 		if (irq == 0)
-			return -1;
+			return -ENOSPC;
 		irq_want = irq + 1;
 		if (!intr_remapping_enabled)
 			goto no_ir;
@@ -3363,11 +3569,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 				goto error;
 			}
 		} else {
-			iommu = map_dev_to_ir(dev);
-			if (!iommu) {
-				ret = -ENOENT;
-				goto error;
-			}
 			/*
 			 * setup the mapping between the irq and the IRTE
 			 * base index, the sub_handle pointing to the
@@ -3376,7 +3577,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 			set_irte_irq(irq, iommu, index, sub_handle);
 		}
 no_ir:
-		ret = setup_msi_irq(dev, msidesc, irq);
+		ret = setup_msi_irq(dev, msidesc, 1, irq);
 		if (ret < 0)
 			goto error;
 		sub_handle++;
@@ -3388,11 +3589,34 @@ error:
 	return ret;
 }
 
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	if (type == PCI_CAP_ID_MSI) {
+		return setup_msi_irqs(dev, nvec);
+	} else {
+		return setup_msix_irqs(dev, nvec);
+	}
+}
+
 void native_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
 }
 
+void native_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *desc;
+	unsigned i;
+
+	list_for_each_entry(desc, &dev->msi_list, list) {
+		if (desc->irq == 0)
+			continue;
+		for (i = 0; i < (1 << desc->msi_attrib.multiple); i++) {
+			destroy_irq(desc->irq + i);
+		}
+	}
+}
+
 #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
 static int
@@ -3437,7 +3661,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg, -1);
+	ret = msi_compose_msg(NULL, irq, 1, &msg, -1);
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
@@ -3515,7 +3739,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
 			return -1;
 	}
 
-	ret = msi_compose_msg(NULL, irq, &msg, id);
+	ret = msi_compose_msg(NULL, irq, 1, &msg, id);
 	if (ret < 0)
 		return ret;
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4ff5968..cce3afd 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -499,7 +499,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
 {
 	unsigned int irq;
 
-	irq = create_irq_nr(0, -1);
+	irq = create_irq_nr(0, 1, -1);
 	if (!irq)
 		return -EINVAL;
 
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 834842a..2b48cc3 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -120,7 +120,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_hi = 0xffffffff;
 
 	node = dev_to_node(&dev->dev);
-	irq = create_irq_nr(0, node);
+	irq = create_irq_nr(0, 1, node);
 
 	if (irq <= 0) {
 		kfree(cfg);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index abde252..842a8c4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -322,7 +322,8 @@ static inline void set_irq_probe(unsigned int irq)
 }
 
 /* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want, int node);
+extern unsigned int create_irq_nr(unsigned int irq_want, unsigned count,
+				  int node);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
-- 
1.5.6.5
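
A note on the allocation strategy in setup_msi_irqs() above: MSI can only
be programmed with a power-of-two number of vectors, so nvec is rounded
up before reserving descriptors, and on failure a smaller power of two is
returned so that the PCI core can retry (a positive return value from
arch_setup_msi_irqs() means "try again with this many vectors"). Roughly,
as a sketch rather than a quote from the patch:

	alloc = roundup_pow_of_two(nvec);	/* e.g. nvec = 3 -> alloc = 4 */
	if (allocation_failed)			/* hypothetical condition */
		return (alloc > 1) ? alloc / 2 : -ENOSPC;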


Thread overview: 14+ messages
2011-02-13 20:25 Micha Nelissen [this message]
2011-02-14 12:34 ` [PATCH] Add support for multiple MSI on x86 Ingo Molnar
2011-02-14 19:47   ` Micha Nelissen
2011-02-15  2:38     ` Ingo Molnar
2011-02-14 20:55 ` Thomas Gleixner
2011-03-04 18:37   ` Jesse Barnes
2011-06-17 17:12   ` Matthew Wilcox
2011-03-04 18:36 ` Jesse Barnes
2011-03-04 19:53   ` Micha Nelissen
2011-03-08 21:05     ` Thomas Gleixner
     [not found]       ` <35bd5f56-658b-48e3-a376-b07350a29cf6@email.android.com>
2011-03-08 21:16         ` Thomas Gleixner
     [not found]           ` <71ed11a4-aff7-4eb6-b037-0e097bb96444@email.android.com>
2011-03-08 22:13             ` Thomas Gleixner
2011-03-10  2:05           ` Roland Dreier
2011-03-10 15:33             ` Clemens Ladisch
