linux-hyperv.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Wei Liu <wei.liu@kernel.org>
To: Linux on Hyper-V List <linux-hyperv@vger.kernel.org>
Cc: virtualization@lists.linux-foundation.org,
	Linux Kernel List <linux-kernel@vger.kernel.org>,
	Michael Kelley <mikelley@microsoft.com>,
	Vineeth Pillai <viremana@linux.microsoft.com>,
	Sunil Muthuswamy <sunilmut@microsoft.com>,
	Nuno Das Neves <nudasnev@microsoft.com>,
	Wei Liu <wei.liu@kernel.org>,
	"K. Y. Srinivasan" <kys@microsoft.com>,
	Haiyang Zhang <haiyangz@microsoft.com>,
	Stephen Hemminger <sthemmin@microsoft.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	x86@kernel.org (maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)),
	"H. Peter Anvin" <hpa@zytor.com>
Subject: [PATCH RFC v1 16/18] x86/hyperv: implement MSI domain for root partition
Date: Mon, 14 Sep 2020 11:59:25 +0000	[thread overview]
Message-ID: <20200914115928.83184-8-wei.liu@kernel.org> (raw)
In-Reply-To: <20200914112802.80611-1-wei.liu@kernel.org>

When Linux runs as the root partition on Microsoft Hypervisor, its
interrupts are remapped.  Linux will need to explicitly map and unmap
interrupts for hardware.

Implement an MSI domain that issues the correct hypercalls, and initialize
this irqdomain in the PCI init hook.

Co-developed-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Signed-off-by: Sunil Muthuswamy <sunilmut@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 arch/x86/hyperv/Makefile    |   2 +-
 arch/x86/hyperv/hv_init.c   |  14 ++
 arch/x86/hyperv/irqdomain.c | 355 ++++++++++++++++++++++++++++++++++++
 3 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/hyperv/irqdomain.c

diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 565358020921..48e2c51464e8 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y			:= hv_init.o mmu.o nested.o
+obj-y			:= hv_init.o mmu.o nested.o irqdomain.o
 obj-$(CONFIG_X86_64)	+= hv_apic.o hv_proc.o
 
 ifdef CONFIG_X86_64
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 26233aebc86c..d26d9573ceab 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -277,10 +277,24 @@ static int hv_cpu_die(unsigned int cpu)
 	return 0;
 }
 
+int hv_setup_msi_domain_irqs(struct pci_dev *dev, int nvec, int type);
+void hv_teardown_msi_irq(unsigned int irq);
+void hv_teardown_msi_irqs(struct pci_dev *dev);
+int hv_init_msi_domain(void);
+
 static int __init hv_pci_init(void)
 {
 	int gen2vm = efi_enabled(EFI_BOOT);
 
+	if (hv_root_partition) {
+		if (hv_init_msi_domain() < 0)
+			panic("Failed to allocate MSI domain\n");
+
+		x86_msi.setup_msi_irqs = hv_setup_msi_domain_irqs;
+		x86_msi.teardown_msi_irq = hv_teardown_msi_irq;
+		x86_msi.teardown_msi_irqs = hv_teardown_msi_irqs;
+	}
+
 	/*
 	 * For Generation-2 VM, we exit from pci_arch_init() by returning 0.
 	 * The purpose is to suppress the harmless warning:
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
new file mode 100644
index 000000000000..6ffe32d9cde5
--- /dev/null
+++ b/arch/x86/hyperv/irqdomain.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
+//
+// Authors:
+//   Sunil Muthuswamy <sunilmut@microsoft.com>
+//   Wei Liu <wei.liu@kernel.org>
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/mshyperv.h>
+
+struct rid_data {
+	struct pci_dev *bridge;	/* device that supplied the adopted alias; NULL if none */
+	u32 rid;		/* requester ID in PCI_DEVID(bus, devfn) form */
+};
+
+/* DMA-alias walk callback: adopt alias when it or pdev is off the current RID's bus. */
+static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+	struct rid_data *res = data;
+	u8 cur_bus = PCI_BUS_NUM(res->rid);
+
+	if (pdev->bus->number == cur_bus && PCI_BUS_NUM(alias) == cur_bus)
+		return 0;
+	res->bridge = pdev;
+	res->rid = alias;
+	return 0;
+}
+
+static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+{
+	union hv_device_id dev_id;
+	struct rid_data data = {
+		.bridge = NULL,
+		.rid = PCI_DEVID(dev->bus->number, dev->devfn)
+	};
+	/* Build the hypervisor device ID of dev: walk its DMA aliases first. */
+	pci_for_each_dma_alias(dev, get_rid_cb, &data);
+	/* Clear the 64-bit value before filling in the individual fields. */
+	dev_id.as_uint64 = 0;
+	dev_id.device_type = HV_DEVICE_TYPE_PCI;
+	dev_id.pci.segment = pci_domain_nr(dev->bus);
+	/* data.rid is dev's own RID unless get_rid_cb() adopted an alias. */
+	dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+	dev_id.pci.bdf.device = PCI_SLOT(data.rid);
+	dev_id.pci.bdf.function = PCI_FUNC(data.rid);
+	dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+	/* data.bridge is non-NULL only if get_rid_cb() adopted an alias. */
+	if (data.bridge) {
+		int pos;
+
+		/*
+		 * Microsoft Hypervisor requires a bus range when the bridge is
+		 * running in PCI-X mode.
+		 *
+		 * To distinguish conventional vs PCI-X bridge, we can check
+		 * the bridge's PCI-X Secondary Status Register, Secondary Bus
+		 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+		 * Specification Revision 1.0 5.2.2.1.3.
+		 *
+		 * Value zero means it is in conventional mode, otherwise it is
+		 * in PCI-X mode.
+		 */
+
+		pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+		if (pos) {
+			u16 status;
+
+			pci_read_config_word(data.bridge, pos +
+					PCI_X_BRIDGE_SSTATUS, &status);
+
+			if (status & PCI_X_SSTATUS_FREQ) {
+				/* Non-zero, PCI-X mode */
+				u8 sec_bus, sub_bus;
+
+				dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+
+				pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
+				dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
+				pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
+				dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
+			}
+		}
+	}
+
+	return dev_id;
+}
+
+/*
+ * Map one fixed, edge-triggered MSI vector of dev to vcpu by hypercall. Returns
+ * the masked status; on HV_STATUS_SUCCESS the new entry is copied to *entry.
+ */
+static int hv_map_msi_interrupt(struct pci_dev *dev, int vcpu, int vector,
+				struct hv_interrupt_entry *entry)
+{
+	struct hv_input_map_device_interrupt *input;
+	struct hv_output_map_device_interrupt *output;
+	struct hv_device_interrupt_descriptor *intr_desc;
+	unsigned long flags;
+	int status;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	intr_desc = &input->interrupt_descriptor;
+	memset(input, 0, sizeof(*input));
+	input->partition_id = hv_current_partition_id;
+	input->device_id = hv_build_pci_dev_id(dev).as_uint64;
+	intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+	intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+	intr_desc->vector_count = 1;
+	intr_desc->target.vector = vector;
+	/* NOTE(review): vcpu is not range-checked against the size of vp_mask. */
+	__set_bit(vcpu, (unsigned long *)&intr_desc->target.vp_mask);
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, 0, input, output) &
+			 HV_HYPERCALL_RESULT_MASK;
+	/* The output page is per-CPU; copy from it before IRQs are re-enabled. */
+	if (status == HV_STATUS_SUCCESS)
+		*entry = output->interrupt_entry;
+	local_irq_restore(flags);
+	if (status != HV_STATUS_SUCCESS)
+		pr_err("%s: hypercall failed, status %d\n", __func__, status);
+	return status;
+}
+
+static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+{
+	/* The entry supplies data and the low address word; the high word is 0. */
+	msg->data = entry->msi_entry.data.as_uint32;
+	msg->address_lo = entry->msi_entry.address.as_uint32;
+	msg->address_hi = 0;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+/*
+ * irq_compose_msi_msg callback: (re)map the interrupt in the hypervisor for
+ * its vector and first online effective-affinity CPU, keep the entry in
+ * data->chip_data and convert it to msg. On failure msg is left unmodified.
+ */
+static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	struct msi_desc *msidesc = irq_data_get_msi_desc(data);
+	struct pci_dev *dev = msi_desc_to_pci_dev(msidesc);
+	struct hv_interrupt_entry out_entry, *stored_entry;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	struct cpumask *affinity;
+	int cpu, vcpu;
+	int status;
+
+	if (!cfg) {
+		pr_debug("%s: cfg is NULL\n", __func__);
+		return;
+	}
+
+	affinity = irq_data_get_effective_affinity_mask(data);
+	cpu = cpumask_first_and(affinity, cpu_online_mask);
+	if (cpu >= nr_cpu_ids) {
+		pr_debug("%s: no online CPU in effective affinity\n", __func__);
+		return;
+	}
+	vcpu = hv_cpu_number_to_vp_number(cpu);
+
+	if (data->chip_data) {
+		/*
+		 * This interrupt is already mapped. Let's unmap first.
+		 *
+		 * We don't use retarget interrupt hypercalls here because
+		 * Microsoft Hypervisor doesn't allow root to change the vector
+		 * or specify VPs outside of the set that is initially used
+		 * during mapping.
+		 */
+		stored_entry = data->chip_data;
+		data->chip_data = NULL;
+		status = hv_unmap_msi_interrupt(dev, stored_entry);
+		kfree(stored_entry);
+		if (status != HV_STATUS_SUCCESS) {
+			pr_debug("%s: failed to unmap, status %d\n", __func__, status);
+			return;
+		}
+	}
+
+	stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
+	if (!stored_entry) {
+		pr_debug("%s: failed to allocate chip data\n", __func__);
+		return;
+	}
+
+	status = hv_map_msi_interrupt(dev, vcpu, cfg->vector, &out_entry);
+	if (status != HV_STATUS_SUCCESS) {
+		kfree(stored_entry);
+		return;
+	}
+	*stored_entry = out_entry;
+	data->chip_data = stored_entry;
+	entry_to_msi_msg(&out_entry, msg);
+}
+
+/*
+ * IRQ chip for PCI/PCI-X/PCI-Express devices that implement the MSI or MSI-X
+ * capability. Composing a message maps the IRQ via hv_irq_compose_msi_msg().
+ */
+static struct irq_chip hv_pci_msi_controller = {
+	.name			= "HV-PCI-MSI",
+	.irq_unmask		= pci_msi_unmask_irq,
+	.irq_mask		= pci_msi_mask_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= hv_irq_compose_msi_msg,
+	.irq_set_affinity	= msi_domain_set_affinity,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+	.get_hwirq	= pci_msi_get_hwirq,	/* all three callbacks are defined outside this file */
+	.msi_prepare	= pci_msi_prepare,
+	.set_desc	= pci_msi_set_desc,
+};
+
+static struct msi_domain_info hv_pci_msi_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_PCI_MSIX,	/* MSI-X allowed; no multi-MSI flag is set */
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &hv_pci_msi_controller,
+	.handler	= handle_edge_irq,	/* mappings are requested as edge-triggered */
+	.handler_name	= "edge",
+};
+
+static struct irq_domain *hv_msi_domain;
+
+/* Create hv_msi_domain on top of x86_vector_domain. Returns 0 or -ENOMEM. */
+int hv_init_msi_domain(void)
+{
+	struct fwnode_handle *fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
+
+	if (!fn) {
+		pr_debug("irq_domain_alloc_named_fwnode failed\n");
+		return -ENOMEM;
+	}
+	hv_msi_domain = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info,
+					  x86_vector_domain);
+	if (!hv_msi_domain) {
+		/* A live domain keeps referencing fn, so free it only on failure. */
+		irq_domain_free_fwnode(fn);
+		pr_warn("Failed to initialize irqdomain for MSI/MSI-X.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* setup_msi_irqs hook: returns 1 for multi-message MSI, else msi_domain_alloc_irqs(). */
+int hv_setup_msi_domain_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int ret;
+
+	if (type == PCI_CAP_ID_MSI && nvec > 1) {
+		pr_debug("%s: Multi-message PCI MSI not supported\n", __func__);
+		return 1;
+	}
+
+	ret = msi_domain_alloc_irqs(hv_msi_domain, &dev->dev, nvec);
+	if (ret)
+		pr_debug("%s: msi_domain_alloc_irqs returned: %d\n", __func__, ret);
+	return ret;
+}
+
+/* Issue HVCALL_UNMAP_DEVICE_INTERRUPT for old_entry of device id; returns the masked status. */
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+{
+	struct hv_input_unmap_device_interrupt *args;
+	unsigned long irq_flags;
+	int ret;
+
+	local_irq_save(irq_flags);
+
+	args = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(args, 0, sizeof(*args));
+	args->device_id = id;
+	args->partition_id = hv_current_partition_id;
+	args->interrupt_entry = *old_entry;
+
+	ret = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, 0, 0, args, NULL) &
+	      HV_HYPERCALL_RESULT_MASK;
+
+	local_irq_restore(irq_flags);
+
+	return ret;
+}
+
+/* Unmap old_entry for dev; the status is already masked by hv_unmap_interrupt(). */
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+{
+	return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+}
+
+/*
+ * Undo the hypervisor mapping recorded in the chip_data of irq, if there is
+ * one, and free that record.
+ *
+ * The record is released before the hypercall, so a failed unmap is only
+ * logged. msidesc is not used by this function.
+ */
+static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+	struct hv_interrupt_entry old_entry;
+	struct irq_desc *desc;
+	struct irq_data *data;
+	int status;
+
+	desc = irq_to_desc(irq);
+	if (!desc) {
+		pr_debug("%s: no irq desc\n", __func__);
+		return;
+	}
+
+	/* irq_data is a member of the descriptor, so its address is never NULL. */
+	data = &desc->irq_data;
+
+	if (!data->chip_data) {
+		pr_debug("%s: no chip data\n", __func__);
+		return;
+	}
+
+	/* Work on a stack copy so the heap record can be freed right away. */
+	old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+
+	kfree(data->chip_data);
+	data->chip_data = NULL;
+
+	status = hv_unmap_msi_interrupt(dev, &old_entry);
+	if (status != HV_STATUS_SUCCESS)
+		pr_err("%s: hypercall failed, status %d\n", __func__, status);
+}
+
+void hv_teardown_msi_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);	/* frees this single IRQ; no hypervisor unmap is done here */
+}
+
+void hv_teardown_msi_irqs(struct pci_dev *dev)
+{
+	struct msi_desc *entry;
+	int i;
+	/* Unmap in the hypervisor first, then free, each IRQ of each entry. */
+	for_each_pci_msi_entry(entry, dev) {
+		if (!entry->irq)
+			continue;
+		for (i = 0; i < entry->nvec_used; i++) {
+			hv_teardown_msi_irq_common(dev, entry, entry->irq + i);
+			hv_teardown_msi_irq(entry->irq + i);
+		}
+	}
+}
-- 
2.20.1


  parent reply	other threads:[~2020-09-14 17:25 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-14 11:27 [PATCH RFC v1 00/18] Introducing Linux root partition support for Microsoft Hypervisor Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 01/18] asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 02/18] x86/hyperv: detect if Linux is the root partition Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 03/18] Drivers: hv: vmbus: skip VMBus initialization if Linux is root Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 04/18] iommu/hyperv: don't setup IRQ remapping when running as root Wei Liu
2020-09-18  9:12   ` Joerg Roedel
2020-09-14 11:27 ` [PATCH RFC v1 05/18] clocksource/hyperv: use MSR-based access if " Wei Liu
2020-09-15 10:10   ` Vitaly Kuznetsov
2020-09-15 10:32     ` Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 06/18] x86/hyperv: allocate output arg pages if required Wei Liu
2020-09-15 10:16   ` Vitaly Kuznetsov
2020-09-15 12:43     ` Wei Liu
2020-09-16 15:42       ` Wei Liu
2020-09-16 16:10         ` Vitaly Kuznetsov
2020-09-14 11:27 ` [PATCH RFC v1 07/18] x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary Wei Liu
2020-09-15 10:27   ` Vitaly Kuznetsov
2020-09-16 16:32     ` Wei Liu
2020-10-27 12:19       ` Wei Liu
2020-09-14 11:27 ` [PATCH RFC v1 08/18] x86/hyperv: handling hypercall page setup for root Wei Liu
2020-09-15 10:32   ` Vitaly Kuznetsov
2020-09-15 10:37     ` Wei Liu
2020-09-15 11:02       ` Vitaly Kuznetsov
2020-09-15 11:16         ` Wei Liu
2020-09-15 11:23           ` Vitaly Kuznetsov
2020-09-15 11:27             ` Wei Liu
2020-09-16 21:34       ` [EXTERNAL] " Sunil Muthuswamy
2020-09-17 11:06         ` Vitaly Kuznetsov
2020-09-14 11:59 ` [PATCH RFC v1 09/18] x86/hyperv: provide a bunch of helper functions Wei Liu
2020-09-15 11:00   ` Vitaly Kuznetsov
2020-10-27 13:10     ` Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 10/18] x86/hyperv: implement and use hv_smp_prepare_cpus Wei Liu
2020-09-15 11:14   ` Vitaly Kuznetsov
2020-10-27 13:47     ` Wei Liu
2020-10-27 13:56       ` Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 11/18] asm-generic/hyperv: update hv_msi_entry Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 12/18] asm-generic/hyperv: update hv_interrupt_entry Wei Liu
2020-10-01 14:33   ` Rob Herring
2020-09-14 11:59 ` [PATCH RFC v1 13/18] asm-generic/hyperv: introduce hv_device_id and auxiliary structures Wei Liu
2020-09-15 11:16   ` Vitaly Kuznetsov
2020-09-15 11:59     ` Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 14/18] asm-generic/hyperv: import data structures for mapping device interrupts Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 15/18] x86/apic/msi: export pci_msi_get_hwirq Wei Liu
2020-09-14 11:59 ` Wei Liu [this message]
2020-09-14 11:59 ` [PATCH RFC v1 17/18] x86/ioapic: export a few functions and data structures via io_apic.h Wei Liu
2020-09-14 11:59 ` [PATCH RFC v1 18/18] x86/hyperv: handle IO-APIC when running as root Wei Liu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200914115928.83184-8-wei.liu@kernel.org \
    --to=wei.liu@kernel.org \
    --cc=bp@alien8.de \
    --cc=haiyangz@microsoft.com \
    --cc=hpa@zytor.com \
    --cc=kys@microsoft.com \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mikelley@microsoft.com \
    --cc=mingo@redhat.com \
    --cc=nudasnev@microsoft.com \
    --cc=sthemmin@microsoft.com \
    --cc=sunilmut@microsoft.com \
    --cc=tglx@linutronix.de \
    --cc=viremana@linux.microsoft.com \
    --cc=virtualization@lists.linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).