iommu.lists.linux-foundation.org archive mirror
* [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible
       [not found] <20210709114339.3467637-1-wei.liu@kernel.org>
@ 2021-07-09 11:43 ` Wei Liu
  2021-07-09 12:56   ` Robin Murphy
  2021-07-09 11:43 ` [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions Wei Liu
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Wei Liu @ 2021-07-09 11:43 UTC (permalink / raw)
  To: Linux on Hyper-V List
  Cc: Wei Liu, pasha.tatashin, Will Deacon, kumarpraveen,
	David Woodhouse, Linux Kernel List, Michael Kelley,
	open list:INTEL IOMMU VT-d, Nuno Das Neves, Sunil Muthuswamy,
	virtualization, Vineeth Pillai

Microsoft Hypervisor provides a set of hypercalls to manage device
domains. The root kernel should parse the DMAR so that it can program
the IOMMU (with hypercalls) correctly.

The DMAR code was designed to work with Intel IOMMU only. Add two more
parameters to make it useful to Microsoft Hypervisor. Microsoft
Hypervisor does not need the DMAR parsing code to allocate an Intel
IOMMU structure; it also wishes to always reparse the DMAR table even
after it has been parsed before.

Adjust Intel IOMMU code to use the new dmar_table_init. There should be
no functional change to Intel IOMMU code.
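
To illustrate the intended calling convention (a sketch only, not part
of this patch): existing Intel IOMMU callers keep today's behaviour,
while a caller that only wants the parsed DMAR data can skip the Intel
IOMMU allocation and force a reparse, as done later in this series:

	/* Intel IOMMU path: allocate IOMMU structures, parse only once. */
	if (dmar_table_init(true, false))
		goto out_free_dmar;

	/* Microsoft Hypervisor path: no allocation, reparse unconditionally. */
	if (dmar_table_init(false, true))
		pr_err("Failed to initialize DMAR table\n");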

Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
We may be able to combine alloc and force_parse?
---
 drivers/iommu/intel/dmar.c          | 38 ++++++++++++++++++++---------
 drivers/iommu/intel/iommu.c         |  2 +-
 drivers/iommu/intel/irq_remapping.c |  2 +-
 include/linux/dmar.h                |  2 +-
 4 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 84057cb9596c..bd72f47c728b 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -408,7 +408,8 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
  * structure which uniquely represent one DMA remapping hardware unit
  * present in the platform
  */
-static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
+static int dmar_parse_one_drhd_internal(struct acpi_dmar_header *header,
+		void *arg, bool alloc)
 {
 	struct acpi_dmar_hardware_unit *drhd;
 	struct dmar_drhd_unit *dmaru;
@@ -440,12 +441,14 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
 		return -ENOMEM;
 	}
 
-	ret = alloc_iommu(dmaru);
-	if (ret) {
-		dmar_free_dev_scope(&dmaru->devices,
-				    &dmaru->devices_cnt);
-		kfree(dmaru);
-		return ret;
+	if (alloc) {
+		ret = alloc_iommu(dmaru);
+		if (ret) {
+			dmar_free_dev_scope(&dmaru->devices,
+					    &dmaru->devices_cnt);
+			kfree(dmaru);
+			return ret;
+		}
 	}
 	dmar_register_drhd_unit(dmaru);
 
@@ -456,6 +459,16 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
 	return 0;
 }
 
+static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
+{
+	return dmar_parse_one_drhd_internal(header, arg, true);
+}
+
+int dmar_parse_one_drhd_noalloc(struct acpi_dmar_header *header, void *arg)
+{
+	return dmar_parse_one_drhd_internal(header, arg, false);
+}
+
 static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
 {
 	if (dmaru->devices && dmaru->devices_cnt)
@@ -633,7 +646,7 @@ static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
  * parse_dmar_table - parses the DMA reporting table
  */
 static int __init
-parse_dmar_table(void)
+parse_dmar_table(bool alloc)
 {
 	struct acpi_table_dmar *dmar;
 	int drhd_count = 0;
@@ -650,6 +663,9 @@ parse_dmar_table(void)
 		.cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc,
 	};
 
+	if (!alloc)
+		cb.cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd_noalloc;
+
 	/*
 	 * Do it again, earlier dmar_tbl mapping could be mapped with
 	 * fixed map.
@@ -840,13 +856,13 @@ void __init dmar_register_bus_notifier(void)
 }
 
 
-int __init dmar_table_init(void)
+int __init dmar_table_init(bool alloc, bool force_parse)
 {
 	static int dmar_table_initialized;
 	int ret;
 
-	if (dmar_table_initialized == 0) {
-		ret = parse_dmar_table();
+	if (dmar_table_initialized == 0 || force_parse) {
+		ret = parse_dmar_table(alloc);
 		if (ret < 0) {
 			if (ret != -ENODEV)
 				pr_info("Parse DMAR table failure.\n");
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index be35284a2016..a4294d310b93 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4310,7 +4310,7 @@ int __init intel_iommu_init(void)
 	}
 
 	down_write(&dmar_global_lock);
-	if (dmar_table_init()) {
+	if (dmar_table_init(true, false)) {
 		if (force_on)
 			panic("tboot: Failed to initialize DMAR table\n");
 		goto out_free_dmar;
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index f912fe45bea2..0e8abef862e4 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -732,7 +732,7 @@ static int __init intel_prepare_irq_remapping(void)
 		return -ENODEV;
 	}
 
-	if (dmar_table_init() < 0)
+	if (dmar_table_init(true, false) < 0)
 		return -ENODEV;
 
 	if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e04436a7ff27..f88535d41a6e 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -106,7 +106,7 @@ static inline bool dmar_rcu_check(void)
 	for_each_dev_scope((devs), (cnt), (i), (tmp))			\
 		if (!(tmp)) { continue; } else
 
-extern int dmar_table_init(void);
+extern int dmar_table_init(bool alloc, bool force_parse);
 extern int dmar_dev_scope_init(void);
 extern void dmar_register_bus_notifier(void);
 extern int dmar_parse_dev_scope(void *start, void *end, int *cnt,
-- 
2.30.2


* [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions
       [not found] <20210709114339.3467637-1-wei.liu@kernel.org>
  2021-07-09 11:43 ` [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible Wei Liu
@ 2021-07-09 11:43 ` Wei Liu
  2021-07-09 14:17   ` Lu Baolu
  2021-07-09 11:43 ` [RFC v1 5/8] mshv: add paravirtualized IOMMU support Wei Liu
  2021-07-09 11:43 ` [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU Wei Liu
  3 siblings, 1 reply; 18+ messages in thread
From: Wei Liu @ 2021-07-09 11:43 UTC (permalink / raw)
  To: Linux on Hyper-V List
  Cc: Wei Liu, pasha.tatashin, Will Deacon, kumarpraveen,
	David Woodhouse, Linux Kernel List, Michael Kelley,
	open list:INTEL IOMMU VT-d, Nuno Das Neves, Sunil Muthuswamy,
	virtualization, Vineeth Pillai

When Microsoft Hypervisor runs on Intel platforms it needs to know the
reserved regions to program devices correctly. There is no reason to
duplicate intel_iommu_get_resv_regions. Export it.
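
As a sketch of how a consumer outside the Intel driver might use the
export (illustrative only; the hypothetical example_get_resv_regions()
below stands in for the real caller added later in this series):

	#include <linux/intel-iommu.h>

	static void example_get_resv_regions(struct device *dev,
					     struct list_head *head)
	{
		/* Delegate to the Intel implementation on Intel platforms. */
		intel_iommu_get_resv_regions(dev, head);
	}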

Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/iommu/intel/iommu.c | 5 +++--
 include/linux/intel-iommu.h | 4 ++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index a4294d310b93..01973bc20080 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5176,8 +5176,8 @@ static void intel_iommu_probe_finalize(struct device *dev)
 		set_dma_ops(dev, NULL);
 }
 
-static void intel_iommu_get_resv_regions(struct device *device,
-					 struct list_head *head)
+void intel_iommu_get_resv_regions(struct device *device,
+				 struct list_head *head)
 {
 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
 	struct iommu_resv_region *reg;
@@ -5232,6 +5232,7 @@ static void intel_iommu_get_resv_regions(struct device *device,
 		return;
 	list_add_tail(&reg->list, head);
 }
+EXPORT_SYMBOL_GPL(intel_iommu_get_resv_regions);
 
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
 {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 03faf20a6817..f91869f765bc 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -814,6 +814,8 @@ extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
 extern int dmar_disabled;
 extern int intel_iommu_enabled;
 extern int intel_iommu_gfx_mapped;
+extern void intel_iommu_get_resv_regions(struct device *device,
+				 struct list_head *head);
 #else
 static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
 {
@@ -825,6 +827,8 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 }
 #define dmar_disabled	(1)
 #define intel_iommu_enabled (0)
+static inline void intel_iommu_get_resv_regions(struct device *device,
+				 struct list_head *head) {}
 #endif
 
 #endif
-- 
2.30.2


* [RFC v1 5/8] mshv: add paravirtualized IOMMU support
       [not found] <20210709114339.3467637-1-wei.liu@kernel.org>
  2021-07-09 11:43 ` [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible Wei Liu
  2021-07-09 11:43 ` [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions Wei Liu
@ 2021-07-09 11:43 ` Wei Liu
  2021-08-03 18:40   ` Praveen Kumar
  2021-07-09 11:43 ` [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU Wei Liu
  3 siblings, 1 reply; 18+ messages in thread
From: Wei Liu @ 2021-07-09 11:43 UTC (permalink / raw)
  To: Linux on Hyper-V List
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	kumarpraveen, Will Deacon, Dexuan Cui, Linux Kernel List,
	Michael Kelley, open list:IOMMU DRIVERS, Nuno Das Neves,
	Sunil Muthuswamy, virtualization, Vineeth Pillai, Haiyang Zhang

Microsoft Hypervisor provides a set of hypercalls to manage device
domains. Implement a type-1 IOMMU using those hypercalls.

Implement DMA remapping as the first step for this driver. Interrupt
remapping will come in a later stage.
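
For reference, a rough sketch of how the IOMMU core exercises this
driver once it is registered (illustrative only; the hypothetical
example_map_one_page() helper and its error handling are simplified,
and the comments note which hypercall each core call ends up issuing):

	#include <linux/iommu.h>
	#include <linux/pci.h>
	#include <linux/sizes.h>

	/* Hypothetical helper: map and unmap one 4KiB page for a PCI device. */
	static int example_map_one_page(struct pci_dev *pdev,
					unsigned long iova, phys_addr_t paddr)
	{
		struct iommu_domain *dom;
		int ret;

		dom = iommu_domain_alloc(&pci_bus_type);    /* -> hv_iommu_domain_alloc() */
		if (!dom)
			return -ENOMEM;

		ret = iommu_attach_device(dom, &pdev->dev); /* -> HVCALL_ATTACH_DEVICE_DOMAIN */
		if (ret)
			goto out_free;

		ret = iommu_map(dom, iova, paddr, SZ_4K,
				IOMMU_READ | IOMMU_WRITE);  /* -> HVCALL_MAP_DEVICE_GPA_PAGES */

		iommu_unmap(dom, iova, SZ_4K);              /* -> HVCALL_UNMAP_DEVICE_GPA_PAGES */
		iommu_detach_device(dom, &pdev->dev);       /* -> HVCALL_DETACH_DEVICE_DOMAIN */
	out_free:
		iommu_domain_free(dom);                     /* -> HVCALL_DELETE_DEVICE_DOMAIN */
		return ret;
	}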

Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/iommu/Kconfig        |  14 +
 drivers/iommu/hyperv-iommu.c | 628 +++++++++++++++++++++++++++++++++++
 2 files changed, 642 insertions(+)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 1f111b399bca..e230c4536825 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -397,6 +397,20 @@ config HYPERV_IOMMU
 	  Stub IOMMU driver to handle IRQs as to allow Hyper-V Linux
 	  guests to run with x2APIC mode enabled.
 
+config HYPERV_ROOT_PVIOMMU
+	bool "Microsoft Hypervisor paravirtualized IOMMU support"
+	depends on HYPERV && X86
+	select IOMMU_API
+	select IOMMU_DMA
+	select DMA_OPS
+	select IOMMU_IOVA
+	select INTEL_IOMMU
+	select DMAR_TABLE
+	default HYPERV
+	help
+	  A paravirtualized IOMMU for Microsoft Hypervisor. It supports DMA
+	  remapping.
+
 config VIRTIO_IOMMU
 	tristate "Virtio IOMMU driver"
 	depends on VIRTIO
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
index e285a220c913..043dcff06511 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -12,7 +12,12 @@
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/iommu.h>
+#include <linux/dma-iommu.h>
 #include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/interval_tree.h>
+#include <linux/dma-map-ops.h>
+#include <linux/dmar.h>
 
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -21,6 +26,9 @@
 #include <asm/irq_remapping.h>
 #include <asm/hypervisor.h>
 #include <asm/mshyperv.h>
+#include <asm/iommu_table.h>
+
+#include <linux/intel-iommu.h>
 
 #include "irq_remapping.h"
 
@@ -338,3 +346,623 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
 };
 
 #endif
+
+#ifdef CONFIG_HYPERV_ROOT_PVIOMMU
+
+/* DMA remapping support */
+struct hv_iommu_domain {
+	struct iommu_domain domain;
+	struct hv_iommu_dev *hv_iommu;
+
+	struct hv_input_device_domain device_domain;
+
+	spinlock_t mappings_lock;
+	struct rb_root_cached mappings;
+
+	u32 map_flags;
+	u64 pgsize_bitmap;
+};
+
+/* What hardware IOMMU? */
+enum {
+	INVALID = 0,
+	INTEL, /* Only Intel for now */
+} hw_iommu_type = { INVALID };
+
+static struct hv_iommu_domain hv_identity_domain, hv_null_domain;
+
+#define to_hv_iommu_domain(d) \
+	container_of(d, struct hv_iommu_domain, domain)
+
+struct hv_iommu_mapping {
+	phys_addr_t paddr;
+	struct interval_tree_node iova;
+	u32 flags;
+};
+
+struct hv_iommu_dev {
+	struct iommu_device iommu;
+
+	struct ida domain_ids;
+
+	/* Device configuration */
+	struct iommu_domain_geometry geometry;
+	u64 first_domain;
+	u64 last_domain;
+
+	u32 map_flags;
+	u64 pgsize_bitmap;
+};
+
+static struct hv_iommu_dev *hv_iommu_device;
+
+struct hv_iommu_endpoint {
+	struct device *dev;
+	struct hv_iommu_dev *hv_iommu;
+	struct hv_iommu_domain *domain;
+};
+
+static void __init hv_initialize_special_domains(void)
+{
+	struct hv_iommu_domain *domain;
+
+	/* Default passthrough domain */
+	domain = &hv_identity_domain;
+
+	memset(domain, 0, sizeof(*domain));
+
+	domain->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	domain->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	domain->device_domain.domain_id.id = HV_DEVICE_DOMAIN_ID_S2_DEFAULT;
+
+	domain->domain.geometry = hv_iommu_device->geometry;
+
+	/* NULL domain that blocks all DMA transactions */
+	domain = &hv_null_domain;
+
+	memset(domain, 0, sizeof(*domain));
+
+	domain->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	domain->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	domain->device_domain.domain_id.id = HV_DEVICE_DOMAIN_ID_S2_NULL;
+
+	domain->domain.geometry = hv_iommu_device->geometry;
+}
+
+static bool is_identity_domain(struct hv_iommu_domain *d)
+{
+	return d->device_domain.domain_id.id == HV_DEVICE_DOMAIN_ID_S2_DEFAULT;
+}
+
+static bool is_null_domain(struct hv_iommu_domain *d)
+{
+	return d->device_domain.domain_id.id == HV_DEVICE_DOMAIN_ID_S2_NULL;
+}
+
+static struct iommu_domain *hv_iommu_domain_alloc(unsigned type)
+{
+	struct hv_iommu_domain *domain;
+	int ret;
+	u64 status;
+	unsigned long flags;
+	struct hv_input_create_device_domain *input;
+
+	if (type == IOMMU_DOMAIN_IDENTITY)
+		return &hv_identity_domain.domain;
+
+	if (type == IOMMU_DOMAIN_BLOCKED)
+		return &hv_null_domain.domain;
+
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		goto out;
+
+	spin_lock_init(&domain->mappings_lock);
+	domain->mappings = RB_ROOT_CACHED;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+		iommu_get_dma_cookie(&domain->domain)) {
+		goto out_free;
+	}
+
+	ret = ida_alloc_range(&hv_iommu_device->domain_ids,
+			hv_iommu_device->first_domain, hv_iommu_device->last_domain,
+			GFP_KERNEL);
+	if (ret < 0)
+		goto out_put_cookie;
+
+	domain->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	domain->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	domain->device_domain.domain_id.id = ret;
+
+	domain->hv_iommu = hv_iommu_device;
+	domain->map_flags = hv_iommu_device->map_flags;
+
+	local_irq_save(flags);
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain = domain->device_domain;
+
+	input->create_device_domain_flags.forward_progress_required = 1;
+	input->create_device_domain_flags.inherit_owning_vtl = 0;
+
+	status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
+
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status)) {
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+		goto out_free_id;
+	}
+
+	domain->domain.pgsize_bitmap = hv_iommu_device->pgsize_bitmap;
+	domain->domain.geometry = hv_iommu_device->geometry;
+
+	return &domain->domain;
+
+out_free_id:
+	ida_free(&hv_iommu_device->domain_ids, domain->device_domain.domain_id.id);
+out_put_cookie:
+	iommu_put_dma_cookie(&domain->domain);
+out_free:
+	kfree(domain);
+out:
+	return NULL;
+}
+
+static void hv_iommu_domain_free(struct iommu_domain *d)
+{
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+	unsigned long flags;
+	u64 status;
+	struct hv_input_delete_device_domain *input;
+
+	if (is_identity_domain(domain) || is_null_domain(domain))
+		return;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain= domain->device_domain;
+
+	status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input, NULL);
+
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+	ida_free(&domain->hv_iommu->domain_ids, domain->device_domain.domain_id.id);
+
+	iommu_put_dma_cookie(d);
+
+	kfree(domain);
+}
+
+static int hv_iommu_attach_dev(struct iommu_domain *d, struct device *dev)
+{
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+	u64 status;
+	unsigned long flags;
+	struct hv_input_attach_device_domain *input;
+	struct pci_dev *pdev;
+	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
+
+	/* Only allow PCI devices for now */
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	dev_dbg(dev, "Attaching (%strusted) to %d\n", pdev->untrusted ? "un" : "",
+		domain->device_domain.domain_id.id);
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain = domain->device_domain;
+	input->device_id = hv_build_pci_dev_id(pdev);
+
+	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+	else
+		vdev->domain = domain;
+
+	return hv_status_to_errno(status);
+}
+
+static void hv_iommu_detach_dev(struct iommu_domain *d, struct device *dev)
+{
+	u64 status;
+	unsigned long flags;
+	struct hv_input_detach_device_domain *input;
+	struct pci_dev *pdev;
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
+
+	/* See the attach function, only PCI devices for now */
+	if (!dev_is_pci(dev))
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	dev_dbg(dev, "Detaching from %d\n", domain->device_domain.domain_id.id);
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->partition_id = HV_PARTITION_ID_SELF;
+	input->device_id = hv_build_pci_dev_id(pdev);
+
+	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+	vdev->domain = NULL;
+}
+
+static int hv_iommu_add_mapping(struct hv_iommu_domain *domain, unsigned long iova,
+		phys_addr_t paddr, size_t size, u32 flags)
+{
+	unsigned long irqflags;
+	struct hv_iommu_mapping *mapping;
+
+	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
+	if (!mapping)
+		return -ENOMEM;
+
+	mapping->paddr = paddr;
+	mapping->iova.start = iova;
+	mapping->iova.last = iova + size - 1;
+	mapping->flags = flags;
+
+	spin_lock_irqsave(&domain->mappings_lock, irqflags);
+	interval_tree_insert(&mapping->iova, &domain->mappings);
+	spin_unlock_irqrestore(&domain->mappings_lock, irqflags);
+
+	return 0;
+}
+
+static size_t hv_iommu_del_mappings(struct hv_iommu_domain *domain,
+		unsigned long iova, size_t size)
+{
+	unsigned long flags;
+	size_t unmapped = 0;
+	unsigned long last = iova + size - 1;
+	struct hv_iommu_mapping *mapping = NULL;
+	struct interval_tree_node *node, *next;
+
+	spin_lock_irqsave(&domain->mappings_lock, flags);
+	next = interval_tree_iter_first(&domain->mappings, iova, last);
+	while (next) {
+		node = next;
+		mapping = container_of(node, struct hv_iommu_mapping, iova);
+		next = interval_tree_iter_next(node, iova, last);
+
+		/* Trying to split a mapping? Not supported for now. */
+		if (mapping->iova.start < iova)
+			break;
+
+		unmapped += mapping->iova.last - mapping->iova.start + 1;
+
+		interval_tree_remove(node, &domain->mappings);
+		kfree(mapping);
+	}
+	spin_unlock_irqrestore(&domain->mappings_lock, flags);
+
+	return unmapped;
+}
+
+static int hv_iommu_map(struct iommu_domain *d, unsigned long iova,
+			phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+	u32 map_flags;
+	unsigned long flags, pfn, npages;
+	int ret, i;
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+	struct hv_input_map_device_gpa_pages *input;
+	u64 status;
+
+	/* Reject size that's not a whole page */
+	if (size & ~HV_HYP_PAGE_MASK)
+		return -EINVAL;
+
+	map_flags = HV_MAP_GPA_READABLE; /* Always required */
+	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
+
+	ret = hv_iommu_add_mapping(domain, iova, paddr, size, map_flags);
+	if (ret)
+		return ret;
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain = domain->device_domain;
+	input->map_flags = map_flags;
+	input->target_device_va_base = iova;
+
+	pfn = paddr >> HV_HYP_PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		input->gpa_page_list[i] = pfn;
+		pfn += 1;
+	}
+
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
+			input, NULL);
+
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status)) {
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+		hv_iommu_del_mappings(domain, iova, size);
+	}
+
+	return hv_status_to_errno(status);
+}
+
+static size_t hv_iommu_unmap(struct iommu_domain *d, unsigned long iova,
+			   size_t size, struct iommu_iotlb_gather *gather)
+{
+	size_t unmapped;
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+	unsigned long flags, npages;
+	struct hv_input_unmap_device_gpa_pages *input;
+	u64 status;
+
+	unmapped = hv_iommu_del_mappings(domain, iova, size);
+	if (unmapped < size)
+		return 0;
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain = domain->device_domain;
+	input->target_device_va_base = iova;
+
+	/* Unmap `npages` pages starting from VA base */
+	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
+			0, input, NULL);
+
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+	return hv_result_success(status) ? unmapped : 0;
+}
+
+static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *d,
+				       dma_addr_t iova)
+{
+	u64 paddr = 0;
+	unsigned long flags;
+	struct hv_iommu_mapping *mapping;
+	struct interval_tree_node *node;
+	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
+
+	spin_lock_irqsave(&domain->mappings_lock, flags);
+	node = interval_tree_iter_first(&domain->mappings, iova, iova);
+	if (node) {
+		mapping = container_of(node, struct hv_iommu_mapping, iova);
+		paddr = mapping->paddr + (iova - mapping->iova.start);
+	}
+	spin_unlock_irqrestore(&domain->mappings_lock, flags);
+
+	return paddr;
+}
+
+static struct iommu_device *hv_iommu_probe_device(struct device *dev)
+{
+	struct hv_iommu_endpoint *vdev;
+
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return ERR_PTR(-ENOMEM);
+
+	vdev->dev = dev;
+	vdev->hv_iommu = hv_iommu_device;
+	dev_iommu_priv_set(dev, vdev);
+
+	return &vdev->hv_iommu->iommu;
+}
+
+static void hv_iommu_probe_finalize(struct device *dev)
+{
+	struct iommu_domain *d = iommu_get_domain_for_dev(dev);
+
+	if (d && d->type == IOMMU_DOMAIN_DMA)
+		iommu_setup_dma_ops(dev, 1 << PAGE_SHIFT, 0);
+	else
+		set_dma_ops(dev, NULL);
+}
+
+static void hv_iommu_release_device(struct device *dev)
+{
+	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
+
+	/* Need to detach device from device domain if necessary. */
+	if (vdev->domain)
+		hv_iommu_detach_dev(&vdev->domain->domain, dev);
+
+	dev_iommu_priv_set(dev, NULL);
+	set_dma_ops(dev, NULL);
+
+	kfree(vdev);
+}
+
+static struct iommu_group *hv_iommu_device_group(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_device_group(dev);
+	else
+		return generic_device_group(dev);
+}
+
+/*
+ * This bitmap is used to advertise the page sizes our hardware support to the
+ * IOMMU core, which will then use this information to split physically
+ * contiguous memory regions it is mapping into page sizes that we support.
+ *
+ * Given the size of the PFN array we can accommodate less than 512 4KiB pages.
+ */
+#define HV_IOMMU_PGSIZES (SZ_4K | SZ_1M)
+
+static void hv_iommu_get_resv_regions(struct device *dev,
+		struct list_head *head)
+{
+	switch (hw_iommu_type) {
+	case INTEL:
+		intel_iommu_get_resv_regions(dev, head);
+		break;
+	default:
+		/* Do nothing */;
+	}
+}
+
+static int hv_iommu_def_domain_type(struct device *dev)
+{
+	/* The hypervisor has created a default passthrough domain */
+	return IOMMU_DOMAIN_IDENTITY;
+}
+
+static struct iommu_ops hv_iommu_ops = {
+	.domain_alloc     = hv_iommu_domain_alloc,
+	.domain_free      = hv_iommu_domain_free,
+	.attach_dev       = hv_iommu_attach_dev,
+	.detach_dev       = hv_iommu_detach_dev,
+	.map              = hv_iommu_map,
+	.unmap            = hv_iommu_unmap,
+	.iova_to_phys     = hv_iommu_iova_to_phys,
+	.probe_device     = hv_iommu_probe_device,
+	.probe_finalize   = hv_iommu_probe_finalize,
+	.release_device   = hv_iommu_release_device,
+	.def_domain_type  = hv_iommu_def_domain_type,
+	.device_group     = hv_iommu_device_group,
+	.get_resv_regions = hv_iommu_get_resv_regions,
+	.put_resv_regions = generic_iommu_put_resv_regions,
+	.pgsize_bitmap    = HV_IOMMU_PGSIZES,
+	.owner            = THIS_MODULE,
+};
+
+static void __init hv_initialize_resv_regions_intel(void)
+{
+	int ret;
+
+	down_write(&dmar_global_lock);
+	if (dmar_table_init(false, true)) {
+		pr_err("Failed to initialize DMAR table\n");
+		up_write(&dmar_global_lock);
+		return;
+	}
+
+	ret = dmar_dev_scope_init();
+	if (ret)
+		pr_err("Failed to initialize device scope\n");
+
+	up_write(&dmar_global_lock);
+
+	hw_iommu_type = INTEL;
+}
+
+static void __init hv_initialize_resv_regions(void)
+{
+	hv_initialize_resv_regions_intel();
+}
+
+int __init hv_iommu_init(void)
+{
+	int ret = 0;
+	struct hv_iommu_dev *hv_iommu = NULL;
+
+	if (!hv_is_hyperv_initialized())
+		return -ENODEV;
+
+	hv_initialize_resv_regions();
+
+	hv_iommu = kzalloc(sizeof(*hv_iommu), GFP_KERNEL);
+	if (!hv_iommu)
+		return -ENOMEM;
+
+	ida_init(&hv_iommu->domain_ids);
+	hv_iommu->first_domain = HV_DEVICE_DOMAIN_ID_S2_DEFAULT + 1;
+	hv_iommu->last_domain = HV_DEVICE_DOMAIN_ID_S2_NULL - 1;
+
+	hv_iommu->geometry = (struct iommu_domain_geometry) {
+		.aperture_start = 0,
+		.aperture_end = -1UL,
+		.force_aperture = true,
+	};
+
+	hv_iommu->map_flags = IOMMU_READ | IOMMU_WRITE;
+	hv_iommu->pgsize_bitmap = HV_IOMMU_PGSIZES;
+
+	ret = iommu_device_sysfs_add(&hv_iommu->iommu, NULL, NULL, "%s", "hv-iommu");
+	if (ret) {
+		pr_err("iommu_device_sysfs_add failed: %d\n", ret);
+		goto err_free;
+	}
+
+	ret = iommu_device_register(&hv_iommu->iommu, &hv_iommu_ops, NULL);
+	if (ret) {
+		pr_err("iommu_device_register failed: %d\n", ret);
+		goto err_sysfs_remove;
+	}
+
+	/* This must come before bus_set_iommu because it calls into the hooks. */
+	hv_iommu_device = hv_iommu;
+	hv_initialize_special_domains();
+
+#ifdef CONFIG_PCI
+	ret = bus_set_iommu(&pci_bus_type, &hv_iommu_ops);
+	if (ret) {
+		pr_err("bus_set_iommu failed: %d\n", ret);
+		goto err_unregister;
+	}
+#endif
+	pr_info("Microsoft Hypervisor IOMMU initialized\n");
+
+	return 0;
+
+#ifdef CONFIG_PCI
+err_unregister:
+	iommu_device_unregister(&hv_iommu->iommu);
+#endif
+err_sysfs_remove:
+	iommu_device_sysfs_remove(&hv_iommu->iommu);
+err_free:
+	kfree(hv_iommu);
+	return ret;
+}
+
+int __init hv_iommu_detect(void)
+{
+	if (!(ms_hyperv.misc_features & HV_DEVICE_DOMAIN_AVAILABLE))
+		return -ENODEV;
+
+	iommu_detected = 1;
+	x86_init.iommu.iommu_init = hv_iommu_init;
+
+	return 1;
+}
+IOMMU_INIT_POST(hv_iommu_detect);
+
+#endif /* CONFIG_HYPERV_ROOT_PVIOMMU */
-- 
2.30.2


* [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
       [not found] <20210709114339.3467637-1-wei.liu@kernel.org>
                   ` (2 preceding siblings ...)
  2021-07-09 11:43 ` [RFC v1 5/8] mshv: add paravirtualized IOMMU support Wei Liu
@ 2021-07-09 11:43 ` Wei Liu
  2021-07-09 12:46   ` Robin Murphy
  2021-08-03 18:50   ` Praveen Kumar
  3 siblings, 2 replies; 18+ messages in thread
From: Wei Liu @ 2021-07-09 11:43 UTC (permalink / raw)
  To: Linux on Hyper-V List
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	kumarpraveen, Will Deacon, Haiyang Zhang, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai

Some devices may have been claimed by the hypervisor already. One such
example is a NIC that a user has assigned for debugging purposes.

Ideally Linux should be able to retrieve that information, but there is
no way to do that yet, and designing that new mechanism is going to
take time.

Provide a command line option for skipping devices. This is a stopgap
solution, so it is intentionally undocumented. Hopefully we can retire
it in the future.
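
For example (illustrative values only), skipping two devices by
segment:bus:device.function would look like this on the kernel command
line:

	mshv_iommu_skip=(0000:00:1f.6)(0001:02:00.0)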

Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/iommu/hyperv-iommu.c | 45 ++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
index 043dcff06511..353da5036387 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -349,6 +349,16 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
 
 #ifdef CONFIG_HYPERV_ROOT_PVIOMMU
 
+/* The IOMMU will not claim these PCI devices. */
+static char *pci_devs_to_skip;
+static int __init mshv_iommu_setup_skip(char *str) {
+	pci_devs_to_skip = str;
+
+	return 0;
+}
+/* mshv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
+__setup("mshv_iommu_skip=", mshv_iommu_setup_skip);
+
 /* DMA remapping support */
 struct hv_iommu_domain {
 	struct iommu_domain domain;
@@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
 	if (!dev_is_pci(dev))
 		return ERR_PTR(-ENODEV);
 
+	/*
+	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
+	 * temporary solution until we figure out a way to extract information
+	 * from the hypervisor what devices it is already using.
+	 */
+	if (pci_devs_to_skip && *pci_devs_to_skip) {
+		int pos = 0;
+		int parsed;
+		int segment, bus, slot, func;
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		do {
+			parsed = 0;
+
+			sscanf(pci_devs_to_skip + pos,
+				" (%x:%x:%x.%x) %n",
+				&segment, &bus, &slot, &func, &parsed);
+
+			if (parsed <= 0)
+				break;
+
+			if (pci_domain_nr(pdev->bus) == segment &&
+				pdev->bus->number == bus &&
+				PCI_SLOT(pdev->devfn) == slot &&
+				PCI_FUNC(pdev->devfn) == func)
+			{
+				dev_info(dev, "skipped by MSHV IOMMU\n");
+				return ERR_PTR(-ENODEV);
+			}
+
+			pos += parsed;
+
+		} while (pci_devs_to_skip[pos]);
+	}
+
 	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 	if (!vdev)
 		return ERR_PTR(-ENOMEM);
-- 
2.30.2


* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-07-09 11:43 ` [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU Wei Liu
@ 2021-07-09 12:46   ` Robin Murphy
  2021-07-09 13:34     ` Wei Liu
  2021-08-03 18:50   ` Praveen Kumar
  1 sibling, 1 reply; 18+ messages in thread
From: Robin Murphy @ 2021-07-09 12:46 UTC (permalink / raw)
  To: Wei Liu, Linux on Hyper-V List
  Cc: Stephen Hemminger, pasha.tatashin, kumarpraveen, Vineeth Pillai,
	Haiyang Zhang, Dexuan Cui, Linux Kernel List, Michael Kelley,
	open list:IOMMU DRIVERS, Nuno Das Neves, Sunil Muthuswamy,
	K. Y. Srinivasan, Will Deacon, virtualization

On 2021-07-09 12:43, Wei Liu wrote:
> Some devices may have been claimed by the hypervisor already. One such
> example is a NIC that a user has assigned for debugging purposes.
> 
> Ideally Linux should be able to retrieve that information, but there is
> no way to do that yet, and designing that new mechanism is going to
> take time.
> 
> Provide a command line option for skipping devices. This is a stopgap
> solution, so it is intentionally undocumented. Hopefully we can retire
> it in the future.

Huh? If the host is using a device, why the heck is it exposing any 
knowledge of that device to the guest at all, let alone allowing the 
guest to do anything that could affect its operation!?

Robin.

> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> ---
>   drivers/iommu/hyperv-iommu.c | 45 ++++++++++++++++++++++++++++++++++++
>   1 file changed, 45 insertions(+)
> 
> diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
> index 043dcff06511..353da5036387 100644
> --- a/drivers/iommu/hyperv-iommu.c
> +++ b/drivers/iommu/hyperv-iommu.c
> @@ -349,6 +349,16 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
>   
>   #ifdef CONFIG_HYPERV_ROOT_PVIOMMU
>   
> +/* The IOMMU will not claim these PCI devices. */
> +static char *pci_devs_to_skip;
> +static int __init mshv_iommu_setup_skip(char *str) {
> +	pci_devs_to_skip = str;
> +
> +	return 0;
> +}
> +/* mshv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
> +__setup("mshv_iommu_skip=", mshv_iommu_setup_skip);
> +
>   /* DMA remapping support */
>   struct hv_iommu_domain {
>   	struct iommu_domain domain;
> @@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>   	if (!dev_is_pci(dev))
>   		return ERR_PTR(-ENODEV);
>   
> +	/*
> +	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
> +	 * temporary solution until we figure out a way to extract information
> +	 * from the hypervisor what devices it is already using.
> +	 */
> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
> +		int pos = 0;
> +		int parsed;
> +		int segment, bus, slot, func;
> +		struct pci_dev *pdev = to_pci_dev(dev);
> +
> +		do {
> +			parsed = 0;
> +
> +			sscanf(pci_devs_to_skip + pos,
> +				" (%x:%x:%x.%x) %n",
> +				&segment, &bus, &slot, &func, &parsed);
> +
> +			if (parsed <= 0)
> +				break;
> +
> +			if (pci_domain_nr(pdev->bus) == segment &&
> +				pdev->bus->number == bus &&
> +				PCI_SLOT(pdev->devfn) == slot &&
> +				PCI_FUNC(pdev->devfn) == func)
> +			{
> +				dev_info(dev, "skipped by MSHV IOMMU\n");
> +				return ERR_PTR(-ENODEV);
> +			}
> +
> +			pos += parsed;
> +
> +		} while (pci_devs_to_skip[pos]);
> +	}
> +
>   	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
>   	if (!vdev)
>   		return ERR_PTR(-ENOMEM);
> 

* Re: [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible
  2021-07-09 11:43 ` [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible Wei Liu
@ 2021-07-09 12:56   ` Robin Murphy
  2021-07-09 13:42     ` Wei Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Robin Murphy @ 2021-07-09 12:56 UTC (permalink / raw)
  To: Wei Liu, Linux on Hyper-V List
  Cc: Jean-Philippe Brucker, pasha.tatashin, kumarpraveen, Will Deacon,
	Linux Kernel List, Michael Kelley, open list:INTEL IOMMU VT-d,
	Nuno Das Neves, Sunil Muthuswamy, virtualization,
	David Woodhouse, Vineeth Pillai

On 2021-07-09 12:43, Wei Liu wrote:
> Microsoft Hypervisor provides a set of hypercalls to manage device
> domains. The root kernel should parse the DMAR so that it can program
> the IOMMU (with hypercalls) correctly.
> 
> The DMAR code was designed to work with Intel IOMMU only. Add two more
> parameters to make it useful to Microsoft Hypervisor. Microsoft
> Hypervisor does not need the DMAR parsing code to allocate an Intel
> IOMMU structure; it also wishes to always reparse the DMAR table even
> after it has been parsed before.

We've recently defined the VIOT table for describing paravirtualised 
IOMMUs - would it make more sense to extend that to support the 
Microsoft implementation than to abuse a hardware-specific table? Am I 
right in assuming said hypervisor isn't intended to only ever run on 
Intel hardware?

Robin.

> Adjust Intel IOMMU code to use the new dmar_table_init. There should be
> no functional change to Intel IOMMU code.
> 
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> ---
> We may be able to combine alloc and force_parse?
> ---
>   drivers/iommu/intel/dmar.c          | 38 ++++++++++++++++++++---------
>   drivers/iommu/intel/iommu.c         |  2 +-
>   drivers/iommu/intel/irq_remapping.c |  2 +-
>   include/linux/dmar.h                |  2 +-
>   4 files changed, 30 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index 84057cb9596c..bd72f47c728b 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -408,7 +408,8 @@ dmar_find_dmaru(struct acpi_dmar_hardware_unit *drhd)
>    * structure which uniquely represent one DMA remapping hardware unit
>    * present in the platform
>    */
> -static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
> +static int dmar_parse_one_drhd_internal(struct acpi_dmar_header *header,
> +		void *arg, bool alloc)
>   {
>   	struct acpi_dmar_hardware_unit *drhd;
>   	struct dmar_drhd_unit *dmaru;
> @@ -440,12 +441,14 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
>   		return -ENOMEM;
>   	}
>   
> -	ret = alloc_iommu(dmaru);
> -	if (ret) {
> -		dmar_free_dev_scope(&dmaru->devices,
> -				    &dmaru->devices_cnt);
> -		kfree(dmaru);
> -		return ret;
> +	if (alloc) {
> +		ret = alloc_iommu(dmaru);
> +		if (ret) {
> +			dmar_free_dev_scope(&dmaru->devices,
> +					    &dmaru->devices_cnt);
> +			kfree(dmaru);
> +			return ret;
> +		}
>   	}
>   	dmar_register_drhd_unit(dmaru);
>   
> @@ -456,6 +459,16 @@ static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
>   	return 0;
>   }
>   
> +static int dmar_parse_one_drhd(struct acpi_dmar_header *header, void *arg)
> +{
> +	return dmar_parse_one_drhd_internal(header, arg, true);
> +}
> +
> +int dmar_parse_one_drhd_noalloc(struct acpi_dmar_header *header, void *arg)
> +{
> +	return dmar_parse_one_drhd_internal(header, arg, false);
> +}
> +
>   static void dmar_free_drhd(struct dmar_drhd_unit *dmaru)
>   {
>   	if (dmaru->devices && dmaru->devices_cnt)
> @@ -633,7 +646,7 @@ static inline int dmar_walk_dmar_table(struct acpi_table_dmar *dmar,
>    * parse_dmar_table - parses the DMA reporting table
>    */
>   static int __init
> -parse_dmar_table(void)
> +parse_dmar_table(bool alloc)
>   {
>   	struct acpi_table_dmar *dmar;
>   	int drhd_count = 0;
> @@ -650,6 +663,9 @@ parse_dmar_table(void)
>   		.cb[ACPI_DMAR_TYPE_SATC] = &dmar_parse_one_satc,
>   	};
>   
> +	if (!alloc)
> +		cb.cb[ACPI_DMAR_TYPE_HARDWARE_UNIT] = &dmar_parse_one_drhd_noalloc;
> +
>   	/*
>   	 * Do it again, earlier dmar_tbl mapping could be mapped with
>   	 * fixed map.
> @@ -840,13 +856,13 @@ void __init dmar_register_bus_notifier(void)
>   }
>   
>   
> -int __init dmar_table_init(void)
> +int __init dmar_table_init(bool alloc, bool force_parse)
>   {
>   	static int dmar_table_initialized;
>   	int ret;
>   
> -	if (dmar_table_initialized == 0) {
> -		ret = parse_dmar_table();
> +	if (dmar_table_initialized == 0 || force_parse) {
> +		ret = parse_dmar_table(alloc);
>   		if (ret < 0) {
>   			if (ret != -ENODEV)
>   				pr_info("Parse DMAR table failure.\n");
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index be35284a2016..a4294d310b93 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -4310,7 +4310,7 @@ int __init intel_iommu_init(void)
>   	}
>   
>   	down_write(&dmar_global_lock);
> -	if (dmar_table_init()) {
> +	if (dmar_table_init(true, false)) {
>   		if (force_on)
>   			panic("tboot: Failed to initialize DMAR table\n");
>   		goto out_free_dmar;
> diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
> index f912fe45bea2..0e8abef862e4 100644
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -732,7 +732,7 @@ static int __init intel_prepare_irq_remapping(void)
>   		return -ENODEV;
>   	}
>   
> -	if (dmar_table_init() < 0)
> +	if (dmar_table_init(true, false) < 0)
>   		return -ENODEV;
>   
>   	if (intel_cap_audit(CAP_AUDIT_STATIC_IRQR, NULL))
> diff --git a/include/linux/dmar.h b/include/linux/dmar.h
> index e04436a7ff27..f88535d41a6e 100644
> --- a/include/linux/dmar.h
> +++ b/include/linux/dmar.h
> @@ -106,7 +106,7 @@ static inline bool dmar_rcu_check(void)
>   	for_each_dev_scope((devs), (cnt), (i), (tmp))			\
>   		if (!(tmp)) { continue; } else
>   
> -extern int dmar_table_init(void);
> +extern int dmar_table_init(bool alloc, bool force_parse);
>   extern int dmar_dev_scope_init(void);
>   extern void dmar_register_bus_notifier(void);
>   extern int dmar_parse_dev_scope(void *start, void *end, int *cnt,
> 

* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-07-09 12:46   ` Robin Murphy
@ 2021-07-09 13:34     ` Wei Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Wei Liu @ 2021-07-09 13:34 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Wei Liu, Stephen Hemminger, pasha.tatashin, kumarpraveen,
	Vineeth Pillai, Haiyang Zhang, Dexuan Cui, Linux on Hyper-V List,
	Linux Kernel List, open list:IOMMU DRIVERS, Nuno Das Neves,
	Sunil Muthuswamy, K. Y. Srinivasan, Will Deacon, virtualization,
	Michael Kelley

On Fri, Jul 09, 2021 at 01:46:19PM +0100, Robin Murphy wrote:
> On 2021-07-09 12:43, Wei Liu wrote:
> > Some devices may have been claimed by the hypervisor already. One such
> > example is a NIC that a user has assigned for debugging purposes.
> > 
> > Ideally Linux should be able to retrieve that information, but there is
> > no way to do that yet, and designing that new mechanism is going to
> > take time.
> > 
> > Provide a command line option for skipping devices. This is a stopgap
> > solution, so it is intentionally undocumented. Hopefully we can retire
> > it in the future.
> 
> Huh? If the host is using a device, why the heck is it exposing any
> knowledge of that device to the guest at all, let alone allowing the guest
> to do anything that could affect its operation!?

The host in this setup consists of the hypervisor, the root kernel and a
bunch of user space programs.

Root is not an ordinary guest. It does need to know all the hardware to
manage the platform. The hypervisor does not claim more devices than it
needs to, nor does it try to hide hardware details from the root.

The hypervisor can protect itself just fine. Any attempt to use the
already claimed devices will be blocked or rejected, as are attempts to
attach them to device domains.

That, however, leads to some interesting interactions between the
hypervisor and the Linux kernel. When the kernel initializes the IOMMU
during boot, it will try to attach all devices in one go. Any failure
there will cause the kernel to detach the already attached devices.
That's not fatal to the kernel, and is only a minor annoyance for our
current use case, because the default domain is a passthrough domain
anyway. It will become problematic once we switch the default domain to
a DMA domain to further tighten security during Linux boot.

Wei.

> 
> Robin.
> 

* Re: [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible
  2021-07-09 12:56   ` Robin Murphy
@ 2021-07-09 13:42     ` Wei Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Wei Liu @ 2021-07-09 13:42 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Jean-Philippe Brucker, Wei Liu, pasha.tatashin, kumarpraveen,
	Will Deacon, virtualization, Linux on Hyper-V List,
	Linux Kernel List, open list:INTEL IOMMU VT-d, Nuno Das Neves,
	Sunil Muthuswamy, Michael Kelley, David Woodhouse,
	Vineeth Pillai

On Fri, Jul 09, 2021 at 01:56:46PM +0100, Robin Murphy wrote:
> On 2021-07-09 12:43, Wei Liu wrote:
> > Microsoft Hypervisor provides a set of hypercalls to manage device
> > domains. The root kernel should parse the DMAR so that it can program
> > the IOMMU (with hypercalls) correctly.
> > 
> > The DMAR code was designed to work with Intel IOMMU only. Add two more
> > parameters to make it useful to Microsoft Hypervisor. Microsoft
> > Hypervisor does not need the DMAR parsing code to allocate an Intel
> > IOMMU structure; it also wishes to always reparse the DMAR table even
> > after it has been parsed before.
> 
> We've recently defined the VIOT table for describing paravirtualised IOMMUs
> - would it make more sense to extend that to support the Microsoft
> implementation than to abuse a hardware-specific table? Am I right in

I searched for VIOT and believe I found the correct link:
https://lwn.net/Articles/859291/. My understanding is based on my
reading of that series.

VIOT is useful. I think it solves the problem for guests.

It does not solve the problem we have though. The DMAR tables are not
conjured up by some backend software running on the host side. They are
the real tables provided by the firmware. The kernel here is part of the
host setup, dealing with physical hardware.

No matter how much I wish all vendors unified their tables, I don't see
how that's going to happen for readily available servers. :-(

> assuming said hypervisor isn't intended to only ever run on Intel hardware?

Yes, that's correct. We also plan to add support for AMD and ARM64.

Wei.

* Re: [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions
  2021-07-09 11:43 ` [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions Wei Liu
@ 2021-07-09 14:17   ` Lu Baolu
  2021-07-09 14:21     ` Wei Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Lu Baolu @ 2021-07-09 14:17 UTC (permalink / raw)
  To: Wei Liu, Linux on Hyper-V List
  Cc: pasha.tatashin, Will Deacon, kumarpraveen, David Woodhouse,
	Linux Kernel List, virtualization, open list:INTEL IOMMU (VT-d),
	Nuno Das Neves, Sunil Muthuswamy, Michael Kelley, Vineeth Pillai

On 2021/7/9 19:43, Wei Liu wrote:
> When Microsoft Hypervisor runs on Intel platforms it needs to know the
> reserved regions to program devices correctly. There is no reason to
> duplicate intel_iommu_get_resv_regions. Export it.

Why not use iommu_get_resv_regions()?

Best regards,
baolu

> 
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> ---
>   drivers/iommu/intel/iommu.c | 5 +++--
>   include/linux/intel-iommu.h | 4 ++++
>   2 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index a4294d310b93..01973bc20080 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -5176,8 +5176,8 @@ static void intel_iommu_probe_finalize(struct device *dev)
>   		set_dma_ops(dev, NULL);
>   }
>   
> -static void intel_iommu_get_resv_regions(struct device *device,
> -					 struct list_head *head)
> +void intel_iommu_get_resv_regions(struct device *device,
> +				 struct list_head *head)
>   {
>   	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
>   	struct iommu_resv_region *reg;
> @@ -5232,6 +5232,7 @@ static void intel_iommu_get_resv_regions(struct device *device,
>   		return;
>   	list_add_tail(&reg->list, head);
>   }
> +EXPORT_SYMBOL_GPL(intel_iommu_get_resv_regions);
>   
>   int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
>   {
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 03faf20a6817..f91869f765bc 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -814,6 +814,8 @@ extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
>   extern int dmar_disabled;
>   extern int intel_iommu_enabled;
>   extern int intel_iommu_gfx_mapped;
> +extern void intel_iommu_get_resv_regions(struct device *device,
> +				 struct list_head *head);
>   #else
>   static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
>   {
> @@ -825,6 +827,8 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
>   }
>   #define dmar_disabled	(1)
>   #define intel_iommu_enabled (0)
> +static inline void intel_iommu_get_resv_regions(struct device *device,
> +				 struct list_head *head) {}
>   #endif
>   
>   #endif
> 

* Re: [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions
  2021-07-09 14:17   ` Lu Baolu
@ 2021-07-09 14:21     ` Wei Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Wei Liu @ 2021-07-09 14:21 UTC (permalink / raw)
  To: Lu Baolu
  Cc: Wei Liu, pasha.tatashin, Linux on Hyper-V List, kumarpraveen,
	David Woodhouse, Linux Kernel List, open list:INTEL IOMMU (VT-d),
	Michael Kelley, Will Deacon, Nuno Das Neves, Sunil Muthuswamy,
	virtualization, Vineeth Pillai

On Fri, Jul 09, 2021 at 10:17:25PM +0800, Lu Baolu wrote:
> On 2021/7/9 19:43, Wei Liu wrote:
> > When Microsoft Hypervisor runs on Intel platforms it needs to know the
> > reserved regions to program devices correctly. There is no reason to
> > duplicate intel_iommu_get_resv_regions. Export it.
> 
> Why not using iommu_get_resv_regions()?

That calls into ops->get_resv_regions.

In this patch series, get_resv_regions is hv_iommu_get_resv_regions,
which wants to use intel_iommu_get_resv_regions when it detects that
the underlying hardware platform is from Intel.

Wei.

* Re: [RFC v1 5/8] mshv: add paravirtualized IOMMU support
  2021-07-09 11:43 ` [RFC v1 5/8] mshv: add paravirtualized IOMMU support Wei Liu
@ 2021-08-03 18:40   ` Praveen Kumar
  2021-08-03 21:47     ` Wei Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Praveen Kumar @ 2021-08-03 18:40 UTC (permalink / raw)
  To: Wei Liu, Linux on Hyper-V List
  Cc: K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin, Will Deacon,
	Dexuan Cui, Linux Kernel List, Michael Kelley,
	open list:IOMMU DRIVERS, Nuno Das Neves, Sunil Muthuswamy,
	virtualization, Vineeth Pillai, Haiyang Zhang

On 09-07-2021 17:13, Wei Liu wrote:
> +static void hv_iommu_domain_free(struct iommu_domain *d)
> +{
> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> +	unsigned long flags;
> +	u64 status;
> +	struct hv_input_delete_device_domain *input;
> +
> +	if (is_identity_domain(domain) || is_null_domain(domain))
> +		return;
> +
> +	local_irq_save(flags);
> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +	memset(input, 0, sizeof(*input));
> +
> +	input->device_domain= domain->device_domain;
> +
> +	status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input, NULL);
> +
> +	local_irq_restore(flags);
> +
> +	if (!hv_result_success(status))
> +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);

Is it OK to deallocate the resources if the hypercall has failed?
Do we have any specific error code, like EBUSY, that we need to wait upon?

> +
> +	ida_free(&domain->hv_iommu->domain_ids, domain->device_domain.domain_id.id);
> +
> +	iommu_put_dma_cookie(d);
> +
> +	kfree(domain);
> +}
> +
> +static int hv_iommu_attach_dev(struct iommu_domain *d, struct device *dev)
> +{
> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> +	u64 status;
> +	unsigned long flags;
> +	struct hv_input_attach_device_domain *input;
> +	struct pci_dev *pdev;
> +	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +
> +	/* Only allow PCI devices for now */
> +	if (!dev_is_pci(dev))
> +		return -EINVAL;
> +
> +	pdev = to_pci_dev(dev);
> +
> +	dev_dbg(dev, "Attaching (%strusted) to %d\n", pdev->untrusted ? "un" : "",
> +		domain->device_domain.domain_id.id);
> +
> +	local_irq_save(flags);
> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +	memset(input, 0, sizeof(*input));
> +
> +	input->device_domain = domain->device_domain;
> +	input->device_id = hv_build_pci_dev_id(pdev);
> +
> +	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
> +	local_irq_restore(flags);
> +
> +	if (!hv_result_success(status))
> +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);

Does it make sense to set vdev->domain = NULL here?

> +	else
> +		vdev->domain = domain;
> +
> +	return hv_status_to_errno(status);
> +}
> +
> +static void hv_iommu_detach_dev(struct iommu_domain *d, struct device *dev)
> +{
> +	u64 status;
> +	unsigned long flags;
> +	struct hv_input_detach_device_domain *input;
> +	struct pci_dev *pdev;
> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> +	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> +
> +	/* See the attach function, only PCI devices for now */
> +	if (!dev_is_pci(dev))
> +		return;
> +
> +	pdev = to_pci_dev(dev);
> +
> +	dev_dbg(dev, "Detaching from %d\n", domain->device_domain.domain_id.id);
> +
> +	local_irq_save(flags);
> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +	memset(input, 0, sizeof(*input));
> +
> +	input->partition_id = HV_PARTITION_ID_SELF;
> +	input->device_id = hv_build_pci_dev_id(pdev);
> +
> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
> +	local_irq_restore(flags);
> +
> +	if (!hv_result_success(status))
> +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
> +
> +	vdev->domain = NULL;
> +}
> +
> +static int hv_iommu_add_mapping(struct hv_iommu_domain *domain, unsigned long iova,
> +		phys_addr_t paddr, size_t size, u32 flags)
> +{
> +	unsigned long irqflags;
> +	struct hv_iommu_mapping *mapping;
> +
> +	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
> +	if (!mapping)
> +		return -ENOMEM;
> +
> +	mapping->paddr = paddr;
> +	mapping->iova.start = iova;
> +	mapping->iova.last = iova + size - 1;
> +	mapping->flags = flags;
> +
> +	spin_lock_irqsave(&domain->mappings_lock, irqflags);
> +	interval_tree_insert(&mapping->iova, &domain->mappings);
> +	spin_unlock_irqrestore(&domain->mappings_lock, irqflags);
> +
> +	return 0;
> +}
> +
> +static size_t hv_iommu_del_mappings(struct hv_iommu_domain *domain,
> +		unsigned long iova, size_t size)
> +{
> +	unsigned long flags;
> +	size_t unmapped = 0;
> +	unsigned long last = iova + size - 1;
> +	struct hv_iommu_mapping *mapping = NULL;
> +	struct interval_tree_node *node, *next;
> +
> +	spin_lock_irqsave(&domain->mappings_lock, flags);
> +	next = interval_tree_iter_first(&domain->mappings, iova, last);
> +	while (next) {
> +		node = next;
> +		mapping = container_of(node, struct hv_iommu_mapping, iova);
> +		next = interval_tree_iter_next(node, iova, last);
> +
> +		/* Trying to split a mapping? Not supported for now. */
> +		if (mapping->iova.start < iova)
> +			break;
> +
> +		unmapped += mapping->iova.last - mapping->iova.start + 1;
> +
> +		interval_tree_remove(node, &domain->mappings);
> +		kfree(mapping);
> +	}
> +	spin_unlock_irqrestore(&domain->mappings_lock, flags);
> +
> +	return unmapped;
> +}
> +
> +static int hv_iommu_map(struct iommu_domain *d, unsigned long iova,
> +			phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
> +{
> +	u32 map_flags;
> +	unsigned long flags, pfn, npages;
> +	int ret, i;
> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> +	struct hv_input_map_device_gpa_pages *input;
> +	u64 status;
> +
> +	/* Reject size that's not a whole page */
> +	if (size & ~HV_HYP_PAGE_MASK)
> +		return -EINVAL;
> +
> +	map_flags = HV_MAP_GPA_READABLE; /* Always required */
> +	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
> +
> +	ret = hv_iommu_add_mapping(domain, iova, paddr, size, map_flags);
> +	if (ret)
> +		return ret;
> +
> +	npages = size >> HV_HYP_PAGE_SHIFT;
> +
> +	local_irq_save(flags);
> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +	memset(input, 0, sizeof(*input));
> +
> +	input->device_domain = domain->device_domain;
> +	input->map_flags = map_flags;
> +	input->target_device_va_base = iova;
> +
> +	pfn = paddr >> HV_HYP_PAGE_SHIFT;
> +	for (i = 0; i < npages; i++) {
> +		input->gpa_page_list[i] = pfn;
> +		pfn += 1;
> +	}
> +
> +	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
> +			input, NULL);
> +
> +	local_irq_restore(flags);
> +
> +	if (!hv_result_success(status)) {
> +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
> +		hv_iommu_del_mappings(domain, iova, size);
> +	}
> +
> +	return hv_status_to_errno(status);
> +}
> +
> +static size_t hv_iommu_unmap(struct iommu_domain *d, unsigned long iova,
> +			   size_t size, struct iommu_iotlb_gather *gather)
> +{
> +	size_t unmapped;
> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> +	unsigned long flags, npages;
> +	struct hv_input_unmap_device_gpa_pages *input;
> +	u64 status;
> +
> +	unmapped = hv_iommu_del_mappings(domain, iova, size);
> +	if (unmapped < size)
> +		return 0;

Is there a case where unmapped > 0 && unmapped < size ?

> +
> +	npages = size >> HV_HYP_PAGE_SHIFT;
> +
> +	local_irq_save(flags);
> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> +	memset(input, 0, sizeof(*input));
> +
> +	input->device_domain = domain->device_domain;
> +	input->target_device_va_base = iova;
> +
> +	/* Unmap `npages` pages starting from VA base */
> +	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
> +			0, input, NULL);
> +
> +	local_irq_restore(flags);
> +
> +	if (!hv_result_success(status))
> +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
> +
> +	return hv_result_success(status) ? unmapped : 0;
> +}
> +

Regards,

~Praveen.

* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-07-09 11:43 ` [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU Wei Liu
  2021-07-09 12:46   ` Robin Murphy
@ 2021-08-03 18:50   ` Praveen Kumar
  2021-08-03 21:56     ` Wei Liu
  1 sibling, 1 reply; 18+ messages in thread
From: Praveen Kumar @ 2021-08-03 18:50 UTC (permalink / raw)
  To: Wei Liu, Linux on Hyper-V List
  Cc: K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin, Will Deacon,
	Haiyang Zhang, Dexuan Cui, Linux Kernel List, Michael Kelley,
	open list:IOMMU DRIVERS, Nuno Das Neves, Sunil Muthuswamy,
	virtualization, Vineeth Pillai

On 09-07-2021 17:13, Wei Liu wrote:
> Some devices may have been claimed by the hypervisor already. One such
> example is a NIC that the user has assigned for debugging purposes.
> 
> Ideally Linux should be able to retrieve that information, but there is
> no way to do that yet. And designing that new mechanism is going to
> take time.
> 
> Provide a command line option for skipping devices. This is a stopgap
> solution, so it is intentionally undocumented. Hopefully we can retire
> it in the future.
> 
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> ---
>  drivers/iommu/hyperv-iommu.c | 45 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 45 insertions(+)
> 
> diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
> index 043dcff06511..353da5036387 100644
> --- a/drivers/iommu/hyperv-iommu.c
> +++ b/drivers/iommu/hyperv-iommu.c
> @@ -349,6 +349,16 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
>  
>  #ifdef CONFIG_HYPERV_ROOT_PVIOMMU
>  
> +/* The IOMMU will not claim these PCI devices. */
> +static char *pci_devs_to_skip;
> +static int __init mshv_iommu_setup_skip(char *str) {
> +	pci_devs_to_skip = str;
> +
> +	return 0;
> +}
> +/* mshv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
> +__setup("mshv_iommu_skip=", mshv_iommu_setup_skip);
> +
>  /* DMA remapping support */
>  struct hv_iommu_domain {
>  	struct iommu_domain domain;
> @@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>  	if (!dev_is_pci(dev))
>  		return ERR_PTR(-ENODEV);
>  
> +	/*
> +	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
> +	 * temporary solution until we figure out a way to extract information
> +	 * from the hypervisor what devices it is already using.
> +	 */
> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
> +		int pos = 0;
> +		int parsed;
> +		int segment, bus, slot, func;
> +		struct pci_dev *pdev = to_pci_dev(dev);
> +
> +		do {
> +			parsed = 0;
> +
> +			sscanf(pci_devs_to_skip + pos,
> +				" (%x:%x:%x.%x) %n",
> +				&segment, &bus, &slot, &func, &parsed);
> +
> +			if (parsed <= 0)
> +				break;
> +
> +			if (pci_domain_nr(pdev->bus) == segment &&
> +				pdev->bus->number == bus &&
> +				PCI_SLOT(pdev->devfn) == slot &&
> +				PCI_FUNC(pdev->devfn) == func)
> +			{
> +				dev_info(dev, "skipped by MSHV IOMMU\n");
> +				return ERR_PTR(-ENODEV);
> +			}
> +
> +			pos += parsed;
> +
> +		} while (pci_devs_to_skip[pos]);

Is there a possibility of pci_devs_to_skip + pos going past the end of the string and still pointing to valid memory?
I would recommend adding a length check as well before accessing the string contents, just to be safer when accessing memory.

> +	}
> +
>  	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
>  	if (!vdev)
>  		return ERR_PTR(-ENOMEM);
> 

Regards,

~Praveen.

* Re: [RFC v1 5/8] mshv: add paravirtualized IOMMU support
  2021-08-03 18:40   ` Praveen Kumar
@ 2021-08-03 21:47     ` Wei Liu
  2021-08-04  6:43       ` Praveen Kumar
  0 siblings, 1 reply; 18+ messages in thread
From: Wei Liu @ 2021-08-03 21:47 UTC (permalink / raw)
  To: Praveen Kumar
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	Linux on Hyper-V List, Will Deacon, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai,
	Haiyang Zhang

On Wed, Aug 04, 2021 at 12:10:45AM +0530, Praveen Kumar wrote:
> On 09-07-2021 17:13, Wei Liu wrote:
> > +static void hv_iommu_domain_free(struct iommu_domain *d)
> > +{
> > +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> > +	unsigned long flags;
> > +	u64 status;
> > +	struct hv_input_delete_device_domain *input;
> > +
> > +	if (is_identity_domain(domain) || is_null_domain(domain))
> > +		return;
> > +
> > +	local_irq_save(flags);
> > +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > +	memset(input, 0, sizeof(*input));
> > +
> > +	input->device_domain = domain->device_domain;
> > +
> > +	status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input, NULL);
> > +
> > +	local_irq_restore(flags);
> > +
> > +	if (!hv_result_success(status))
> > +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
> 
> Is it OK to deallocate the resources, if hypercall has failed ?

It should be fine. We leak some resources in the hypervisor, but Linux
is in a rather wedged state anyway. Refusing to free up resources in
Linux does not do much good.

> Do we have any specific error code EBUSY (kind of) which we need to wait upon ?
> 

I have not found a circumstance in which that can happen.

> > +
> > +	ida_free(&domain->hv_iommu->domain_ids, domain->device_domain.domain_id.id);
> > +
> > +	iommu_put_dma_cookie(d);
> > +
> > +	kfree(domain);
> > +}
> > +
> > +static int hv_iommu_attach_dev(struct iommu_domain *d, struct device *dev)
> > +{
> > +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> > +	u64 status;
> > +	unsigned long flags;
> > +	struct hv_input_attach_device_domain *input;
> > +	struct pci_dev *pdev;
> > +	struct hv_iommu_endpoint *vdev = dev_iommu_priv_get(dev);
> > +
> > +	/* Only allow PCI devices for now */
> > +	if (!dev_is_pci(dev))
> > +		return -EINVAL;
> > +
> > +	pdev = to_pci_dev(dev);
> > +
> > +	dev_dbg(dev, "Attaching (%strusted) to %d\n", pdev->untrusted ? "un" : "",
> > +		domain->device_domain.domain_id.id);
> > +
> > +	local_irq_save(flags);
> > +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > +	memset(input, 0, sizeof(*input));
> > +
> > +	input->device_domain = domain->device_domain;
> > +	input->device_id = hv_build_pci_dev_id(pdev);
> > +
> > +	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
> > +	local_irq_restore(flags);
> > +
> > +	if (!hv_result_success(status))
> > +		pr_err("%s: hypercall failed, status %lld\n", __func__, status);
> 
> Does it make sense to set vdev->domain = NULL here?
>

It is already NULL -- there is no other code path that sets it and the
detach path always sets the field to NULL.

> > +	else
> > +		vdev->domain = domain;
> > +
> > +	return hv_status_to_errno(status);
> > +}
> > +
[...]
> > +static size_t hv_iommu_unmap(struct iommu_domain *d, unsigned long iova,
> > +			   size_t size, struct iommu_iotlb_gather *gather)
> > +{
> > +	size_t unmapped;
> > +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> > +	unsigned long flags, npages;
> > +	struct hv_input_unmap_device_gpa_pages *input;
> > +	u64 status;
> > +
> > +	unmapped = hv_iommu_del_mappings(domain, iova, size);
> > +	if (unmapped < size)
> > +		return 0;
> 
> Is there a case where unmapped > 0 && unmapped < size ?
> 

There could be such a case -- hv_iommu_del_mappings' return value is >= 0.
For example, if only part of the requested range was actually mapped, the
amount removed will be smaller than size. Is there a problem with this
predicate?

Wei.

* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-08-03 18:50   ` Praveen Kumar
@ 2021-08-03 21:56     ` Wei Liu
  2021-08-04  7:03       ` Praveen Kumar
  0 siblings, 1 reply; 18+ messages in thread
From: Wei Liu @ 2021-08-03 21:56 UTC (permalink / raw)
  To: Praveen Kumar
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	Linux on Hyper-V List, Will Deacon, Haiyang Zhang, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai

On Wed, Aug 04, 2021 at 12:20:42AM +0530, Praveen Kumar wrote:
> On 09-07-2021 17:13, Wei Liu wrote:
> > Some devices may have been claimed by the hypervisor already. One such
> > example is a NIC that the user has assigned for debugging purposes.
> > 
> > Ideally Linux should be able to retrieve that information, but there is
> > no way to do that yet. And designing that new mechanism is going to
> > take time.
> > 
> > Provide a command line option for skipping devices. This is a stopgap
> > solution, so it is intentionally undocumented. Hopefully we can retire
> > it in the future.
> > 
> > Signed-off-by: Wei Liu <wei.liu@kernel.org>
> > ---
> >  drivers/iommu/hyperv-iommu.c | 45 ++++++++++++++++++++++++++++++++++++
> >  1 file changed, 45 insertions(+)
> > 
> > diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
> > index 043dcff06511..353da5036387 100644
> > --- a/drivers/iommu/hyperv-iommu.c
> > +++ b/drivers/iommu/hyperv-iommu.c
> > @@ -349,6 +349,16 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
> >  
> >  #ifdef CONFIG_HYPERV_ROOT_PVIOMMU
> >  
> > +/* The IOMMU will not claim these PCI devices. */
> > +static char *pci_devs_to_skip;
> > +static int __init mshv_iommu_setup_skip(char *str) {
> > +	pci_devs_to_skip = str;
> > +
> > +	return 0;
> > +}
> > +/* mshv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
> > +__setup("mshv_iommu_skip=", mshv_iommu_setup_skip);
> > +
> >  /* DMA remapping support */
> >  struct hv_iommu_domain {
> >  	struct iommu_domain domain;
> > @@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> >  	if (!dev_is_pci(dev))
> >  		return ERR_PTR(-ENODEV);
> >  
> > +	/*
> > +	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
> > +	 * temporary solution until we figure out a way to extract information
> > +	 * from the hypervisor what devices it is already using.
> > +	 */
> > +	if (pci_devs_to_skip && *pci_devs_to_skip) {
> > +		int pos = 0;
> > +		int parsed;
> > +		int segment, bus, slot, func;
> > +		struct pci_dev *pdev = to_pci_dev(dev);
> > +
> > +		do {
> > +			parsed = 0;
> > +
> > +			sscanf(pci_devs_to_skip + pos,
> > +				" (%x:%x:%x.%x) %n",
> > +				&segment, &bus, &slot, &func, &parsed);
> > +
> > +			if (parsed <= 0)
> > +				break;
> > +
> > +			if (pci_domain_nr(pdev->bus) == segment &&
> > +				pdev->bus->number == bus &&
> > +				PCI_SLOT(pdev->devfn) == slot &&
> > +				PCI_FUNC(pdev->devfn) == func)
> > +			{
> > +				dev_info(dev, "skipped by MSHV IOMMU\n");
> > +				return ERR_PTR(-ENODEV);
> > +			}
> > +
> > +			pos += parsed;
> > +
> > +		} while (pci_devs_to_skip[pos]);
> 
> Is there a possibility of pci_devs_to_skip + pos going past the end of
> the string and still pointing to valid memory?

pos should point to the last parsed position. If parsing fails, pos does
not get updated and the code breaks out of the loop. If parsing succeeds,
pos should point to either the start of the next element or '\0' (end of
string). To me this is good enough.
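
Just to illustrate -- this is a standalone userspace sketch with a
made-up string, not part of the patch -- the " (%x:%x:%x.%x) %n"
conversion leaves parsed at 0 when an element is malformed, and
otherwise advances pos to the next "(" or to the terminating '\0':

#include <stdio.h>

int main(void)
{
	/* hypothetical value of mshv_iommu_skip= */
	const char *devs = "(0001:00:02.0)(0002:03:1f.6)";
	int pos = 0, parsed;
	int segment, bus, slot, func;

	do {
		parsed = 0;
		sscanf(devs + pos, " (%x:%x:%x.%x) %n",
		       &segment, &bus, &slot, &func, &parsed);
		if (parsed <= 0)
			break;	/* malformed element, stop parsing */

		printf("skip %04x:%02x:%02x.%x, next pos %d\n",
		       segment, bus, slot, func, pos + parsed);
		pos += parsed;
	} while (devs[pos]);

	return 0;
}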

> I would recommend adding a length check as well before accessing the
> string contents, just to be safer when accessing memory.
> 

What check do you have in mind?

Wei.

> > +	}
> > +
> >  	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
> >  	if (!vdev)
> >  		return ERR_PTR(-ENOMEM);
> > 
> 
> Regards,
> 
> ~Praveen.

* Re: [RFC v1 5/8] mshv: add paravirtualized IOMMU support
  2021-08-03 21:47     ` Wei Liu
@ 2021-08-04  6:43       ` Praveen Kumar
  2021-08-10 10:46         ` Wei Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Praveen Kumar @ 2021-08-04  6:43 UTC (permalink / raw)
  To: Wei Liu
  Cc: Linux on Hyper-V List, K. Y. Srinivasan, Stephen Hemminger,
	pasha.tatashin, Will Deacon, Dexuan Cui, Linux Kernel List,
	Michael Kelley, open list:IOMMU DRIVERS, Nuno Das Neves,
	Sunil Muthuswamy, virtualization, Vineeth Pillai, Haiyang Zhang

On 04-08-2021 03:17, Wei Liu wrote:
>>> +static size_t hv_iommu_unmap(struct iommu_domain *d, unsigned long iova,
>>> +			   size_t size, struct iommu_iotlb_gather *gather)
>>> +{
>>> +	size_t unmapped;
>>> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
>>> +	unsigned long flags, npages;
>>> +	struct hv_input_unmap_device_gpa_pages *input;
>>> +	u64 status;
>>> +
>>> +	unmapped = hv_iommu_del_mappings(domain, iova, size);
>>> +	if (unmapped < size)
>>> +		return 0;
>> Is there a case where unmapped > 0 && unmapped < size ?
>>
> There could be such a case -- hv_iommu_del_mappings' return value is >= 0.
> Is there a problem with this predicate?

As I understand it, if we unmap and return 0, that means nothing was unmapped, and will that not cause corruption or illegal access of the unmapped memory later?
From __iommu_unmap:
...
	while (unmapped < size) {
		size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);

		unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
		if (!unmapped_page)
			break;		<<< we just break here, thinking there is nothing unmapped, but actually hv_iommu_del_mappings has removed some pages.

		pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
			 iova, unmapped_page);

		iova += unmapped_page;
		unmapped += unmapped_page;
	}
...

Am I missing something ?

Regards,

~Praveen.

* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-08-03 21:56     ` Wei Liu
@ 2021-08-04  7:03       ` Praveen Kumar
  2021-08-10 10:04         ` Wei Liu
  0 siblings, 1 reply; 18+ messages in thread
From: Praveen Kumar @ 2021-08-04  7:03 UTC (permalink / raw)
  To: Wei Liu
  Cc: Linux on Hyper-V List, K. Y. Srinivasan, Stephen Hemminger,
	pasha.tatashin, Will Deacon, Haiyang Zhang, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai

On 04-08-2021 03:26, Wei Liu wrote:
>>>  	struct iommu_domain domain;
>>> @@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>>>  	if (!dev_is_pci(dev))
>>>  		return ERR_PTR(-ENODEV);
>>>  
>>> +	/*
>>> +	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
>>> +	 * temporary solution until we figure out a way to extract information
>>> +	 * from the hypervisor what devices it is already using.
>>> +	 */
>>> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
>>> +		int pos = 0;
>>> +		int parsed;
>>> +		int segment, bus, slot, func;
>>> +		struct pci_dev *pdev = to_pci_dev(dev);
>>> +
>>> +		do {
>>> +			parsed = 0;
>>> +
>>> +			sscanf(pci_devs_to_skip + pos,
>>> +				" (%x:%x:%x.%x) %n",
>>> +				&segment, &bus, &slot, &func, &parsed);
>>> +
>>> +			if (parsed <= 0)
>>> +				break;
>>> +
>>> +			if (pci_domain_nr(pdev->bus) == segment &&
>>> +				pdev->bus->number == bus &&
>>> +				PCI_SLOT(pdev->devfn) == slot &&
>>> +				PCI_FUNC(pdev->devfn) == func)
>>> +			{
>>> +				dev_info(dev, "skipped by MSHV IOMMU\n");
>>> +				return ERR_PTR(-ENODEV);
>>> +			}
>>> +
>>> +			pos += parsed;
>>> +
>>> +		} while (pci_devs_to_skip[pos]);
>>
>> Is there a possibility of pci_devs_to_skip + pos going past the end of
>> the string and still pointing to valid memory?
> 
> pos should point to the last parsed position. If parsing fails, pos does
> not get updated and the code breaks out of the loop. If parsing succeeds,
> pos should point to either the start of the next element or '\0' (end of
> string). To me this is good enough.

The point is, hypothetically pci_devs_to_skip + pos could still be a valid address even past the '\0', and thus there is a possibility that parsing does not fail.
Also, sscanf could fault on an illegal access if pci_devs_to_skip[pos] turns out to be neither the terminator nor valid memory.

> 
>> I would recommend adding a length check as well before accessing the
>> string contents, just to be safer when accessing memory.
>>
> 
> What check do you have in mind?

Something like,
size_t len = strlen(pci_devs_to_skip);
do {

	len -= parsed;
} while (len);

OR

do {
...
	pos += parsed;
} while (pos < len);
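
Spelled out a bit more -- just an illustrative, untested sketch reusing
the names from the patch -- the bounded variant could look like:

	if (pci_devs_to_skip && *pci_devs_to_skip) {
		size_t len = strlen(pci_devs_to_skip);
		size_t pos = 0;
		int parsed;
		int segment, bus, slot, func;
		struct pci_dev *pdev = to_pci_dev(dev);

		while (pos < len) {
			parsed = 0;
			sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
			       &segment, &bus, &slot, &func, &parsed);
			if (parsed <= 0)
				break;

			/* compare segment/bus/slot/func against pdev
			 * exactly as in the patch */

			pos += parsed;
		}
	}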

Further, I'm also fine with the existing code if you think this won't break and has already been taken care of. Thanks.

Regards,

~Praveen.

* Re: [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU
  2021-08-04  7:03       ` Praveen Kumar
@ 2021-08-10 10:04         ` Wei Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Wei Liu @ 2021-08-10 10:04 UTC (permalink / raw)
  To: Praveen Kumar
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	Linux on Hyper-V List, Will Deacon, Haiyang Zhang, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai

On Wed, Aug 04, 2021 at 12:33:54PM +0530, Praveen Kumar wrote:
> On 04-08-2021 03:26, Wei Liu wrote:
> >>>  	struct iommu_domain domain;
> >>> @@ -774,6 +784,41 @@ static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> >>>  	if (!dev_is_pci(dev))
> >>>  		return ERR_PTR(-ENODEV);
> >>>  
> >>> +	/*
> >>> +	 * Skip the PCI device specified in `pci_devs_to_skip`. This is a
> >>> +	 * temporary solution until we figure out a way to extract information
> >>> +	 * from the hypervisor what devices it is already using.
> >>> +	 */
> >>> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
> >>> +		int pos = 0;
> >>> +		int parsed;
> >>> +		int segment, bus, slot, func;
> >>> +		struct pci_dev *pdev = to_pci_dev(dev);
> >>> +
> >>> +		do {
> >>> +			parsed = 0;
> >>> +
> >>> +			sscanf(pci_devs_to_skip + pos,
> >>> +				" (%x:%x:%x.%x) %n",
> >>> +				&segment, &bus, &slot, &func, &parsed);
> >>> +
> >>> +			if (parsed <= 0)
> >>> +				break;
> >>> +
> >>> +			if (pci_domain_nr(pdev->bus) == segment &&
> >>> +				pdev->bus->number == bus &&
> >>> +				PCI_SLOT(pdev->devfn) == slot &&
> >>> +				PCI_FUNC(pdev->devfn) == func)
> >>> +			{
> >>> +				dev_info(dev, "skipped by MSHV IOMMU\n");
> >>> +				return ERR_PTR(-ENODEV);
> >>> +			}
> >>> +
> >>> +			pos += parsed;
> >>> +
> >>> +		} while (pci_devs_to_skip[pos]);
> >>
> >> Is there a possibility of pci_devs_to_skip + pos going past the end of
> >> the string and still pointing to valid memory?
> > 
> > pos should point to the last parsed position. If parsing fails, pos does
> > not get updated and the code breaks out of the loop. If parsing succeeds,
> > pos should point to either the start of the next element or '\0' (end of
> > string). To me this is good enough.
> 
> The point is, hypothetically pci_devs_to_skip + pos could still be a
> valid address even past the '\0', and thus there is a possibility that
> parsing does not fail.

Have you found an example of how, at any point in time,
pci_devs_to_skip + pos can point outside of the user-provided string?

> Also, sscanf could fault on an illegal access if pci_devs_to_skip[pos]
> turns out to be neither the terminator nor valid memory.

That depends on whether pci_devs_to_skip + pos can point to an invalid
address in the first place, so that goes back to the question above.

> 
> > 
> >> I would recommend adding a length check as well before accessing the
> >> string contents, just to be safer when accessing memory.
> >>
> > 
> > What check do you have in mind?
> 
> Something like,
> size_t len = strlen(pci_devs_to_skip);
> do {
> 
> 	len -= parsed;
> } while (len);
> 
> OR
> 
> do {
> ...
> 	pos += parsed;
> } while (pos < len);
> 
> Further, I'm also fine with the existing code if you think this won't
> break and has already been taken care of. Thanks.

But in the loop somewhere you will still need to parse pci_devs_to_skip
+ some_offset. The new code structure does not remove that, right?

Given this is for debugging and is supposed to be temporary, I think the
code is good enough. But I want to make sure I have not missed anything.

Wei.

> 
> Regards,
> 
> ~Praveen.

* Re: [RFC v1 5/8] mshv: add paravirtualized IOMMU support
  2021-08-04  6:43       ` Praveen Kumar
@ 2021-08-10 10:46         ` Wei Liu
  0 siblings, 0 replies; 18+ messages in thread
From: Wei Liu @ 2021-08-10 10:46 UTC (permalink / raw)
  To: Praveen Kumar
  Cc: Wei Liu, K. Y. Srinivasan, Stephen Hemminger, pasha.tatashin,
	Linux on Hyper-V List, Will Deacon, Dexuan Cui,
	Linux Kernel List, Michael Kelley, open list:IOMMU DRIVERS,
	Nuno Das Neves, Sunil Muthuswamy, virtualization, Vineeth Pillai,
	Haiyang Zhang

On Wed, Aug 04, 2021 at 12:13:42PM +0530, Praveen Kumar wrote:
> On 04-08-2021 03:17, Wei Liu wrote:
> >>> +static size_t hv_iommu_unmap(struct iommu_domain *d, unsigned long iova,
> >>> +			   size_t size, struct iommu_iotlb_gather *gather)
> >>> +{
> >>> +	size_t unmapped;
> >>> +	struct hv_iommu_domain *domain = to_hv_iommu_domain(d);
> >>> +	unsigned long flags, npages;
> >>> +	struct hv_input_unmap_device_gpa_pages *input;
> >>> +	u64 status;
> >>> +
> >>> +	unmapped = hv_iommu_del_mappings(domain, iova, size);
> >>> +	if (unmapped < size)
> >>> +		return 0;
> >> Is there a case where unmapped > 0 && unmapped < size ?
> >>
> > There could be such a case -- hv_iommu_del_mappings' return value is >= 0.
> > Is there a problem with this predicate?
> 
> As I understand it, if we unmap and return 0, that means nothing was
> unmapped, and will that not cause corruption or illegal access of the
> unmapped memory later? From __iommu_unmap:

Those pages are not really unmapped. The hypercall is skipped.

> ...
> 	while (unmapped < size) {
> 		size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);
> 
> 		unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather);
> 		if (!unmapped_page)
> 			break;		<<< we just break here, thinking there is nothing unmapped, but actually hv_iommu_del_mappings has removed some pages.
> 
> 		pr_debug("unmapped: iova 0x%lx size 0x%zx\n",
> 			 iova, unmapped_page);
> 
> 		iova += unmapped_page;
> 		unmapped += unmapped_page;
> 	}
> ...
> 
> Am I missing something ?
> 
> Regards,
> 
> ~Praveen.

end of thread, other threads:[~2021-08-10 10:47 UTC | newest]

Thread overview: 18+ messages
     [not found] <20210709114339.3467637-1-wei.liu@kernel.org>
2021-07-09 11:43 ` [RFC v1 3/8] intel/vt-d: make DMAR table parsing code more flexible Wei Liu
2021-07-09 12:56   ` Robin Murphy
2021-07-09 13:42     ` Wei Liu
2021-07-09 11:43 ` [RFC v1 4/8] intel/vt-d: export intel_iommu_get_resv_regions Wei Liu
2021-07-09 14:17   ` Lu Baolu
2021-07-09 14:21     ` Wei Liu
2021-07-09 11:43 ` [RFC v1 5/8] mshv: add paravirtualized IOMMU support Wei Liu
2021-08-03 18:40   ` Praveen Kumar
2021-08-03 21:47     ` Wei Liu
2021-08-04  6:43       ` Praveen Kumar
2021-08-10 10:46         ` Wei Liu
2021-07-09 11:43 ` [RFC v1 6/8] mshv: command line option to skip devices in PV-IOMMU Wei Liu
2021-07-09 12:46   ` Robin Murphy
2021-07-09 13:34     ` Wei Liu
2021-08-03 18:50   ` Praveen Kumar
2021-08-03 21:56     ` Wei Liu
2021-08-04  7:03       ` Praveen Kumar
2021-08-10 10:04         ` Wei Liu
