linux-kernel.vger.kernel.org archive mirror
* [PATCH v4 0/6] vfio/type1: Add support for valid iova list management
@ 2018-02-21 12:22 Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 1/6] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
                   ` (5 more replies)
  0 siblings, 6 replies; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This series introduces an iova list associated with a vfio
iommu. The list is kept up to date to reflect the iommu apertures
and reserved regions. The series also adds checks for conflicts
with existing dma mappings whenever a new device group is attached
to the domain.

User-space can retrieve the valid iova ranges using the
VFIO_IOMMU_GET_INFO ioctl capability chains. Any dma map request
outside the valid iova ranges will be rejected.

v3 --> v4
 Addressed comments received for v3.
 -dma_addr_t instead of phys_addr_t
 -LIST_HEAD() usage.
 -Free up iova_copy list in case of error.
 -Updated logic in filling the iova caps info (patch #5).

RFCv2 --> v3
 Removed RFC tag.
 Addressed comments from Alex and Eric:
 - Added comments to make iova list management logic more clear.
 - Use of iova list copy so that original is not altered in
   case of failure.

RFCv1 --> RFCv2
 Addressed comments from Alex:
-Introduced IOVA list management and added checks for conflicts with 
 existing dma map entries during attach/detach.

Shameer Kolothum (6):
  vfio/type1: Introduce iova list and add iommu aperture validity check
  vfio/type1: Check reserve region conflict and update iova list
  vfio/type1: Update iova list on detach
  vfio/type1: check dma map request is within a valid iova range
  vfio/type1: Add IOVA range capability support
  vfio/type1: remove duplicate retrieval of reserved regions.

 drivers/vfio/vfio_iommu_type1.c | 489 +++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/vfio.h       |  23 ++
 2 files changed, 500 insertions(+), 12 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v4 1/6] vfio/type1: Introduce iova list and add iommu aperture validity check
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 2/6] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This introduces an iova list of ranges that are valid for dma
mappings. During attach, make sure the new iommu aperture window
doesn't conflict with the current one or with any existing dma
mappings.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 183 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..ac6772d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
 	struct list_head	domain_list;
+	struct list_head	iova_list;
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
@@ -92,6 +93,12 @@ struct vfio_group {
 	struct list_head	next;
 };
 
+struct vfio_iova {
+	struct list_head	list;
+	dma_addr_t		start;
+	dma_addr_t		end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1192,6 +1199,149 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 	return ret;
 }
 
+/*
+ * This is a helper function to insert an address range into the iova list.
+ * The list starts with a single entry corresponding to the IOMMU
+ * domain geometry to which the device group is attached. The list
+ * aperture gets modified when a new domain is added to the container
+ * if the new aperture doesn't conflict with the current one or with
+ * any existing dma mappings. The list is also modified to exclude
+ * any reserved regions associated with the device group.
+ */
+static int vfio_iommu_iova_insert(struct list_head *head,
+				  dma_addr_t start, dma_addr_t end)
+{
+	struct vfio_iova *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+	region->start = start;
+	region->end = end;
+
+	list_add_tail(&region->list, head);
+	return 0;
+}
+
+/*
+ * Check whether the new iommu aperture conflicts with the existing
+ * aperture or with any existing dma mappings.
+ */
+static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+				     dma_addr_t start, dma_addr_t end)
+{
+	struct vfio_iova *first, *last;
+	struct list_head *iova = &iommu->iova_list;
+
+	if (list_empty(iova))
+		return false;
+
+	/* Disjoint sets, return conflict */
+	first = list_first_entry(iova, struct vfio_iova, list);
+	last = list_last_entry(iova, struct vfio_iova, list);
+	if ((start > last->end) || (end < first->start))
+		return true;
+
+	/* Check for any existing dma mappings outside the new start */
+	if (start > first->start) {
+		if (vfio_find_dma(iommu, first->start, start - first->start))
+			return true;
+	}
+
+	/* Check for any existing dma mappings outside the new end */
+	if (end < last->end) {
+		if (vfio_find_dma(iommu, end + 1, last->end - end))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Resize iommu iova aperture window. This is called only if the new
+ * aperture has no conflict with existing aperture and dma mappings.
+ */
+static int vfio_iommu_aper_resize(struct list_head *iova,
+				      dma_addr_t start,
+				      dma_addr_t end)
+{
+	struct vfio_iova *node, *next;
+
+	if (list_empty(iova))
+		return vfio_iommu_iova_insert(iova, start, end);
+
+	/* Adjust iova list start */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (start < node->start)
+			break;
+		if ((start >= node->start) && (start < node->end)) {
+			node->start = start;
+			break;
+		}
+		/* Delete nodes before new start */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	/* Adjust iova list end */
+	list_for_each_entry_safe(node, next, iova, list) {
+		if (end > node->end)
+			continue;
+		if ((end > node->start) && (end <= node->end)) {
+			node->end = end;
+			continue;
+		}
+		/* Delete nodes after new end */
+		list_del(&node->list);
+		kfree(node);
+	}
+
+	return 0;
+}
+
+static void vfio_iommu_iova_free(struct list_head *iova)
+{
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry_safe(n, next, iova, list) {
+		list_del(&n->list);
+		kfree(n);
+	}
+}
+
+static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
+				    struct list_head *iova_copy)
+{
+
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *n;
+	int ret;
+
+	list_for_each_entry(n, iova, list) {
+		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
+		if (ret)
+			goto out_free;
+	}
+
+	return 0;
+
+out_free:
+	vfio_iommu_iova_free(iova_copy);
+	return ret;
+}
+
+static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
+					struct list_head *iova_copy)
+{
+	struct list_head *iova = &iommu->iova_list;
+
+	vfio_iommu_iova_free(iova);
+
+	list_splice_tail(iova_copy, iova);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 					 struct iommu_group *iommu_group)
 {
@@ -1202,6 +1352,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	int ret;
 	bool resv_msi, msi_remap;
 	phys_addr_t resv_msi_base;
+	struct iommu_domain_geometry geo;
+	LIST_HEAD(iova_copy);
 
 	mutex_lock(&iommu->lock);
 
@@ -1271,6 +1423,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_domain;
 
+	/* Get aperture info */
+	iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
+
+	if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
+					    geo.aperture_end)) {
+		ret = -EINVAL;
+		goto out_detach;
+	}
+
+	/* Get a copy of the current iova list and work on it */
+	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
+	if (ret)
+		goto out_detach;
+
+	ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
+						 geo.aperture_end);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1304,8 +1475,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 				list_add(&group->next, &d->group_list);
 				iommu_domain_free(domain->domain);
 				kfree(domain);
-				mutex_unlock(&iommu->lock);
-				return 0;
+				goto done;
 			}
 
 			ret = iommu_attach_group(domain->domain, iommu_group);
@@ -1328,7 +1498,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	}
 
 	list_add(&domain->next, &iommu->domain_list);
-
+done:
+	/* Delete the old one and insert new iova list */
+	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
 	mutex_unlock(&iommu->lock);
 
 	return 0;
@@ -1337,6 +1509,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	iommu_detach_group(domain->domain, iommu_group);
 out_domain:
 	iommu_domain_free(domain->domain);
+	vfio_iommu_iova_free(&iova_copy);
 out_free:
 	kfree(domain);
 	kfree(group);
@@ -1475,6 +1648,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	}
 
 	INIT_LIST_HEAD(&iommu->domain_list);
+	INIT_LIST_HEAD(&iommu->iova_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
@@ -1517,6 +1691,9 @@ static void vfio_iommu_type1_release(void *iommu_data)
 		list_del(&domain->next);
 		kfree(domain);
 	}
+
+	vfio_iommu_iova_free(&iommu->iova_list);
+
 	kfree(iommu);
 }
 
-- 
2.7.4


* [PATCH v4 2/6] vfio/type1: Check reserve region conflict and update iova list
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 1/6] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 3/6] vfio/type1: Update iova list on detach Shameer Kolothum
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This retrieves the reserved regions associated with the device group
and checks for conflicts with any existing dma mappings. It also
updates the iova list to exclude the reserved regions.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 90 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ac6772d..66c57ee 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1301,6 +1301,82 @@ static int vfio_iommu_aper_resize(struct list_head *iova,
 	return 0;
 }
 
+/*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+				struct list_head *resv_regions)
+{
+	struct iommu_resv_region *region;
+
+	/* Check for conflict with existing dma mappings */
+	list_for_each_entry(region, resv_regions, list) {
+		if (vfio_find_dma(iommu, region->start, region->length))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Check iova region overlap with reserved regions and
+ * exclude them from the iommu iova range
+ */
+static int vfio_iommu_resv_exclude(struct list_head *iova,
+					struct list_head *resv_regions)
+{
+	struct iommu_resv_region *resv;
+	struct vfio_iova *n, *next;
+
+	list_for_each_entry(resv, resv_regions, list) {
+		phys_addr_t start, end;
+
+		start = resv->start;
+		end = resv->start + resv->length - 1;
+
+		list_for_each_entry_safe(n, next, iova, list) {
+			int ret = 0;
+
+			/* No overlap */
+			if ((start > n->end) || (end < n->start))
+				continue;
+			/*
+			 * Insert a new node if current node overlaps with the
+			 * reserved region to exclude that from the valid iova range.
+			 * Note that, new node is inserted before the current
+			 * node and finally the current node is deleted keeping
+			 * the list updated and sorted.
+			 */
+			if (start > n->start)
+				ret = vfio_iommu_iova_insert(&n->list,
+							n->start, start - 1);
+			if (!ret && end < n->end)
+				ret = vfio_iommu_iova_insert(&n->list,
+							end + 1, n->end);
+			if (ret)
+				return ret;
+
+			list_del(&n->list);
+			kfree(n);
+		}
+	}
+
+	if (list_empty(iova))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void vfio_iommu_resv_free(struct list_head *resv_regions)
+{
+	struct iommu_resv_region *n, *next;
+
+	list_for_each_entry_safe(n, next, resv_regions, list) {
+		list_del(&n->list);
+		kfree(n);
+	}
+}
+
 static void vfio_iommu_iova_free(struct list_head *iova)
 {
 	struct vfio_iova *n, *next;
@@ -1354,6 +1430,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	phys_addr_t resv_msi_base;
 	struct iommu_domain_geometry geo;
 	LIST_HEAD(iova_copy);
+	LIST_HEAD(group_resv_regions);
 
 	mutex_lock(&iommu->lock);
 
@@ -1432,6 +1509,13 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_detach;
 	}
 
+	iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+
+	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
+		ret = -EINVAL;
+		goto out_detach;
+	}
+
 	/* Get a copy of the current iova list and work on it */
 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
 	if (ret)
@@ -1442,6 +1526,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
+	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
+	if (ret)
+		goto out_detach;
+
 	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
@@ -1502,6 +1590,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	/* Delete the old one and insert new iova list */
 	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
 	mutex_unlock(&iommu->lock);
+	vfio_iommu_resv_free(&group_resv_regions);
 
 	return 0;
 
@@ -1510,6 +1599,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 out_domain:
 	iommu_domain_free(domain->domain);
 	vfio_iommu_iova_free(&iova_copy);
+	vfio_iommu_resv_free(&group_resv_regions);
 out_free:
 	kfree(domain);
 	kfree(group);
-- 
2.7.4


* [PATCH v4 3/6] vfio/type1: Update iova list on detach
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 1/6] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 2/6] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

Get a copy of the iova list on group detach and try to update it.
On success, replace the current list with the copy. Leave the list
as it is if the update fails.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 91 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 66c57ee..a80884e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1655,12 +1655,88 @@ static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
 	WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain determined the iova aperture window. Recompute
+ * the aperture as the smallest window common to the remaining domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+				   struct list_head *iova_copy)
+{
+	struct vfio_domain *domain;
+	struct iommu_domain_geometry geo;
+	struct vfio_iova *node;
+	dma_addr_t start = 0;
+	dma_addr_t end = (dma_addr_t)~0;
+
+	list_for_each_entry(domain, &iommu->domain_list, next) {
+		iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+				      &geo);
+		if (geo.aperture_start > start)
+			start = geo.aperture_start;
+		if (geo.aperture_end < end)
+			end = geo.aperture_end;
+	}
+
+	/* Modify aperture limits. The new aperture is the same or bigger */
+	node = list_first_entry(iova_copy, struct vfio_iova, list);
+	node->start = start;
+	node = list_last_entry(iova_copy, struct vfio_iova, list);
+	node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+				   struct list_head *iova_copy)
+{
+	struct vfio_domain *d;
+	struct vfio_group *g;
+	struct vfio_iova *node;
+	dma_addr_t start, end;
+	LIST_HEAD(resv_regions);
+	int ret;
+
+	list_for_each_entry(d, &iommu->domain_list, next) {
+		list_for_each_entry(g, &d->group_list, next)
+			iommu_get_group_resv_regions(g->iommu_group,
+							 &resv_regions);
+	}
+
+	if (list_empty(&resv_regions))
+		return 0;
+
+	node = list_first_entry(iova_copy, struct vfio_iova, list);
+	start = node->start;
+	node = list_last_entry(iova_copy, struct vfio_iova, list);
+	end = node->end;
+
+	/* Purge the iova list and create a new one */
+	vfio_iommu_iova_free(iova_copy);
+
+	ret = vfio_iommu_aper_resize(iova_copy, start, end);
+	if (ret)
+		goto done;
+
+	/* Exclude current reserved regions from iova ranges */
+	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
+done:
+	vfio_iommu_resv_free(&resv_regions);
+	return ret;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
 					  struct iommu_group *iommu_group)
 {
 	struct vfio_iommu *iommu = iommu_data;
 	struct vfio_domain *domain;
 	struct vfio_group *group;
+	bool iova_copy_fail;
+	LIST_HEAD(iova_copy);
 
 	mutex_lock(&iommu->lock);
 
@@ -1683,6 +1759,12 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		}
 	}
 
+	/*
+	 * Get a copy of the iova list. On success, use the copy to update
+	 * the list and to replace the current one.
+	 */
+	iova_copy_fail = !!vfio_iommu_iova_get_copy(iommu, &iova_copy);
+
 	list_for_each_entry(domain, &iommu->domain_list, next) {
 		group = find_iommu_group(domain, iommu_group);
 		if (!group)
@@ -1708,10 +1790,19 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
+			if (!iova_copy_fail && !list_empty(&iommu->domain_list))
+				vfio_iommu_aper_expand(iommu, &iova_copy);
 		}
 		break;
 	}
 
+	if (!iova_copy_fail && !list_empty(&iommu->domain_list)) {
+		if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+			vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+		else
+			vfio_iommu_iova_free(&iova_copy);
+	}
+
 detach_group_done:
 	mutex_unlock(&iommu->lock);
 }
-- 
2.7.4


* [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
                   ` (2 preceding siblings ...)
  2018-02-21 12:22 ` [PATCH v4 3/6] vfio/type1: Update iova list on detach Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  2018-02-26 22:05   ` Auger Eric
  2018-02-21 12:22 ` [PATCH v4 5/6] vfio/type1: Add IOVA range capability support Shameer Kolothum
  2018-02-21 12:22 ` [PATCH v4 6/6] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum
  5 siblings, 1 reply; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This checks and rejects any dma map request that falls outside the
valid iova ranges.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a80884e..3049393 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 	return ret;
 }
 
+/*
+ * Check that a dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+				dma_addr_t start, dma_addr_t end)
+{
+	struct list_head *iova = &iommu->iova_list;
+	struct vfio_iova *node;
+
+	list_for_each_entry(node, iova, list) {
+		if ((start >= node->start) && (end <= node->end))
+			return true;
+	}
+
+	return false;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1008,6 +1025,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		goto out_unlock;
 	}
 
+	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
 	if (!dma) {
 		ret = -ENOMEM;
-- 
2.7.4


* [PATCH v4 5/6] vfio/type1: Add IOVA range capability support
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
                   ` (3 preceding siblings ...)
  2018-02-21 12:22 ` [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  2018-02-22 22:54   ` Alex Williamson
  2018-02-21 12:22 ` [PATCH v4 6/6] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum
  5 siblings, 1 reply; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

This allows user-space to retrieve the supported IOVA
range(s), excluding any reserved regions. The implementation
is based on capability chains added to the VFIO_IOMMU_GET_INFO ioctl.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 88 +++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h       | 23 +++++++++++
 2 files changed, 111 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3049393..c08adb5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1917,6 +1917,68 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
 	return ret;
 }
 
+static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+		 size_t size)
+{
+	struct vfio_info_cap_header *header;
+	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+
+	header = vfio_info_cap_add(caps, size,
+				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+	if (IS_ERR(header))
+		return PTR_ERR(header);
+
+	iova_cap = container_of(header,
+			struct vfio_iommu_type1_info_cap_iova_range, header);
+	iova_cap->nr_iovas = cap_iovas->nr_iovas;
+	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+		cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+	return 0;
+}
+
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+				struct vfio_info_cap *caps)
+{
+	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+	struct vfio_iova *iova;
+	size_t size;
+	int iovas = 0, i = 0, ret;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(iova, &iommu->iova_list, list)
+		iovas++;
+
+	if (!iovas) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
+
+	cap_iovas = kzalloc(size, GFP_KERNEL);
+	if (!cap_iovas) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	cap_iovas->nr_iovas = iovas;
+
+	list_for_each_entry(iova, &iommu->iova_list, list) {
+		cap_iovas->iova_ranges[i].start = iova->start;
+		cap_iovas->iova_ranges[i].end = iova->end;
+		i++;
+	}
+
+	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+
+	kfree(cap_iovas);
+out_unlock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
 				   unsigned int cmd, unsigned long arg)
 {
@@ -1938,6 +2000,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		}
 	} else if (cmd == VFIO_IOMMU_GET_INFO) {
 		struct vfio_iommu_type1_info info;
+		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		int ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 
@@ -1951,6 +2015,30 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
+		ret = vfio_iommu_iova_build_caps(iommu, &caps);
+		if (ret)
+			return ret;
+
+		if (caps.size) {
+			info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+			if (info.argsz < sizeof(info) + caps.size) {
+				info.argsz = sizeof(info) + caps.size;
+			} else {
+				vfio_info_cap_shift(&caps, sizeof(info));
+				if (copy_to_user((void __user *)arg +
+						sizeof(info), caps.buf,
+						caps.size)) {
+					kfree(caps.buf);
+					return -EFAULT;
+				}
+				minsz = offsetofend(struct vfio_iommu_type1_info,
+							   cap_offset);
+				info.cap_offset = sizeof(info);
+			}
+
+			kfree(caps.buf);
+		}
 		return copy_to_user((void __user *)arg, &info, minsz) ?
 			-EFAULT : 0;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index c743721..46b49e9 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -589,7 +589,30 @@ struct vfio_iommu_type1_info {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
 	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+	__u32   cap_offset;	/* Offset within info struct of first cap */
+};
+
+/*
+ * The IOVA capability allows reporting the valid IOVA range(s),
+ * excluding any reserved regions associated with the device group. Any
+ * dma map attempt outside the valid iova ranges will return an error.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
+
+struct vfio_iova_range {
+	__u64	start;
+	__u64	end;
+};
+
+struct vfio_iommu_type1_info_cap_iova_range {
+	struct vfio_info_cap_header header;
+	__u32	nr_iovas;
+	__u32	reserved;
+	struct vfio_iova_range iova_ranges[];
 };
 
 #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
-- 
2.7.4


* [PATCH v4 6/6] vfio/type1: remove duplicate retrieval of reserved regions.
  2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
                   ` (4 preceding siblings ...)
  2018-02-21 12:22 ` [PATCH v4 5/6] vfio/type1: Add IOVA range capability support Shameer Kolothum
@ 2018-02-21 12:22 ` Shameer Kolothum
  5 siblings, 0 replies; 28+ messages in thread
From: Shameer Kolothum @ 2018-02-21 12:22 UTC (permalink / raw)
  To: alex.williamson, eric.auger, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Shameer Kolothum

As we already have the reserved regions list, just pass it into
the vfio_iommu_has_sw_msi() function.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c08adb5..56f68b1 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1192,15 +1192,13 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
 	return NULL;
 }
 
-static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
+static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+						phys_addr_t *base)
 {
-	struct list_head group_resv_regions;
-	struct iommu_resv_region *region, *next;
+	struct iommu_resv_region *region;
 	bool ret = false;
 
-	INIT_LIST_HEAD(&group_resv_regions);
-	iommu_get_group_resv_regions(group, &group_resv_regions);
-	list_for_each_entry(region, &group_resv_regions, list) {
+	list_for_each_entry(region, group_resv_regions, list) {
 		/*
 		 * The presence of any 'real' MSI regions should take
 		 * precedence over the software-managed one if the
@@ -1216,8 +1214,7 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
 			ret = true;
 		}
 	}
-	list_for_each_entry_safe(region, next, &group_resv_regions, list)
-		kfree(region);
+
 	return ret;
 }
 
@@ -1552,7 +1549,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	if (ret)
 		goto out_detach;
 
-	resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
+	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
 
 	INIT_LIST_HEAD(&domain->group_list);
 	list_add(&group->next, &domain->group_list);
-- 
2.7.4


* Re: [PATCH v4 5/6] vfio/type1: Add IOVA range capability support
  2018-02-21 12:22 ` [PATCH v4 5/6] vfio/type1: Add IOVA range capability support Shameer Kolothum
@ 2018-02-22 22:54   ` Alex Williamson
  2018-02-23 10:56     ` Shameerali Kolothum Thodi
  0 siblings, 1 reply; 28+ messages in thread
From: Alex Williamson @ 2018-02-22 22:54 UTC (permalink / raw)
  To: Shameer Kolothum
  Cc: eric.auger, pmorel, kvm, linux-kernel, linuxarm, john.garry, xuwei5

On Wed, 21 Feb 2018 12:22:08 +0000
Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:

> This  allows the user-space to retrieve the supported IOVA
> range(s), excluding any reserved regions. The implementation
> is based on capability chains, added to VFIO_IOMMU_GET_INFO ioctl.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 88 +++++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h       | 23 +++++++++++
>  2 files changed, 111 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 3049393..c08adb5 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -1917,6 +1917,68 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
>  	return ret;
>  }
>  
> +static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
> +		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
> +		 size_t size)
> +{
> +	struct vfio_info_cap_header *header;
> +	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
> +
> +	header = vfio_info_cap_add(caps, size,
> +				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> +	if (IS_ERR(header))
> +		return PTR_ERR(header);
> +
> +	iova_cap = container_of(header,
> +			struct vfio_iommu_type1_info_cap_iova_range, header);
> +	iova_cap->nr_iovas = cap_iovas->nr_iovas;
> +	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
> +		cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
> +	return 0;
> +}
> +
> +static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
> +				struct vfio_info_cap *caps)
> +{
> +	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
> +	struct vfio_iova *iova;
> +	size_t size;
> +	int iovas = 0, i = 0, ret;
> +
> +	mutex_lock(&iommu->lock);
> +
> +	list_for_each_entry(iova, &iommu->iova_list, list)
> +		iovas++;
> +
> +	if (!iovas) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
> +	size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
> +
> +	cap_iovas = kzalloc(size, GFP_KERNEL);
> +	if (!cap_iovas) {
> +		ret = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
> +	cap_iovas->nr_iovas = iovas;
> +
> +	list_for_each_entry(iova, &iommu->iova_list, list) {
> +		cap_iovas->iova_ranges[i].start = iova->start;
> +		cap_iovas->iova_ranges[i].end = iova->end;
> +		i++;
> +	}
> +
> +	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
> +
> +	kfree(cap_iovas);
> +out_unlock:
> +	mutex_unlock(&iommu->lock);
> +	return ret;
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  				   unsigned int cmd, unsigned long arg)
>  {
> @@ -1938,6 +2000,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  		}
>  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>  		struct vfio_iommu_type1_info info;
> +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> +		int ret;
>  
>  		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
>  
> @@ -1951,6 +2015,30 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  
>  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
>  
> +		ret = vfio_iommu_iova_build_caps(iommu, &caps);
> +		if (ret)
> +			return ret;
> +
> +		if (caps.size) {
> +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> +
> +			if (info.argsz < sizeof(info) + caps.size) {
> +				info.argsz = sizeof(info) + caps.size;

There's still a corner case here where the user could have provided an
argsz including cap_offset, potentially including uninitialized user
data, and it becomes more error prone not to zero that field for this
case.  I'd suggest this:

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a1ce8853caf1..d9ea69e62c46 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2105,16 +2105,25 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 	} else if (cmd == VFIO_IOMMU_GET_INFO) {
 		struct vfio_iommu_type1_info info;
 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+		unsigned long capsz;
 		int ret;
 
 		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 
+		/* For backward compatibility, cannot require this */
+		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
 		if (copy_from_user(&info, (void __user *)arg, minsz))
 			return -EFAULT;
 
 		if (info.argsz < minsz)
 			return -EINVAL;
 
+		if (info.argsz >= capsz) {
+			minsz = capsz;
+			info.cap_offset = 0; /* output, no-recopy necessary */
+		}
+
 		info.flags = VFIO_IOMMU_INFO_PGSIZES;
 
 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
@@ -2136,13 +2145,12 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 					kfree(caps.buf);
 					return -EFAULT;
 				}
-				minsz = offsetofend(struct vfio_iommu_type1_info,
-							   cap_offset);
 				info.cap_offset = sizeof(info);
 			}
 
 			kfree(caps.buf);
 		}
+
 		return copy_to_user((void __user *)arg, &info, minsz) ?
 			-EFAULT : 0;
 


If you approve and there are no other comments, I can roll that change
into this patch as-is.  Thanks,

Alex

> +			} else {
> +				vfio_info_cap_shift(&caps, sizeof(info));
> +				if (copy_to_user((void __user *)arg +
> +						sizeof(info), caps.buf,
> +						caps.size)) {
> +					kfree(caps.buf);
> +					return -EFAULT;
> +				}
> +				minsz = offsetofend(struct vfio_iommu_type1_info,
> +							   cap_offset);
> +				info.cap_offset = sizeof(info);
> +			}
> +
> +			kfree(caps.buf);
> +		}
>  		return copy_to_user((void __user *)arg, &info, minsz) ?
>  			-EFAULT : 0;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c743721..46b49e9 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -589,7 +589,30 @@ struct vfio_iommu_type1_info {
>  	__u32	argsz;
>  	__u32	flags;
>  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
>  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> +	__u32   cap_offset;	/* Offset within info struct of first cap */
> +};
> +
> +/*
> > + * The IOVA capability allows reporting the valid IOVA range(s),
> > + * excluding any reserved regions associated with the device group. Any
> > + * DMA map attempt outside the valid IOVA ranges will return an error.
> + *
> + * The structures below define version 1 of this capability.
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> +
> +struct vfio_iova_range {
> +	__u64	start;
> +	__u64	end;
> +};
> +
> +struct vfio_iommu_type1_info_cap_iova_range {
> +	struct vfio_info_cap_header header;
> +	__u32	nr_iovas;
> +	__u32	reserved;
> +	struct vfio_iova_range iova_ranges[];
>  };
>  
>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)


* RE: [PATCH v4 5/6] vfio/type1: Add IOVA range capability support
  2018-02-22 22:54   ` Alex Williamson
@ 2018-02-23 10:56     ` Shameerali Kolothum Thodi
  0 siblings, 0 replies; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-02-23 10:56 UTC (permalink / raw)
  To: Alex Williamson
  Cc: eric.auger, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)



> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, February 22, 2018 10:54 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: eric.auger@redhat.com; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [PATCH v4 5/6] vfio/type1: Add IOVA range capability support
> 
> On Wed, 21 Feb 2018 12:22:08 +0000
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> wrote:
> 
> > This allows user-space to retrieve the supported IOVA
> > range(s), excluding any reserved regions. The implementation
> > is based on capability chains, added to VFIO_IOMMU_GET_INFO ioctl.
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 88 +++++++++++++++++++++++++++++++++++++++++
> >  include/uapi/linux/vfio.h       | 23 +++++++++++
> >  2 files changed, 111 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 3049393..c08adb5 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -1917,6 +1917,68 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
> >  	return ret;
> >  }
> >
> > +static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
> > +		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
> > +		 size_t size)
> > +{
> > +	struct vfio_info_cap_header *header;
> > +	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
> > +
> > +	header = vfio_info_cap_add(caps, size,
> > +				VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> > +	if (IS_ERR(header))
> > +		return PTR_ERR(header);
> > +
> > +	iova_cap = container_of(header,
> > +			struct vfio_iommu_type1_info_cap_iova_range, header);
> > +	iova_cap->nr_iovas = cap_iovas->nr_iovas;
> > +	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
> > +		cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
> > +	return 0;
> > +}
> > +
> > +static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
> > +				struct vfio_info_cap *caps)
> > +{
> > +	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
> > +	struct vfio_iova *iova;
> > +	size_t size;
> > +	int iovas = 0, i = 0, ret;
> > +
> > +	mutex_lock(&iommu->lock);
> > +
> > +	list_for_each_entry(iova, &iommu->iova_list, list)
> > +		iovas++;
> > +
> > +	if (!iovas) {
> > +		ret = -EINVAL;
> > +		goto out_unlock;
> > +	}
> > +
> > +	size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
> > +
> > +	cap_iovas = kzalloc(size, GFP_KERNEL);
> > +	if (!cap_iovas) {
> > +		ret = -ENOMEM;
> > +		goto out_unlock;
> > +	}
> > +
> > +	cap_iovas->nr_iovas = iovas;
> > +
> > +	list_for_each_entry(iova, &iommu->iova_list, list) {
> > +		cap_iovas->iova_ranges[i].start = iova->start;
> > +		cap_iovas->iova_ranges[i].end = iova->end;
> > +		i++;
> > +	}
> > +
> > +	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
> > +
> > +	kfree(cap_iovas);
> > +out_unlock:
> > +	mutex_unlock(&iommu->lock);
> > +	return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  				   unsigned int cmd, unsigned long arg)
> >  {
> > @@ -1938,6 +2000,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >  		}
> >  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
> >  		struct vfio_iommu_type1_info info;
> > +		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> > +		int ret;
> >
> >  		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> >
> > @@ -1951,6 +2015,30 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> >
> >  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> >
> > +		ret = vfio_iommu_iova_build_caps(iommu, &caps);
> > +		if (ret)
> > +			return ret;
> > +
> > +		if (caps.size) {
> > +			info.flags |= VFIO_IOMMU_INFO_CAPS;
> > +
> > +			if (info.argsz < sizeof(info) + caps.size) {
> > +				info.argsz = sizeof(info) + caps.size;
> 
> There's still a corner case here where the user could have provided an
> argsz including cap_offset, potentially including uninitialized user
> data, and it becomes more error prone not to zero that field for this
> case.  I'd suggest this:

Yes, it is error prone. Thanks for spotting this.

> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index a1ce8853caf1..d9ea69e62c46 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2105,16 +2105,25 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  	} else if (cmd == VFIO_IOMMU_GET_INFO) {
>  		struct vfio_iommu_type1_info info;
>  		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> +		unsigned long capsz;
>  		int ret;
> 
>  		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
> 
> +		/* For backward compatibility, cannot require this */
> +		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
> +
>  		if (copy_from_user(&info, (void __user *)arg, minsz))
>  			return -EFAULT;
> 
>  		if (info.argsz < minsz)
>  			return -EINVAL;
> 
> +		if (info.argsz >= capsz) {
> +			minsz = capsz;
> +			info.cap_offset = 0; /* output, no-recopy necessary */
> +		}
> +
>  		info.flags = VFIO_IOMMU_INFO_PGSIZES;
> 
>  		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> @@ -2136,13 +2145,12 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  					kfree(caps.buf);
>  					return -EFAULT;
>  				}
> -				minsz = offsetofend(struct vfio_iommu_type1_info,
> -							   cap_offset);
>  				info.cap_offset = sizeof(info);
>  			}
> 
>  			kfree(caps.buf);
>  		}
> +
>  		return copy_to_user((void __user *)arg, &info, minsz) ?
>  			-EFAULT : 0;
> 
> 
> 
> If you approve and there are no other comments, I can roll that change
> into this patch as-is.  Thanks,

That looks fine to me.

Thanks,
Shameer

> 
> > +			} else {
> > +				vfio_info_cap_shift(&caps, sizeof(info));
> > +				if (copy_to_user((void __user *)arg +
> > +						sizeof(info), caps.buf,
> > +						caps.size)) {
> > +					kfree(caps.buf);
> > +					return -EFAULT;
> > +				}
> > +				minsz = offsetofend(struct vfio_iommu_type1_info,
> > +							   cap_offset);
> > +				info.cap_offset = sizeof(info);
> > +			}
> > +
> > +			kfree(caps.buf);
> > +		}
> >  		return copy_to_user((void __user *)arg, &info, minsz) ?
> >  			-EFAULT : 0;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index c743721..46b49e9 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -589,7 +589,30 @@ struct vfio_iommu_type1_info {
> >  	__u32	argsz;
> >  	__u32	flags;
> >  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
> > +#define VFIO_IOMMU_INFO_CAPS	(1 << 1)	/* Info supports caps */
> >  	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
> > +	__u32   cap_offset;	/* Offset within info struct of first cap */
> > +};
> > +
> > +/*
> > > + * The IOVA capability allows reporting the valid IOVA range(s),
> > > + * excluding any reserved regions associated with the device group. Any
> > > + * DMA map attempt outside the valid IOVA ranges will return an error.
> > + *
> > + * The structures below define version 1 of this capability.
> > + */
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> > +
> > +struct vfio_iova_range {
> > +	__u64	start;
> > +	__u64	end;
> > +};
> > +
> > +struct vfio_iommu_type1_info_cap_iova_range {
> > +	struct vfio_info_cap_header header;
> > +	__u32	nr_iovas;
> > +	__u32	reserved;
> > +	struct vfio_iova_range iova_ranges[];
> >  };
> >
> >  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-21 12:22 ` [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
@ 2018-02-26 22:05   ` Auger Eric
  2018-02-26 23:13     ` Alex Williamson
  0 siblings, 1 reply; 28+ messages in thread
From: Auger Eric @ 2018-02-26 22:05 UTC (permalink / raw)
  To: Shameer Kolothum, alex.williamson, pmorel
  Cc: kvm, linux-kernel, linuxarm, john.garry, xuwei5, Robin Murphy

Hi Shameer,

[Adding Robin in CC]
On 21/02/18 13:22, Shameer Kolothum wrote:
> This checks and rejects any dma map request outside valid iova
> range.
> 
> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index a80884e..3049393 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>  	return ret;
>  }
>  
> +/*
> + * Check dma map request is within a valid iova range
> + */
> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> +				dma_addr_t start, dma_addr_t end)
> +{
> +	struct list_head *iova = &iommu->iova_list;
> +	struct vfio_iova *node;
> +
> +	list_for_each_entry(node, iova, list) {
> +		if ((start >= node->start) && (end <= node->end))
> +			return true;
I am now confused by the fact this change will prevent existing QEMU
from working with this series on some platforms. For instance QEMU virt
machine GPA space collides with Seattle PCI host bridge windows. On ARM
the smmu and smmuv3 drivers report the PCI host bridge windows as
reserved regions which does not seem to be the case on other platforms.
The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
(iommu/dma: Make PCI window reservation generic).

For background, we already discussed the topic after LPC 2016. See
https://www.spinics.net/lists/kernel/msg2379607.html.

So is it the right choice to expose PCI host bridge windows as reserved
regions? If yes, shouldn't we distinguish between those and MSI
windows in this series, and not reject user-space DMA_MAP attempts
within PCI host bridge windows?

Thanks

Eric
> +	}
> +
> +	return false;
> +}
> +
>  static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  			   struct vfio_iommu_type1_dma_map *map)
>  {
> @@ -1008,6 +1025,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
>  		goto out_unlock;
>  	}
>  
> +	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
> +		ret = -EINVAL;
> +		goto out_unlock;
> +	}
> +
>  	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
>  	if (!dma) {
>  		ret = -ENOMEM;
> 
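For reference, the containment semantics of the check under discussion can be exercised in isolation. The sketch below is illustrative, not the kernel implementation: a flat array stands in for the kernel's linked iova_list, and the point it demonstrates is that a request must fit entirely within one valid range, so a mapping spanning a reserved hole is rejected even when both endpoints lie in valid ranges:

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative flat stand-in for the kernel's linked list of
 * struct vfio_iova nodes. */
struct iova_range {
	uint64_t start;
	uint64_t end;	/* inclusive, matching the kernel check */
};

/*
 * Mirrors the logic of vfio_iommu_iova_dma_valid() above: the request
 * [start, end] must be fully contained in a single valid range.
 */
static bool dma_request_valid(const struct iova_range *ranges, int n,
			      uint64_t start, uint64_t end)
{
	for (int i = 0; i < n; i++) {
		if (start >= ranges[i].start && end <= ranges[i].end)
			return true;
	}
	return false;
}
```

For example, with valid ranges [0x0, 0xffff] and [0x20000, 0x2ffff], a map of [0x1000, 0x1fff] is accepted while [0xf000, 0x20fff] is rejected because it crosses the reserved hole between the two ranges.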


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-26 22:05   ` Auger Eric
@ 2018-02-26 23:13     ` Alex Williamson
  2018-02-27  8:26       ` Auger Eric
  2018-02-27 12:40       ` Robin Murphy
  0 siblings, 2 replies; 28+ messages in thread
From: Alex Williamson @ 2018-02-26 23:13 UTC (permalink / raw)
  To: Auger Eric
  Cc: Shameer Kolothum, pmorel, kvm, linux-kernel, linuxarm,
	john.garry, xuwei5, Robin Murphy

On Mon, 26 Feb 2018 23:05:43 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Shameer,
> 
> [Adding Robin in CC]
> On 21/02/18 13:22, Shameer Kolothum wrote:
> > This checks and rejects any dma map request outside valid iova
> > range.
> > 
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >  1 file changed, 22 insertions(+)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index a80884e..3049393 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >  	return ret;
> >  }
> >  
> > +/*
> > + * Check dma map request is within a valid iova range
> > + */
> > +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> > +				dma_addr_t start, dma_addr_t end)
> > +{
> > +	struct list_head *iova = &iommu->iova_list;
> > +	struct vfio_iova *node;
> > +
> > +	list_for_each_entry(node, iova, list) {
> > +		if ((start >= node->start) && (end <= node->end))
> > +			return true;  
> I am now confused by the fact this change will prevent existing QEMU
> from working with this series on some platforms. For instance QEMU virt
> machine GPA space collides with Seattle PCI host bridge windows. On ARM
> the smmu and smmuv3 drivers report the PCI host bridge windows as
> reserved regions which does not seem to be the case on other platforms.
> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
> (iommu/dma: Make PCI window reservation generic).
> 
> For background, we already discussed the topic after LPC 2016. See
> https://www.spinics.net/lists/kernel/msg2379607.html.
> 
> So is it the right choice to expose PCI host bridge windows as reserved
> regions? If yes shouldn't we make a difference between those and MSI
> windows in this series and do not reject any user space DMA_MAP attempt
> within PCI host bridge windows.

If the QEMU machine GPA collides with a reserved region today, then
either:

a) The mapping through the IOMMU works and the reserved region is wrong

or

b) The mapping doesn't actually work, QEMU is at risk of data loss by
being told that it worked, and we're justified in changing that
behavior.

Without knowing the specifics of SMMU, it doesn't particularly make
sense to me to mark the entire PCI hierarchy MMIO range as reserved,
unless perhaps the IOMMU is incapable of translating those IOVAs.

Are we trying to prevent untranslated p2p with this reserved range?
That's not necessarily a terrible idea, but it seems that doing it for
that purpose would need to be a lot smarter, taking into account ACS
and precisely selecting ranges within the peer address space that would
be untranslated.  Perhaps only populated MMIO within non-ACS
hierarchies.  Thanks,

Alex


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-26 23:13     ` Alex Williamson
@ 2018-02-27  8:26       ` Auger Eric
  2018-02-27  9:57         ` Shameerali Kolothum Thodi
  2018-02-27 16:57         ` Alex Williamson
  2018-02-27 12:40       ` Robin Murphy
  1 sibling, 2 replies; 28+ messages in thread
From: Auger Eric @ 2018-02-27  8:26 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Shameer Kolothum, pmorel, kvm, linux-kernel, linuxarm,
	john.garry, xuwei5, Robin Murphy

Hi,
On 27/02/18 00:13, Alex Williamson wrote:
> On Mon, 26 Feb 2018 23:05:43 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Shameer,
>>
>> [Adding Robin in CC]
>> On 21/02/18 13:22, Shameer Kolothum wrote:
>>> This checks and rejects any dma map request outside valid iova
>>> range.
>>>
>>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>>> ---
>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>>>  1 file changed, 22 insertions(+)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index a80884e..3049393 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>>>  	return ret;
>>>  }
>>>  
>>> +/*
>>> + * Check dma map request is within a valid iova range
>>> + */
>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>>> +				dma_addr_t start, dma_addr_t end)
>>> +{
>>> +	struct list_head *iova = &iommu->iova_list;
>>> +	struct vfio_iova *node;
>>> +
>>> +	list_for_each_entry(node, iova, list) {
>>> +		if ((start >= node->start) && (end <= node->end))
>>> +			return true;  
>> I am now confused by the fact this change will prevent existing QEMU
>> from working with this series on some platforms. For instance QEMU virt
>> machine GPA space collides with Seattle PCI host bridge windows. On ARM
>> the smmu and smmuv3 drivers report the PCI host bridge windows as
>> reserved regions which does not seem to be the case on other platforms.
>> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
>> (iommu/dma: Make PCI window reservation generic).
>>
>> For background, we already discussed the topic after LPC 2016. See
>> https://www.spinics.net/lists/kernel/msg2379607.html.
>>
>> So is it the right choice to expose PCI host bridge windows as reserved
>> regions? If yes shouldn't we make a difference between those and MSI
>> windows in this series and do not reject any user space DMA_MAP attempt
>> within PCI host bridge windows.
> 
> If the QEMU machine GPA collides with a reserved region today, then
> either:
> 
> a) The mapping through the IOMMU works and the reserved region is wrong
> 
> or
> 
> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> being told that it worked, and we're justified in changing that
> behavior.
> 
> Without knowing the specifics of SMMU, it doesn't particularly make
> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> unless perhaps the IOMMU is incapable of translating those IOVAs.
To me the limitation does not come from the SMMU itself, which is a
separate HW block sitting between the root complex and the interconnect.
If ACS is not enforced by the PCIe subsystem, the transaction will never
reach the IOMMU.

In the case of such an overlap, shouldn't we just warn the end-user that
this situation is dangerous instead of forbidding a use case which
worked "in most cases" until now?

> Are we trying to prevent untranslated p2p with this reserved range?
> That's not necessarily a terrible idea, but it seems that doing it for
> that purpose would need to be a lot smarter, taking into account ACS
> and precisely selecting ranges within the peer address space that would
> be untranslated.  Perhaps only populated MMIO within non-ACS
> hierarchies.  Thanks,

Indeed taking into account the ACS capability would refine the
situations where a risk exists.

Thanks

Eric
> 
> Alex
> 


* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-27  8:26       ` Auger Eric
@ 2018-02-27  9:57         ` Shameerali Kolothum Thodi
  2018-02-27 17:13           ` Alex Williamson
  2018-02-28  9:02           ` Auger Eric
  2018-02-27 16:57         ` Alex Williamson
  1 sibling, 2 replies; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-02-27  9:57 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy



> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Tuesday, February 27, 2018 8:27 AM
> To: Alex Williamson <alex.williamson@redhat.com>
> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> <robin.murphy@arm.com>
> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> iova range
> 
> Hi,
> On 27/02/18 00:13, Alex Williamson wrote:
> > On Mon, 26 Feb 2018 23:05:43 +0100
> > Auger Eric <eric.auger@redhat.com> wrote:
> >
> >> Hi Shameer,
> >>
> >> [Adding Robin in CC]
> >> On 21/02/18 13:22, Shameer Kolothum wrote:
> >>> This checks and rejects any dma map request outside valid iova
> >>> range.
> >>>
> >>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> >>> ---
> >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >>>  1 file changed, 22 insertions(+)
> >>>
> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>> index a80884e..3049393 100644
> >>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >>>  	return ret;
> >>>  }
> >>>
> >>> +/*
> >>> + * Check dma map request is within a valid iova range
> >>> + */
> >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> >>> +				dma_addr_t start, dma_addr_t end)
> >>> +{
> >>> +	struct list_head *iova = &iommu->iova_list;
> >>> +	struct vfio_iova *node;
> >>> +
> >>> +	list_for_each_entry(node, iova, list) {
> >>> +		if ((start >= node->start) && (end <= node->end))
> >>> +			return true;
> >> I am now confused by the fact this change will prevent existing QEMU
> >> from working with this series on some platforms. For instance QEMU virt
> >> machine GPA space collides with Seattle PCI host bridge windows. On ARM
> >> the smmu and smmuv3 drivers report the PCI host bridge windows as
> >> reserved regions which does not seem to be the case on other platforms.
> >> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
> >> (iommu/dma: Make PCI window reservation generic).
> >>
> >> For background, we already discussed the topic after LPC 2016. See
> >> https://www.spinics.net/lists/kernel/msg2379607.html.
> >>
> >> So is it the right choice to expose PCI host bridge windows as reserved
> >> regions? If yes shouldn't we make a difference between those and MSI
> >> windows in this series and do not reject any user space DMA_MAP attempt
> >> within PCI host bridge windows.
> >
> > If the QEMU machine GPA collides with a reserved region today, then
> > either:
> >
> > a) The mapping through the IOMMU works and the reserved region is wrong
> >
> > or
> >
> > b) The mapping doesn't actually work, QEMU is at risk of data loss by
> > being told that it worked, and we're justified in changing that
> > behavior.
> >
> > Without knowing the specifics of SMMU, it doesn't particularly make
> > sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> > unless perhaps the IOMMU is incapable of translating those IOVAs.
> to me the limitation does not come from the smmu itself, which is a
> separate HW block sitting between the root complex and the interconnect.
> If ACS is not enforced by the PCIe subsystem, the transaction will never
> reach the IOMMU.

True. And we do have one such platform where ACS is not enforced, but
reserving the regions and possibly creating holes while launching the VM
will make it secure. But I do wonder how we will solve the device grouping
in such cases.

Does the Seattle PCI host bridge windows case you mentioned have any PCI
quirk claiming that they support ACS?
 
> In the case of such overlap, shouldn't we just warn the end-user that
> this situation is dangerous instead of forbidding the use case which
> worked "in most cases" until now.

Yes, maybe something similar to the allow_unsafe_interrupts case, if
that is acceptable.

Thanks,
Shameer
 
> > Are we trying to prevent untranslated p2p with this reserved range?
> > That's not necessarily a terrible idea, but it seems that doing it for
> > that purpose would need to be a lot smarter, taking into account ACS
> > and precisely selecting ranges within the peer address space that would
> > be untranslated.  Perhaps only populated MMIO within non-ACS
> > hierarchies.  Thanks,
> 
> Indeed taking into account the ACS capability would refine the
> situations where a risk exists.
> 
> Thanks
> 
> Eric
> >
> > Alex
> >


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-26 23:13     ` Alex Williamson
  2018-02-27  8:26       ` Auger Eric
@ 2018-02-27 12:40       ` Robin Murphy
  1 sibling, 0 replies; 28+ messages in thread
From: Robin Murphy @ 2018-02-27 12:40 UTC (permalink / raw)
  To: Alex Williamson, Auger Eric
  Cc: Shameer Kolothum, pmorel, kvm, linux-kernel, linuxarm,
	john.garry, xuwei5

Hi Alex,

On 26/02/18 23:13, Alex Williamson wrote:
> On Mon, 26 Feb 2018 23:05:43 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
>> Hi Shameer,
>>
>> [Adding Robin in CC]
>> On 21/02/18 13:22, Shameer Kolothum wrote:
>>> This checks and rejects any dma map request outside valid iova
>>> range.
>>>
>>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>>> ---
>>>   drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>>>   1 file changed, 22 insertions(+)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index a80884e..3049393 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
>>>   	return ret;
>>>   }
>>>   
>>> +/*
>>> + * Check dma map request is within a valid iova range
>>> + */
>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>>> +				dma_addr_t start, dma_addr_t end)
>>> +{
>>> +	struct list_head *iova = &iommu->iova_list;
>>> +	struct vfio_iova *node;
>>> +
>>> +	list_for_each_entry(node, iova, list) {
>>> +		if ((start >= node->start) && (end <= node->end))
>>> +			return true;
>> I am now confused by the fact this change will prevent existing QEMU
>> from working with this series on some platforms. For instance QEMU virt
>> machine GPA space collides with Seattle PCI host bridge windows. On ARM
>> the smmu and smmuv3 drivers report the PCI host bridge windows as
>> reserved regions which does not seem to be the case on other platforms.
>> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
>> (iommu/dma: Make PCI window reservation generic).
>>
>> For background, we already discussed the topic after LPC 2016. See
>> https://www.spinics.net/lists/kernel/msg2379607.html.
>>
>> So is it the right choice to expose PCI host bridge windows as reserved
>> regions? If yes shouldn't we make a difference between those and MSI
>> windows in this series and do not reject any user space DMA_MAP attempt
>> within PCI host bridge windows.
> 
> If the QEMU machine GPA collides with a reserved region today, then
> either:
> 
> a) The mapping through the IOMMU works and the reserved region is wrong
> 
> or
> 
> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> being told that it worked, and we're justified in changing that
> behavior.
> 
> Without knowing the specifics of SMMU, it doesn't particularly make
> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> unless perhaps the IOMMU is incapable of translating those IOVAs.

Yes, the problem in general is that the SMMU *might* be incapable of 
making any use of IOVAs shadowed by PCI p2p addresses, depending on the 
behaviour and/or integration of the root complex/host bridge.

By way of example, the machine on which I developed iommu-dma (Arm Juno) 
has a PCIe RC which doesn't support p2p at all, and happens to have its 
32-bit bridge window placed exactly where qemu-arm starts guest RAM - I 
can plug in a graphics card which claims a nice big BAR from that 
window, assign the whole PCI group to a VM, and watch the NIC blow up in 
the guest as its (IPA) DMA addresses cause the RC to send an abort back 
to the endpoint before the transactions ever get out to the SMMU.

The SMMU itself doesn't know anything about this (e.g. it's the exact 
same IP block as used in AMD Seattle, where AFAIK the AMD root complex 
doesn't suffer the same issue).

> Are we trying to prevent untranslated p2p with this reserved range?
> That's not necessarily a terrible idea, but it seems that doing it for
> that purpose would need to be a lot smarter, taking into account ACS
> and precisely selecting ranges within the peer address space that would
> be untranslated.  Perhaps only populated MMIO within non-ACS
> hierarchies.  Thanks,

The current code is just playing it as safe as it can by always 
reserving the full windows - IIRC there was some debate about whether 
the "only populated MMIO" approach as used by the x86 IOMMUs is really 
safe, given that hotplug and/or BAR reassignment could happen after the 
domain is created. I agree there probably is room to be a bit cleverer 
with respect to ACS, though.

Robin.
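
For reference, the containment logic in the hunk quoted at the top of this message can be sketched outside the kernel. This is a simplified model with hypothetical ranges, not the kernel code itself: the kernel walks a linked list of vfio_iova nodes and accepts a request only if it fits entirely within a single node, so a request spanning a reserved hole is rejected even when both of its ends fall in valid ranges.

```python
def iova_dma_valid(iova_list, start, end):
    """Mirror of the quoted check: [start, end] must lie entirely
    within one valid-iova node; a span crossing a hole fails."""
    for node_start, node_end in iova_list:
        if start >= node_start and end <= node_end:
            return True
    return False

# Hypothetical layout: one reserved window splits the aperture in two.
valid = [(0x0, 0x7FFFFFF), (0x9000000, 0xFFFFFFFF)]
print(iova_dma_valid(valid, 0x1000, 0x2000))        # -> True  (inside a node)
print(iova_dma_valid(valid, 0x8100000, 0x8200000))  # -> False (inside the hole)
print(iova_dma_valid(valid, 0x7FF0000, 0x9100000))  # -> False (spans the hole)
```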

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-27  8:26       ` Auger Eric
  2018-02-27  9:57         ` Shameerali Kolothum Thodi
@ 2018-02-27 16:57         ` Alex Williamson
  1 sibling, 0 replies; 28+ messages in thread
From: Alex Williamson @ 2018-02-27 16:57 UTC (permalink / raw)
  To: Auger Eric
  Cc: Shameer Kolothum, pmorel, kvm, linux-kernel, linuxarm,
	john.garry, xuwei5, Robin Murphy

On Tue, 27 Feb 2018 09:26:37 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi,
> On 27/02/18 00:13, Alex Williamson wrote:
> > On Mon, 26 Feb 2018 23:05:43 +0100
> > Auger Eric <eric.auger@redhat.com> wrote:
> >   
> >> Hi Shameer,
> >>
> >> [Adding Robin in CC]
> >> On 21/02/18 13:22, Shameer Kolothum wrote:  
> >>> This checks and rejects any dma map request outside valid iova
> >>> range.
> >>>
> >>> Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> >>> ---
> >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >>>  1 file changed, 22 insertions(+)
> >>>
> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>> index a80884e..3049393 100644
> >>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >>>  	return ret;
> >>>  }
> >>>  
> >>> +/*
> >>> + * Check dma map request is within a valid iova range
> >>> + */
> >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> >>> +				dma_addr_t start, dma_addr_t end)
> >>> +{
> >>> +	struct list_head *iova = &iommu->iova_list;
> >>> +	struct vfio_iova *node;
> >>> +
> >>> +	list_for_each_entry(node, iova, list) {
> >>> +		if ((start >= node->start) && (end <= node->end))
> >>> +			return true;    
> >> I am now confused by the fact this change will prevent existing QEMU
> >> from working with this series on some platforms. For instance QEMU virt
> >> machine GPA space collides with Seattle PCI host bridge windows. On ARM
> >> the smmu and smmuv3 drivers report the PCI host bridge windows as
> >> reserved regions which does not seem to be the case on other platforms.
> >> The change happened in commit 273df9635385b2156851c7ee49f40658d7bcb29d
> >> (iommu/dma: Make PCI window reservation generic).
> >>
> >> For background, we already discussed the topic after LPC 2016. See
> >> https://www.spinics.net/lists/kernel/msg2379607.html.
> >>
> >> So is it the right choice to expose PCI host bridge windows as reserved
> >> regions? If yes, shouldn't we distinguish between those and MSI windows
> >> in this series, and not reject user-space DMA_MAP attempts within PCI
> >> host bridge windows?
> > 
> > If the QEMU machine GPA collides with a reserved region today, then
> > either:
> > 
> > a) The mapping through the IOMMU works and the reserved region is wrong
> > 
> > or
> > 
> > b) The mapping doesn't actually work, QEMU is at risk of data loss by
> > being told that it worked, and we're justified in changing that
> > behavior.
> > 
> > Without knowing the specifics of SMMU, it doesn't particularly make
> > sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> > unless perhaps the IOMMU is incapable of translating those IOVAs.  
> to me the limitation does not come from the smmu itself, which is a
> separate HW block sitting between the root complex and the interconnect.
> If ACS is not enforced by the PCIe subsystem, the transaction will never
> reach the IOMMU.

If the limitation is not from the SMMU, then why is it being exposed
via the IOMMU API?  This seems like overreach, trying to compensate for
a limitation elsewhere by imposing a restriction at the IOMMU.

> In the case of such overlap, shouldn't we just warn the end-user that
> this situation is dangerous instead of forbidding the use case which
> worked "in most cases" until now.

How do we distinguish between reserved ranges that are really reserved
and those that are only an advisory?  This seems like it defeats the
whole purpose of the reserved ranges.  Furthermore, if our vfio IOVA
list to the user is only advisory, what's the point?

Peer-to-peer MMIO within an IOMMU group is a tough problem, and one
that we've mostly ignored as we strive towards singleton IOMMU groups,
which are more the normal case on "enterprise" x86 hardware.  The user
does have some ability to determine potential conflicts, so I don't
necessarily see this as exclusively a kernel issue to solve.  However,
if the user needs to account for potentially conflicting MMIO outside of
the IOMMU group which they're provided, then yeah, we have a bigger
issue.  Thanks,

Alex

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-27  9:57         ` Shameerali Kolothum Thodi
@ 2018-02-27 17:13           ` Alex Williamson
  2018-02-28  9:02           ` Auger Eric
  1 sibling, 0 replies; 28+ messages in thread
From: Alex Williamson @ 2018-02-27 17:13 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry,
	xuwei (O),
	Robin Murphy

On Tue, 27 Feb 2018 09:57:16 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:

> > -----Original Message-----
> > From: Auger Eric [mailto:eric.auger@redhat.com]
> > Sent: Tuesday, February 27, 2018 8:27 AM
> > To: Alex Williamson <alex.williamson@redhat.com>
> > Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> > pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> > <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> > <robin.murphy@arm.com>
> > Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> > iova range
> > 
> > Hi,
> > On 27/02/18 00:13, Alex Williamson wrote:  
> > > On Mon, 26 Feb 2018 23:05:43 +0100
> > > Auger Eric <eric.auger@redhat.com> wrote:
> > >  
> > >> Hi Shameer,
> > >>
> > >> [Adding Robin in CC]
> > >> On 21/02/18 13:22, Shameer Kolothum wrote:  
> > >>> This checks and rejects any dma map request outside valid iova
> > >>> range.
> > >>>
> > >>> Signed-off-by: Shameer Kolothum  
> > <shameerali.kolothum.thodi@huawei.com>  
> > >>> ---
> > >>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> > >>>  1 file changed, 22 insertions(+)
> > >>>
> > >>> diff --git a/drivers/vfio/vfio_iommu_type1.c  
> > b/drivers/vfio/vfio_iommu_type1.c  
> > >>> index a80884e..3049393 100644
> > >>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu  
> > *iommu, struct vfio_dma *dma,  
> > >>>  	return ret;
> > >>>  }
> > >>>
> > >>> +/*
> > >>> + * Check dma map request is within a valid iova range
> > >>> + */
> > >>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> > >>> +				dma_addr_t start, dma_addr_t end)
> > >>> +{
> > >>> +	struct list_head *iova = &iommu->iova_list;
> > >>> +	struct vfio_iova *node;
> > >>> +
> > >>> +	list_for_each_entry(node, iova, list) {
> > >>> +		if ((start >= node->start) && (end <= node->end))
> > >>> +			return true;  
> > >> I am now confused by the fact this change will prevent existing QEMU
> > >> from working with this series on some platforms. For instance QEMU virt
> > >> machine GPA space collides with Seattle PCI host bridge windows. On ARM
> > >> the smmu and smmuv3 drivers report the PCI host bridge windows as
> > >> reserved regions which does not seem to be the case on other platforms.
> > >> The change happened in commit  
> > 273df9635385b2156851c7ee49f40658d7bcb29d  
> > >> (iommu/dma: Make PCI window reservation generic).
> > >>
> > >> For background, we already discussed the topic after LPC 2016. See
> > >> https://www.spinics.net/lists/kernel/msg2379607.html.
> > >>
> > >> So is it the right choice to expose PCI host bridge windows as reserved
> > >> regions? If yes, shouldn't we distinguish between those and MSI windows
> > >> in this series, and not reject user-space DMA_MAP attempts within PCI
> > >> host bridge windows?
> > >
> > > If the QEMU machine GPA collides with a reserved region today, then
> > > either:
> > >
> > > a) The mapping through the IOMMU works and the reserved region is wrong
> > >
> > > or
> > >
> > > b) The mapping doesn't actually work, QEMU is at risk of data loss by
> > > being told that it worked, and we're justified in changing that
> > > behavior.
> > >
> > > Without knowing the specifics of SMMU, it doesn't particularly make
> > > sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> > > unless perhaps the IOMMU is incapable of translating those IOVAs.  
> > to me the limitation does not come from the smmu itself, which is a
> > separate HW block sitting between the root complex and the interconnect.
> > If ACS is not enforced by the PCIe subsystem, the transaction will never
> > reach the IOMMU.  
> 
> True. And we do have one such platform where ACS is not enforced, but
> reserving the regions and possibly creating holes while launching the VM
> will make it secure. I do wonder, though, how we will solve the device
> grouping in such cases.

"creating holes... will make it secure", that's worrisome.  The purpose
of an IOVA list is to inform the user which ranges are available to
them to use.  This is informative, not security.  A malicious user can
ask the device to perform DMA anywhere they choose, regardless of what
we report as reserved.  The hardware provides the security, if we're
relying on the user to honor holes, that's only cooperative security,
which is really no security at all.  If the IOMMU groups don't already
reflect this, they're incorrect.
 
> Does the Seattle PCI host bridge windows case you mentioned have any PCI
> quirk claiming that ACS is supported?
>  
> > In the case of such overlap, shouldn't we just warn the end-user that
> > this situation is dangerous instead of forbidding the use case which
> > worked "in most cases" until now.  
> 
> Yes, maybe something similar to the allow_unsafe_interrupts case, if
> that is acceptable.

"allow_unsafe_interrupts" is basically a statement that the admin trusts
their user is not malicious and will not exploit potential DMA
attacks.  I'm having a little bit of trouble equating that to allowing
the user to ignore the list of valid IOVA ranges we've provided.  Why
provide a list at all if it's ignored?  If there's grey area in the
list for things the user can choose to ignore, then the list is
invalid.  Thanks,

Alex
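
The valid-range bookkeeping this exchange keeps returning to (patches 1-3 of the series) amounts to starting from the IOMMU aperture and carving each reserved region out of it. Below is a simplified sketch with hypothetical window addresses; it ignores the attach-time conflict checks against existing DMA mappings that the series also performs:

```python
def build_iova_list(aperture, reserved):
    """Carve reserved [start, end] windows out of the aperture [start, end],
    returning the remaining valid IOVA ranges in ascending order."""
    nodes = [aperture]
    for r_start, r_end in sorted(reserved):
        out = []
        for n_start, n_end in nodes:
            if r_end < n_start or r_start > n_end:
                out.append((n_start, n_end))        # no overlap, keep whole
                continue
            if r_start > n_start:
                out.append((n_start, r_start - 1))  # keep part below window
            if r_end < n_end:
                out.append((r_end + 1, n_end))      # keep part above window
        nodes = out
    return nodes

# Hypothetical SMMU aperture with an MSI window and a PCI bridge window.
aperture = (0x0, 0xFFFFFFFFFFFF)
reserved = [(0x8000000, 0x80FFFFF),     # e.g. MSI doorbell window
            (0xE0000000, 0xEFFFFFFF)]   # e.g. PCI host bridge window
print(build_iova_list(aperture, reserved))  # three disjoint valid ranges
```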

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-27  9:57         ` Shameerali Kolothum Thodi
  2018-02-27 17:13           ` Alex Williamson
@ 2018-02-28  9:02           ` Auger Eric
  2018-02-28  9:25             ` Shameerali Kolothum Thodi
  1 sibling, 1 reply; 28+ messages in thread
From: Auger Eric @ 2018-02-28  9:02 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy

Hi Shameer,

On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
> 
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>>
>> Hi,
>> On 27/02/18 00:13, Alex Williamson wrote:
>>> On Mon, 26 Feb 2018 23:05:43 +0100
>>> Auger Eric <eric.auger@redhat.com> wrote:
>>>
>>>> Hi Shameer,
>>>>
>>>> [Adding Robin in CC]
>>>> On 21/02/18 13:22, Shameer Kolothum wrote:
>>>>> This checks and rejects any dma map request outside valid iova
>>>>> range.
>>>>>
>>>>> Signed-off-by: Shameer Kolothum
>> <shameerali.kolothum.thodi@huawei.com>
>>>>> ---
>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>>>>>  1 file changed, 22 insertions(+)
>>>>>
>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>> b/drivers/vfio/vfio_iommu_type1.c
>>>>> index a80884e..3049393 100644
>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu
>> *iommu, struct vfio_dma *dma,
>>>>>  	return ret;
>>>>>  }
>>>>>
>>>>> +/*
>>>>> + * Check dma map request is within a valid iova range
>>>>> + */
>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>>>>> +				dma_addr_t start, dma_addr_t end)
>>>>> +{
>>>>> +	struct list_head *iova = &iommu->iova_list;
>>>>> +	struct vfio_iova *node;
>>>>> +
>>>>> +	list_for_each_entry(node, iova, list) {
>>>>> +		if ((start >= node->start) && (end <= node->end))
>>>>> +			return true;
>>>> I am now confused by the fact this change will prevent existing QEMU
>>>> from working with this series on some platforms. For instance QEMU virt
>>>> machine GPA space collides with Seattle PCI host bridge windows. On ARM
>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
>>>> reserved regions which does not seem to be the case on other platforms.
>>>> The change happened in commit
>> 273df9635385b2156851c7ee49f40658d7bcb29d
>>>> (iommu/dma: Make PCI window reservation generic).
>>>>
>>>> For background, we already discussed the topic after LPC 2016. See
>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
>>>>
>>>> So is it the right choice to expose PCI host bridge windows as reserved
>>>> regions? If yes, shouldn't we distinguish between those and MSI windows
>>>> in this series, and not reject user-space DMA_MAP attempts within PCI
>>>> host bridge windows?
>>>
>>> If the QEMU machine GPA collides with a reserved region today, then
>>> either:
>>>
>>> a) The mapping through the IOMMU works and the reserved region is wrong
>>>
>>> or
>>>
>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
>>> being told that it worked, and we're justified in changing that
>>> behavior.
>>>
>>> Without knowing the specifics of SMMU, it doesn't particularly make
>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
>>> unless perhaps the IOMMU is incapable of translating those IOVAs.
>> to me the limitation does not come from the smmu itself, which is a
>> separate HW block sitting between the root complex and the interconnect.
>> If ACS is not enforced by the PCIe subsystem, the transaction will never
>> reach the IOMMU.
> 
> True. And we do have one such platform where ACS is not enforced, but
> reserving the regions and possibly creating holes while launching the VM
> will make it secure. I do wonder, though, how we will solve the device
> grouping in such cases.
> 
> Does the Seattle PCI host bridge windows case you mentioned have any PCI
> quirk claiming that ACS is supported?
No, there is none to my knowledge. I am applying Alex's not-yet-upstream
ACS override patch.

Thanks

Eric
>  
>> In the case of such overlap, shouldn't we just warn the end-user that
>> this situation is dangerous instead of forbidding the use case which
>> worked "in most cases" until now.
> 
> Yes, maybe something similar to the allow_unsafe_interrupts case, if
> that is acceptable.
> 
> Thanks,
> Shameer
>  
>>> Are we trying to prevent untranslated p2p with this reserved range?
>>> That's not necessarily a terrible idea, but it seems that doing it for
>>> that purpose would need to be a lot smarter, taking into account ACS
>>> and precisely selecting ranges within the peer address space that would
>>> be untranslated.  Perhaps only populated MMIO within non-ACS
>>> hierarchies.  Thanks,
>>
>> Indeed taking into account the ACS capability would refine the
>> situations where a risk exists.
>>
>> Thanks
>>
>> Eric
>>>
>>> Alex
>>>

* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28  9:02           ` Auger Eric
@ 2018-02-28  9:25             ` Shameerali Kolothum Thodi
  2018-02-28 11:53               ` Auger Eric
  0 siblings, 1 reply; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-02-28  9:25 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy



> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> 
> Hi Shameer,
> 
> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
> >
> >
> >> -----Original Message-----
> >> From: Auger Eric [mailto:eric.auger@redhat.com]
> >>
> >> Hi,
> >> On 27/02/18 00:13, Alex Williamson wrote:
> >>> On Mon, 26 Feb 2018 23:05:43 +0100
> >>> Auger Eric <eric.auger@redhat.com> wrote:
> >>>
> >>>> Hi Shameer,
> >>>>
> >>>> [Adding Robin in CC]
> >>>> On 21/02/18 13:22, Shameer Kolothum wrote:
> >>>>> This checks and rejects any dma map request outside valid iova
> >>>>> range.
> >>>>>
> >>>>> Signed-off-by: Shameer Kolothum
> >> <shameerali.kolothum.thodi@huawei.com>
> >>>>> ---
> >>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >>>>>  1 file changed, 22 insertions(+)
> >>>>>
> >>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
> >> b/drivers/vfio/vfio_iommu_type1.c
> >>>>> index a80884e..3049393 100644
> >>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu
> >> *iommu, struct vfio_dma *dma,
> >>>>>  	return ret;
> >>>>>  }
> >>>>>
> >>>>> +/*
> >>>>> + * Check dma map request is within a valid iova range
> >>>>> + */
> >>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> >>>>> +				dma_addr_t start, dma_addr_t end)
> >>>>> +{
> >>>>> +	struct list_head *iova = &iommu->iova_list;
> >>>>> +	struct vfio_iova *node;
> >>>>> +
> >>>>> +	list_for_each_entry(node, iova, list) {
> >>>>> +		if ((start >= node->start) && (end <= node->end))
> >>>>> +			return true;
> >>>> I am now confused by the fact this change will prevent existing QEMU
> >>>> from working with this series on some platforms. For instance QEMU virt
> >>>> machine GPA space collides with Seattle PCI host bridge windows. On
> ARM
> >>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
> >>>> reserved regions which does not seem to be the case on other platforms.
> >>>> The change happened in commit
> >> 273df9635385b2156851c7ee49f40658d7bcb29d
> >>>> (iommu/dma: Make PCI window reservation generic).
> >>>>
> >>>> For background, we already discussed the topic after LPC 2016. See
> >>>> https://www.spinics.net/lists/kernel/msg2379607.html.
> >>>>
> >>>> So is it the right choice to expose PCI host bridge windows as reserved
> >>>> regions? If yes, shouldn't we distinguish between those and MSI windows
> >>>> in this series, and not reject user-space DMA_MAP attempts within PCI
> >>>> host bridge windows?
> >>>
> >>> If the QEMU machine GPA collides with a reserved region today, then
> >>> either:
> >>>
> >>> a) The mapping through the IOMMU works and the reserved region is
> wrong
> >>>
> >>> or
> >>>
> >>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> >>> being told that it worked, and we're justified in changing that
> >>> behavior.
> >>>
> >>> Without knowing the specifics of SMMU, it doesn't particularly make
> >>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> >>> unless perhaps the IOMMU is incapable of translating those IOVAs.
> >> to me the limitation does not come from the smmu itself, which is a
> >> separate HW block sitting between the root complex and the interconnect.
> >> If ACS is not enforced by the PCIe subsystem, the transaction will never
> >> reach the IOMMU.
> >
> > True. And we do have one such platform where ACS is not enforced, but
> > reserving the regions and possibly creating holes while launching the VM
> > will make it secure. I do wonder, though, how we will solve the device
> > grouping in such cases.
> >
> > Does the Seattle PCI host bridge windows case you mentioned have any PCI
> > quirk claiming that ACS is supported?
> No, there is none to my knowledge. I am applying Alex's not-yet-upstream
> ACS override patch.

Ok. But isn't that patch actually applicable to cases where ACS is really
supported by the hardware but the capability is just not exposed? I am
trying to see whether the argument that we should allow DMA_MAP requests
for this (non-ACS) case, even if the guest GPA conflicts with a reserved
region, holds good. Perhaps the reason it appeared to work before is that
the guest never actually allocated any GPA from the reserved region, or
maybe I am missing something here.

Thanks,
Shameer

> Thanks
> 
> Eric
> >
> >> In the case of such overlap, shouldn't we just warn the end-user that
> >> this situation is dangerous instead of forbidding the use case which
> >> worked "in most cases" until now.
> >
> > Yes, maybe something similar to the allow_unsafe_interrupts case, if
> > that is acceptable.
> >
> > Thanks,
> > Shameer
> >
> >>> Are we trying to prevent untranslated p2p with this reserved range?
> >>> That's not necessarily a terrible idea, but it seems that doing it for
> >>> that purpose would need to be a lot smarter, taking into account ACS
> >>> and precisely selecting ranges within the peer address space that would
> >>> be untranslated.  Perhaps only populated MMIO within non-ACS
> >>> hierarchies.  Thanks,
> >>
> >> Indeed taking into account the ACS capability would refine the
> >> situations where a risk exists.
> >>
> >> Thanks
> >>
> >> Eric
> >>>
> >>> Alex
> >>>

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28  9:25             ` Shameerali Kolothum Thodi
@ 2018-02-28 11:53               ` Auger Eric
  2018-02-28 13:39                 ` Shameerali Kolothum Thodi
  2018-02-28 15:24                 ` Alex Williamson
  0 siblings, 2 replies; 28+ messages in thread
From: Auger Eric @ 2018-02-28 11:53 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy

Hi Shameer,

On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
> 
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>>
>> Hi Shameer,
>>
>> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Auger Eric [mailto:eric.auger@redhat.com]
>>>>
>>>> Hi,
>>>> On 27/02/18 00:13, Alex Williamson wrote:
>>>>> On Mon, 26 Feb 2018 23:05:43 +0100
>>>>> Auger Eric <eric.auger@redhat.com> wrote:
>>>>>
>>>>>> Hi Shameer,
>>>>>>
>>>>>> [Adding Robin in CC]
>>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:
>>>>>>> This checks and rejects any dma map request outside valid iova
>>>>>>> range.
>>>>>>>
>>>>>>> Signed-off-by: Shameer Kolothum
>>>> <shameerali.kolothum.thodi@huawei.com>
>>>>>>> ---
>>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>>>>>>>  1 file changed, 22 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>>>> b/drivers/vfio/vfio_iommu_type1.c
>>>>>>> index a80884e..3049393 100644
>>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu
>>>> *iommu, struct vfio_dma *dma,
>>>>>>>  	return ret;
>>>>>>>  }
>>>>>>>
>>>>>>> +/*
>>>>>>> + * Check dma map request is within a valid iova range
>>>>>>> + */
>>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>>>>>>> +				dma_addr_t start, dma_addr_t end)
>>>>>>> +{
>>>>>>> +	struct list_head *iova = &iommu->iova_list;
>>>>>>> +	struct vfio_iova *node;
>>>>>>> +
>>>>>>> +	list_for_each_entry(node, iova, list) {
>>>>>>> +		if ((start >= node->start) && (end <= node->end))
>>>>>>> +			return true;
>>>>>> I am now confused by the fact this change will prevent existing QEMU
>>>>>> from working with this series on some platforms. For instance QEMU virt
>>>>>> machine GPA space collides with Seattle PCI host bridge windows. On
>> ARM
>>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
>>>>>> reserved regions which does not seem to be the case on other platforms.
>>>>>> The change happened in commit
>>>> 273df9635385b2156851c7ee49f40658d7bcb29d
>>>>>> (iommu/dma: Make PCI window reservation generic).
>>>>>>
>>>>>> For background, we already discussed the topic after LPC 2016. See
>>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
>>>>>>
>>>>>> So is it the right choice to expose PCI host bridge windows as reserved
>>>>>> regions? If yes, shouldn't we distinguish between those and MSI windows
>>>>>> in this series, and not reject user-space DMA_MAP attempts within PCI
>>>>>> host bridge windows?
>>>>>
>>>>> If the QEMU machine GPA collides with a reserved region today, then
>>>>> either:
>>>>>
>>>>> a) The mapping through the IOMMU works and the reserved region is
>> wrong
>>>>>
>>>>> or
>>>>>
>>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
>>>>> being told that it worked, and we're justified in changing that
>>>>> behavior.
>>>>>
>>>>> Without knowing the specifics of SMMU, it doesn't particularly make
>>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
>>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.
>>>> to me the limitation does not come from the smmu itself, which is a
>>>> separate HW block sitting between the root complex and the interconnect.
>>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
>>>> reach the IOMMU.
>>>
>>> True. And we do have one such platform where ACS is not enforced, but
>>> reserving the regions and possibly creating holes while launching the VM
>>> will make it secure. I do wonder, though, how we will solve the device
>>> grouping in such cases.
>>>
>>> Does the Seattle PCI host bridge windows case you mentioned have any PCI
>>> quirk claiming that ACS is supported?
>> No, there is none to my knowledge. I am applying Alex's not-yet-upstream
>> ACS override patch.
> 
> Ok. But isn't that patch actually applicable to cases where ACS is really
> supported by the hardware but the capability is just not exposed?

My understanding is: normally, yes. If you apply the patch while the HW
does not actually support ACS, you fool the kernel into pretending there
is isolation when there is not. I don't know the exact capabilities of
the AMD Seattle HW; I should have looked into this much earlier. If the
HW did support ACS but did not properly expose the capability, we should
have implemented a clean quirk for this platform.


> I am trying to see whether the argument that we should allow DMA_MAP
> requests for this (non-ACS) case, even if the guest GPA conflicts with a
> reserved region, holds good. Perhaps the reason it appeared to work
> before is that the guest never actually allocated any GPA from the
> reserved region, or maybe I am missing something here.

If my understanding is correct, in an ideal world we would report the PCI
host bridge window as reserved only when ACS is not supported. If you
apply the patch and override the ACS status, the DMA_MAP would then be
allowed. If the HW does not actually support ACS, you could face
situations where one EP tries to access a GPA that never reaches the
IOMMU (because it corresponds to the BAR of another downstream EP). The
same can happen at the moment.

Thanks

Eric
> 
> Thanks,
> Shameer
> 
>> Thanks
>>
>> Eric
>>>
>>>> In the case of such overlap, shouldn't we just warn the end-user that
>>>> this situation is dangerous instead of forbidding the use case which
>>>> worked "in most cases" until now.
>>>
>>> Yes, may be something similar to the allow_unsafe_interrupts case, if
>>> that is acceptable.
>>>
>>> Thanks,
>>> Shameer
>>>
>>>>> Are we trying to prevent untranslated p2p with this reserved range?
>>>>> That's not necessarily a terrible idea, but it seems that doing it for
>>>>> that purpose would need to be a lot smarter, taking into account ACS
>>>>> and precisely selecting ranges within the peer address space that would
>>>>> be untranslated.  Perhaps only populated MMIO within non-ACS
>>>>> hierarchies.  Thanks,
>>>>
>>>> Indeed taking into account the ACS capability would refine the
>>>> situations where a risk exists.
>>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>>
>>>>> Alex
>>>>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28 11:53               ` Auger Eric
@ 2018-02-28 13:39                 ` Shameerali Kolothum Thodi
  2018-02-28 15:32                   ` Auger Eric
  2018-02-28 15:24                 ` Alex Williamson
  1 sibling, 1 reply; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-02-28 13:39 UTC (permalink / raw)
  To: Auger Eric, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy

Hi Eric,

> -----Original Message-----
> From: Auger Eric [mailto:eric.auger@redhat.com]
> Sent: Wednesday, February 28, 2018 11:53 AM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> Alex Williamson <alex.williamson@redhat.com>
> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> <robin.murphy@arm.com>
> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> iova range
> 
> Hi Shameer,
> 
> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
> >
> >
> >> -----Original Message-----
> >> From: Auger Eric [mailto:eric.auger@redhat.com]
> >> Sent: Wednesday, February 28, 2018 9:02 AM
> >> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> >> Alex Williamson <alex.williamson@redhat.com>
> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
> Murphy
> >> <robin.murphy@arm.com>
> >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a
> valid
> >> iova range
> >>
> >> Hi Shameer,
> >>
> >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> >>>> Sent: Tuesday, February 27, 2018 8:27 AM
> >>>> To: Alex Williamson <alex.williamson@redhat.com>
> >>>> Cc: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>;
> >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
> >> Murphy
> >>>> <robin.murphy@arm.com>
> >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a
> >> valid
> >>>> iova range
> >>>>
> >>>> Hi,
> >>>> On 27/02/18 00:13, Alex Williamson wrote:
> >>>>> On Mon, 26 Feb 2018 23:05:43 +0100
> >>>>> Auger Eric <eric.auger@redhat.com> wrote:
> >>>>>
> >>>>>> Hi Shameer,
> >>>>>>
> >>>>>> [Adding Robin in CC]
> >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:
> >>>>>>> This checks and rejects any dma map request outside valid iova
> >>>>>>> range.
> >>>>>>>
> >>>>>>> Signed-off-by: Shameer Kolothum
> >>>> <shameerali.kolothum.thodi@huawei.com>
> >>>>>>> ---
> >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >>>>>>>  1 file changed, 22 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
> >>>> b/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> index a80884e..3049393 100644
> >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct
> vfio_iommu
> >>>> *iommu, struct vfio_dma *dma,
> >>>>>>>  	return ret;
> >>>>>>>  }
> >>>>>>>
> >>>>>>> +/*
> >>>>>>> + * Check dma map request is within a valid iova range
> >>>>>>> + */
> >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> >>>>>>> +				dma_addr_t start, dma_addr_t end)
> >>>>>>> +{
> >>>>>>> +	struct list_head *iova = &iommu->iova_list;
> >>>>>>> +	struct vfio_iova *node;
> >>>>>>> +
> >>>>>>> +	list_for_each_entry(node, iova, list) {
> >>>>>>> +		if ((start >= node->start) && (end <= node->end))
> >>>>>>> +			return true;
> >>>>>> I am now confused by the fact this change will prevent existing QEMU
> >>>>>> from working with this series on some platforms. For instance QEMU
> virt
> >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On
> >> ARM
> >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
> >>>>>> reserved regions which does not seem to be the case on other
> platforms.
> >>>>>> The change happened in commit
> >>>> 273df9635385b2156851c7ee49f40658d7bcb29d
> >>>>>> (iommu/dma: Make PCI window reservation generic).
> >>>>>>
> >>>>>> For background, we already discussed the topic after LPC 2016. See
> >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
> >>>>>>
> >>>>>> So is it the right choice to expose PCI host bridge windows as reserved
> >>>>>> regions? If yes shouldn't we make a difference between those and MSI
> >>>>>> windows in this series and do not reject any user space DMA_MAP
> >> attempt
> >>>>>> within PCI host bridge windows.
> >>>>>
> >>>>> If the QEMU machine GPA collides with a reserved region today, then
> >>>>> either:
> >>>>>
> >>>>> a) The mapping through the IOMMU works and the reserved region is
> >> wrong
> >>>>>
> >>>>> or
> >>>>>
> >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> >>>>> being told that it worked, and we're justified in changing that
> >>>>> behavior.
> >>>>>
> >>>>> Without knowing the specifics of SMMU, it doesn't particularly make
> >>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.
> >>>> to me the limitation does not come from the smmu itself, which is a
> >>>> separate HW block sitting between the root complex and the
> interconnect.
> >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
> >>>> reach the IOMMU.
> >>>
> >>> True. And we do have one such platform where ACS is not enforced but
> >>> reserving the regions and possibly creating holes while launching VM will
> >>> make it secure. But I do wonder how we will solve the device grouping
> >>> in such cases.
> >>>
> >>> The Seattle PCI host bridge windows case you mentioned has any pci quirk
> >>> to claim that they support ACS?
> >> No there is none to my knowledge. I am applying Alex' not upstream ACS
> >> overwrite patch.
> >
> > Ok. But isn't that patch actually applicable to cases where ACS is really
> supported
> > by hardware but the capability is not available?
> 
> My understanding is normally yes. If you apply the patch whereas the HW
> practically does not support ACS, then you fool the kernel pretending
> there is isolation whereas there is not. I don't know the exact
> capability of the HW on AMD Seattle and effectively I should have cared
> about it much earlier and if the HW capability were supported and not
> properly exposed we should have implemented a clean quirk for this platform.

Ok. Thanks for the details.
 
> 
>   I am just trying to see whether
> > the argument that we should allow DMA MAP requests for this(non-ACS case)
> > even if the Guest GPA conflict with reserved region holds good. The fact that
> may
> > be it was working before is that the Guest never actually allocated any GPA
> from
> > the reserved region or maybe I am missing something here.
> 
> If my understanding is correct, in ideal world we would report the PCI
> host bridge window as reserved only in case ACS is not supported. If you
> apply the patch and overrides the ACS, then the DMA_MAP  would be
> allowed. In case the HW does not support ACS, then you could face
> situations where one EP tries to access GPA that never reaches the IOMMU
> (because it corresponds to the BAR of another downstream EP). Same can
> happen at the moment.

Yes, this is my understanding too.

Just wondering whether the below change to iommu_dma_get_resv_regions() is
good enough to take care of this issue.

Thanks,
Shameer

-- >8 --
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..b6e89d5 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -34,6 +34,8 @@
 
 #define IOMMU_MAPPING_ERROR	0
 
+#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
+
 struct iommu_dma_msi_page {
 	struct list_head	list;
 	dma_addr_t		iova;
@@ -183,6 +185,9 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 	if (!dev_is_pci(dev))
 		return;
 
+	if (pci_acs_path_enabled(to_pci_dev(dev), NULL, REQ_ACS_FLAGS))
+		return;
+
 	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
 	resource_list_for_each_entry(window, &bridge->windows) {
 		struct iommu_resv_region *region;
---8--


> 
> Eric
> >
> > Thanks,
> > Shameer
> >
> >> Thanks
> >>
> >> Eric
> >>>
> >>>> In the case of such overlap, shouldn't we just warn the end-user that
> >>>> this situation is dangerous instead of forbidding the use case which
> >>>> worked "in most cases" until now.
> >>>
> >>> Yes, may be something similar to the allow_unsafe_interrupts case, if
> >>> that is acceptable.
> >>>
> >>> Thanks,
> >>> Shameer
> >>>
> >>>>> Are we trying to prevent untranslated p2p with this reserved range?
> >>>>> That's not necessarily a terrible idea, but it seems that doing it for
> >>>>> that purpose would need to be a lot smarter, taking into account ACS
> >>>>> and precisely selecting ranges within the peer address space that would
> >>>>> be untranslated.  Perhaps only populated MMIO within non-ACS
> >>>>> hierarchies.  Thanks,
> >>>>
> >>>> Indeed taking into account the ACS capability would refine the
> >>>> situations where a risk exists.
> >>>>
> >>>> Thanks
> >>>>
> >>>> Eric
> >>>>>
> >>>>> Alex
> >>>>>

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28 11:53               ` Auger Eric
  2018-02-28 13:39                 ` Shameerali Kolothum Thodi
@ 2018-02-28 15:24                 ` Alex Williamson
  2018-03-02 13:19                   ` Shameerali Kolothum Thodi
  1 sibling, 1 reply; 28+ messages in thread
From: Alex Williamson @ 2018-02-28 15:24 UTC (permalink / raw)
  To: Auger Eric
  Cc: Shameerali Kolothum Thodi, pmorel, kvm, linux-kernel, Linuxarm,
	John Garry, xuwei (O),
	Robin Murphy

On Wed, 28 Feb 2018 12:53:27 +0100
Auger Eric <eric.auger@redhat.com> wrote:

> Hi Shameer,
> 
> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
> > 
> >   
> >> -----Original Message-----
> >> From: Auger Eric [mailto:eric.auger@redhat.com]
> >> Sent: Wednesday, February 28, 2018 9:02 AM
> >> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> >> Alex Williamson <alex.williamson@redhat.com>
> >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> >> <robin.murphy@arm.com>
> >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> >> iova range
> >>
> >> Hi Shameer,
> >>
> >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:  
> >>>
> >>>  
> >>>> -----Original Message-----
> >>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> >>>> Sent: Tuesday, February 27, 2018 8:27 AM
> >>>> To: Alex Williamson <alex.williamson@redhat.com>
> >>>> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  
> >> Murphy  
> >>>> <robin.murphy@arm.com>
> >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a  
> >> valid  
> >>>> iova range
> >>>>
> >>>> Hi,
> >>>> On 27/02/18 00:13, Alex Williamson wrote:  
> >>>>> On Mon, 26 Feb 2018 23:05:43 +0100
> >>>>> Auger Eric <eric.auger@redhat.com> wrote:
> >>>>>  
> >>>>>> Hi Shameer,
> >>>>>>
> >>>>>> [Adding Robin in CC]
> >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:  
> >>>>>>> This checks and rejects any dma map request outside valid iova
> >>>>>>> range.
> >>>>>>>
> >>>>>>> Signed-off-by: Shameer Kolothum  
> >>>> <shameerali.kolothum.thodi@huawei.com>  
> >>>>>>> ---
> >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> >>>>>>>  1 file changed, 22 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c  
> >>>> b/drivers/vfio/vfio_iommu_type1.c  
> >>>>>>> index a80884e..3049393 100644
> >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct vfio_iommu  
> >>>> *iommu, struct vfio_dma *dma,  
> >>>>>>>  	return ret;
> >>>>>>>  }
> >>>>>>>
> >>>>>>> +/*
> >>>>>>> + * Check dma map request is within a valid iova range
> >>>>>>> + */
> >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
> >>>>>>> +				dma_addr_t start, dma_addr_t end)
> >>>>>>> +{
> >>>>>>> +	struct list_head *iova = &iommu->iova_list;
> >>>>>>> +	struct vfio_iova *node;
> >>>>>>> +
> >>>>>>> +	list_for_each_entry(node, iova, list) {
> >>>>>>> +		if ((start >= node->start) && (end <= node->end))
> >>>>>>> +			return true;  
> >>>>>> I am now confused by the fact this change will prevent existing QEMU
> >>>>>> from working with this series on some platforms. For instance QEMU virt
> >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On  
> >> ARM  
> >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
> >>>>>> reserved regions which does not seem to be the case on other platforms.
> >>>>>> The change happened in commit  
> >>>> 273df9635385b2156851c7ee49f40658d7bcb29d  
> >>>>>> (iommu/dma: Make PCI window reservation generic).
> >>>>>>
> >>>>>> For background, we already discussed the topic after LPC 2016. See
> >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
> >>>>>>
> >>>>>> So is it the right choice to expose PCI host bridge windows as reserved
> >>>>>> regions? If yes shouldn't we make a difference between those and MSI
> >>>>>> windows in this series and do not reject any user space DMA_MAP  
> >> attempt  
> >>>>>> within PCI host bridge windows.  
> >>>>>
> >>>>> If the QEMU machine GPA collides with a reserved region today, then
> >>>>> either:
> >>>>>
> >>>>> a) The mapping through the IOMMU works and the reserved region is  
> >> wrong  
> >>>>>
> >>>>> or
> >>>>>
> >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> >>>>> being told that it worked, and we're justified in changing that
> >>>>> behavior.
> >>>>>
> >>>>> Without knowing the specifics of SMMU, it doesn't particularly make
> >>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
> >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.  
> >>>> to me the limitation does not come from the smmu itself, which is a
> >>>> separate HW block sitting between the root complex and the interconnect.
> >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
> >>>> reach the IOMMU.  
> >>>
> >>> True. And we do have one such platform where ACS is not enforced but
> >>> reserving the regions and possibly creating holes while launching VM will
> >>> make it secure. But I do wonder how we will solve the device grouping
> >>> in such cases.
> >>>
> >>> The Seattle PCI host bridge windows case you mentioned has any pci quirk
> >>> to claim that they support ACS?  
> >> No there is none to my knowledge. I am applying Alex' not upstream ACS
> >> overwrite patch.  
> > 
> > Ok. But isn't that patch actually applicable to cases where ACS is really supported
> > by hardware but the capability is not available?  
> 
> My understanding is normally yes. If you apply the patch whereas the HW
> practically does not support ACS, then you fool the kernel pretending
> there is isolation whereas there is not. I don't know the exact
> capability of the HW on AMD Seattle and effectively I should have cared
> about it much earlier and if the HW capability were supported and not
> properly exposed we should have implemented a clean quirk for this platform.

Yes, there's a reason why the ACS override patch is not upstream, and
any downstream distribution that actually intends to support their
customers should not include it.  If we can verify with the hardware
vendor that ACS equivalent isolation exists, we should be putting
quirks into the kernel to enable it automatically.  Supporting users
using the ACS override patch is a non-goal.

>   I am just trying to see whether
> > the argument that we should allow DMA MAP requests for this(non-ACS case)
> > even if the Guest GPA conflict with reserved region holds good. The fact that may
> > be it was working before is that the Guest never actually allocated any GPA from
> > the reserved region or maybe I am missing something here.  
> 
> If my understanding is correct, in ideal world we would report the PCI
> host bridge window as reserved only in case ACS is not supported. If you
> apply the patch and overrides the ACS, then the DMA_MAP  would be
> allowed. In case the HW does not support ACS, then you could face
> situations where one EP tries to access GPA that never reaches the IOMMU
> (because it corresponds to the BAR of another downstream EP). Same can
> happen at the moment.

I still think you're overstretching the IOMMU reserved region interface
by trying to report possible ACS conflicts.  Are we going to update the
reserved list on device hotplug?  Are we going to update the list when
MMIO is enabled or disabled for each device?  What if the BARs are
reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
should account for specific IOVA exclusions that apply to transactions
that actually reach the IOMMU, not aliasing that might occur in the
downstream topology.  Additionally, the IOMMU group composition must be
such that these sorts of aliasing issues can only occur within an IOMMU
group.  If a transaction can be affected by the MMIO programming of
another group, then the groups are drawn incorrectly for the isolation
capabilities of the hardware.  Thanks,

Alex

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28 13:39                 ` Shameerali Kolothum Thodi
@ 2018-02-28 15:32                   ` Auger Eric
  0 siblings, 0 replies; 28+ messages in thread
From: Auger Eric @ 2018-02-28 15:32 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy

Hi Shameer,

On 28/02/18 14:39, Shameerali Kolothum Thodi wrote:
> Hi Eric,
> 
>> -----Original Message-----
>> From: Auger Eric [mailto:eric.auger@redhat.com]
>> Sent: Wednesday, February 28, 2018 11:53 AM
>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>> Alex Williamson <alex.williamson@redhat.com>
>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
>> <robin.murphy@arm.com>
>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
>> iova range
>>
>> Hi Shameer,
>>
>> On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Auger Eric [mailto:eric.auger@redhat.com]
>>>> Sent: Wednesday, February 28, 2018 9:02 AM
>>>> To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
>>>> Alex Williamson <alex.williamson@redhat.com>
>>>> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
>>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
>>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
>> Murphy
>>>> <robin.murphy@arm.com>
>>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a
>> valid
>>>> iova range
>>>>
>>>> Hi Shameer,
>>>>
>>>> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Auger Eric [mailto:eric.auger@redhat.com]
>>>>>> Sent: Tuesday, February 27, 2018 8:27 AM
>>>>>> To: Alex Williamson <alex.williamson@redhat.com>
>>>>>> Cc: Shameerali Kolothum Thodi
>> <shameerali.kolothum.thodi@huawei.com>;
>>>>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
>>>>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
>>>>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
>>>> Murphy
>>>>>> <robin.murphy@arm.com>
>>>>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a
>>>> valid
>>>>>> iova range
>>>>>>
>>>>>> Hi,
>>>>>> On 27/02/18 00:13, Alex Williamson wrote:
>>>>>>> On Mon, 26 Feb 2018 23:05:43 +0100
>>>>>>> Auger Eric <eric.auger@redhat.com> wrote:
>>>>>>>
>>>>>>>> Hi Shameer,
>>>>>>>>
>>>>>>>> [Adding Robin in CC]
>>>>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:
>>>>>>>>> This checks and rejects any dma map request outside valid iova
>>>>>>>>> range.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Shameer Kolothum
>>>>>> <shameerali.kolothum.thodi@huawei.com>
>>>>>>>>> ---
>>>>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
>>>>>>>>>  1 file changed, 22 insertions(+)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
>>>>>> b/drivers/vfio/vfio_iommu_type1.c
>>>>>>>>> index a80884e..3049393 100644
>>>>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct
>> vfio_iommu
>>>>>> *iommu, struct vfio_dma *dma,
>>>>>>>>>  	return ret;
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> +/*
>>>>>>>>> + * Check dma map request is within a valid iova range
>>>>>>>>> + */
>>>>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
>>>>>>>>> +				dma_addr_t start, dma_addr_t end)
>>>>>>>>> +{
>>>>>>>>> +	struct list_head *iova = &iommu->iova_list;
>>>>>>>>> +	struct vfio_iova *node;
>>>>>>>>> +
>>>>>>>>> +	list_for_each_entry(node, iova, list) {
>>>>>>>>> +		if ((start >= node->start) && (end <= node->end))
>>>>>>>>> +			return true;
>>>>>>>> I am now confused by the fact this change will prevent existing QEMU
>>>>>>>> from working with this series on some platforms. For instance QEMU
>> virt
>>>>>>>> machine GPA space collides with Seattle PCI host bridge windows. On
>>>> ARM
>>>>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
>>>>>>>> reserved regions which does not seem to be the case on other
>> platforms.
>>>>>>>> The change happened in commit
>>>>>> 273df9635385b2156851c7ee49f40658d7bcb29d
>>>>>>>> (iommu/dma: Make PCI window reservation generic).
>>>>>>>>
>>>>>>>> For background, we already discussed the topic after LPC 2016. See
>>>>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
>>>>>>>>
>>>>>>>> So is it the right choice to expose PCI host bridge windows as reserved
>>>>>>>> regions? If yes shouldn't we make a difference between those and MSI
>>>>>>>> windows in this series and do not reject any user space DMA_MAP
>>>> attempt
>>>>>>>> within PCI host bridge windows.
>>>>>>>
>>>>>>> If the QEMU machine GPA collides with a reserved region today, then
>>>>>>> either:
>>>>>>>
>>>>>>> a) The mapping through the IOMMU works and the reserved region is
>>>> wrong
>>>>>>>
>>>>>>> or
>>>>>>>
>>>>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
>>>>>>> being told that it worked, and we're justified in changing that
>>>>>>> behavior.
>>>>>>>
>>>>>>> Without knowing the specifics of SMMU, it doesn't particularly make
>>>>>>> sense to me to mark the entire PCI hierarchy MMIO range as reserved,
>>>>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.
>>>>>> to me the limitation does not come from the smmu itself, which is a
>>>>>> separate HW block sitting between the root complex and the
>> interconnect.
>>>>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
>>>>>> reach the IOMMU.
>>>>>
>>>>> True. And we do have one such platform where ACS is not enforced but
>>>>> reserving the regions and possibly creating holes while launching VM will
>>>>> make it secure. But I do wonder how we will solve the device grouping
>>>>> in such cases.
>>>>>
>>>>> The Seattle PCI host bridge windows case you mentioned has any pci quirk
>>>>> to claim that they support ACS?
>>>> No there is none to my knowledge. I am applying Alex' not upstream ACS
>>>> overwrite patch.
>>>
>>> Ok. But isn't that patch actually applicable to cases where ACS is really
>> supported
>>> by hardware but the capability is not available?
>>
>> My understanding is normally yes. If you apply the patch whereas the HW
>> practically does not support ACS, then you fool the kernel pretending
>> there is isolation whereas there is not. I don't know the exact
>> capability of the HW on AMD Seattle and effectively I should have cared
>> about it much earlier and if the HW capability were supported and not
>> properly exposed we should have implemented a clean quirk for this platform.
> 
> Ok. Thanks for the details.
>  
>>
>>   I am just trying to see whether
>>> the argument that we should allow DMA MAP requests for this(non-ACS case)
>>> even if the Guest GPA conflict with reserved region holds good. The fact that
>> may
>>> be it was working before is that the Guest never actually allocated any GPA
>> from
>>> the reserved region or maybe I am missing something here.
>>
>> If my understanding is correct, in ideal world we would report the PCI
>> host bridge window as reserved only in case ACS is not supported. If you
>> apply the patch and overrides the ACS, then the DMA_MAP  would be
>> allowed. In case the HW does not support ACS, then you could face
>> situations where one EP tries to access GPA that never reaches the IOMMU
>> (because it corresponds to the BAR of another downstream EP). Same can
>> happen at the moment.
> 
> Yes, this is my understanding too.
> 
> Just wondering the below changes to the iommu_dma_get_resv_regions() is
> good enough to take care this issue or not.

This looks sensible to me. The only question I have is whether this ACS
computation result can be altered later on for some reason.

Otherwise I confirm this fixes the reported reserved regions (no more
PCI host bridge windows) and the assignment failure on Cavium CN88xx,
whose assignment would otherwise be affected too.

Thanks

Eric
> 
> Thanks,
> Shameer
> 
> -- >8 --
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index f05f3cf..b6e89d5 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -34,6 +34,8 @@
>  
>  #define IOMMU_MAPPING_ERROR	0
>  
> +#define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
> +
>  struct iommu_dma_msi_page {
>  	struct list_head	list;
>  	dma_addr_t		iova;
> @@ -183,6 +185,9 @@ void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
>  	if (!dev_is_pci(dev))
>  		return;
>  
> +	if (pci_acs_path_enabled(to_pci_dev(dev), NULL, REQ_ACS_FLAGS))
> +		return;
> +
>  	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
>  	resource_list_for_each_entry(window, &bridge->windows) {
>  		struct iommu_resv_region *region;
> ---8--
> 
> 
>>
>> Eric
>>>
>>> Thanks,
>>> Shameer
>>>
>>>> Thanks
>>>>
>>>> Eric
>>>>>
>>>>>> In the case of such overlap, shouldn't we just warn the end-user that
>>>>>> this situation is dangerous instead of forbidding the use case which
>>>>>> worked "in most cases" until now.
>>>>>
>>>>> Yes, may be something similar to the allow_unsafe_interrupts case, if
>>>>> that is acceptable.
>>>>>
>>>>> Thanks,
>>>>> Shameer
>>>>>
>>>>>>> Are we trying to prevent untranslated p2p with this reserved range?
>>>>>>> That's not necessarily a terrible idea, but it seems that doing it for
>>>>>>> that purpose would need to be a lot smarter, taking into account ACS
>>>>>>> and precisely selecting ranges within the peer address space that would
>>>>>>> be untranslated.  Perhaps only populated MMIO within non-ACS
>>>>>>> hierarchies.  Thanks,
>>>>>>
>>>>>> Indeed taking into account the ACS capability would refine the
>>>>>> situations where a risk exists.
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>> Eric
>>>>>>>
>>>>>>> Alex
>>>>>>>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-02-28 15:24                 ` Alex Williamson
@ 2018-03-02 13:19                   ` Shameerali Kolothum Thodi
  2018-03-02 16:04                     ` Alex Williamson
  0 siblings, 1 reply; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-03-02 13:19 UTC (permalink / raw)
  To: Alex Williamson, Auger Eric
  Cc: pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O), Robin Murphy


> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, February 28, 2018 3:24 PM
> To: Auger Eric <eric.auger@redhat.com>
> Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> <robin.murphy@arm.com>
> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> iova range
> 
> On Wed, 28 Feb 2018 12:53:27 +0100
> Auger Eric <eric.auger@redhat.com> wrote:
> 
> > Hi Shameer,
> >
> > On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:
> > >
> > >
> > >> -----Original Message-----
> > >> From: Auger Eric [mailto:eric.auger@redhat.com]
> > >> Sent: Wednesday, February 28, 2018 9:02 AM
> > >> To: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>;
> > >> Alex Williamson <alex.williamson@redhat.com>
> > >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> > >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
> Murphy
> > >> <robin.murphy@arm.com>
> > >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a
> valid
> > >> iova range
> > >>
> > >> Hi Shameer,
> > >>
> > >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:
> > >>>
> > >>>
> > >>>> -----Original Message-----
> > >>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> > >>>> Sent: Tuesday, February 27, 2018 8:27 AM
> > >>>> To: Alex Williamson <alex.williamson@redhat.com>
> > >>>> Cc: Shameerali Kolothum Thodi
> <shameerali.kolothum.thodi@huawei.com>;
> > >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John
> Garry
> > >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin
> > >> Murphy
> > >>>> <robin.murphy@arm.com>
> > >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within
> a
> > >> valid
> > >>>> iova range
> > >>>>
> > >>>> Hi,
> > >>>> On 27/02/18 00:13, Alex Williamson wrote:
> > >>>>> On Mon, 26 Feb 2018 23:05:43 +0100
> > >>>>> Auger Eric <eric.auger@redhat.com> wrote:
> > >>>>>
> > >>>>>> Hi Shameer,
> > >>>>>>
> > >>>>>> [Adding Robin in CC]
> > >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:
> > >>>>>>> This checks and rejects any dma map request outside valid iova
> > >>>>>>> range.
> > >>>>>>>
> > >>>>>>> Signed-off-by: Shameer Kolothum
> > >>>> <shameerali.kolothum.thodi@huawei.com>
> > >>>>>>> ---
> > >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> > >>>>>>>  1 file changed, 22 insertions(+)
> > >>>>>>>
> > >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c
> > >>>> b/drivers/vfio/vfio_iommu_type1.c
> > >>>>>>> index a80884e..3049393 100644
> > >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct
> vfio_iommu
> > >>>> *iommu, struct vfio_dma *dma,
> > >>>>>>>  	return ret;
> > >>>>>>>  }
> > >>>>>>>
> > >>>>>>> +/*
> > >>>>>>> + * Check dma map request is within a valid iova range
> > >>>>>>> + */
> > >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu
> *iommu,
> > >>>>>>> +				dma_addr_t start, dma_addr_t end)
> > >>>>>>> +{
> > >>>>>>> +	struct list_head *iova = &iommu->iova_list;
> > >>>>>>> +	struct vfio_iova *node;
> > >>>>>>> +
> > >>>>>>> +	list_for_each_entry(node, iova, list) {
> > >>>>>>> +		if ((start >= node->start) && (end <= node->end))
> > >>>>>>> +			return true;
> > >>>>>> I am now confused by the fact this change will prevent existing QEMU
> > >>>>>> from working with this series on some platforms. For instance QEMU
> virt
> > >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On
> > >> ARM
> > >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
> > >>>>>> reserved regions which does not seem to be the case on other
> platforms.
> > >>>>>> The change happened in commit
> > >>>> 273df9635385b2156851c7ee49f40658d7bcb29d
> > >>>>>> (iommu/dma: Make PCI window reservation generic).
> > >>>>>>
> > >>>>>> For background, we already discussed the topic after LPC 2016. See
> > >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
> > >>>>>>
> > >>>>>> So is it the right choice to expose PCI host bridge windows as reserved
> > >>>>>> regions? If yes shouldn't we make a difference between those and
> MSI
> > >>>>>> windows in this series and do not reject any user space DMA_MAP
> > >> attempt
> > >>>>>> within PCI host bridge windows.
> > >>>>>
> > >>>>> If the QEMU machine GPA collides with a reserved region today, then
> > >>>>> either:
> > >>>>>
> > >>>>> a) The mapping through the IOMMU works and the reserved region is
> > >> wrong
> > >>>>>
> > >>>>> or
> > >>>>>
> > >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> > >>>>> being told that it worked, and we're justified in changing that
> > >>>>> behavior.
> > >>>>>
> > >>>>> Without knowing the specifics of SMMU, it doesn't particularly make
> > >>>>> sense to me to mark the entire PCI hierarchy MMIO range as
> reserved,
> > >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.
> > >>>> to me the limitation does not come from the smmu itself, which is a
> > >>>> separate HW block sitting between the root complex and the
> interconnect.
> > >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
> > >>>> reach the IOMMU.
> > >>>
> > >>> True. And we do have one such platform where ACS is not enforced but
> > >>> reserving the regions and possibly creating holes while launching VM will
> > >>> make it secure. But I do wonder how we will solve the device grouping
> > >>> in such cases.
> > >>>
> > >>> The Seattle PCI host bridge windows case you mentioned has any pci
> quirk
> > >>> to claim that they support ACS?
> > >> No there is none to my knowledge. I am applying Alex' not upstream ACS
> > >> overwrite patch.
> > >
> > > Ok. But isn't that patch actually applicable to cases where ACS is really
> supported
> > > by hardware but the capability is not available?
> >
> > My understanding is normally yes. If you apply the patch whereas the HW
> > practically does not support ACS, then you fool the kernel pretending
> > there is isolation whereas there is not. I don't know the exact
> > capability of the HW on AMD Seattle and effectively I should have cared
> > about it much earlier and if the HW capability were supported and not
> > properly exposed we should have implemented a clean quirk for this
> platform.
> 
> Yes, there's a reason why the ACS override patch is not upstream, and
> any downstream distribution that actually intends to support their
> customers should not include it.  If we can verify with the hardware
> vendor that ACS equivalent isolation exists, we should be putting
> quirks into the kernel to enable it automatically.  Supporting users
> using the ACS override patch is a non-goal.
> 
> >   I am just trying to see whether
> > > the argument that we should allow DMA MAP requests for this(non-ACS
> case)
> > > even if the Guest GPA conflict with reserved region holds good. The fact
> that may
> > > be it was working before is that the Guest never actually allocated any GPA
> from
> > > the reserved region or maybe I am missing something here.
> >
> > If my understanding is correct, in ideal world we would report the PCI
> > host bridge window as reserved only in case ACS is not supported. If you
> > apply the patch and overrides the ACS, then the DMA_MAP  would be
> > allowed. In case the HW does not support ACS, then you could face
> > situations where one EP tries to access GPA that never reaches the IOMMU
> > (because it corresponds to the BAR of another downstream EP). Same can
> > happen at the moment.
> 
> I still think you're overstretching the IOMMU reserved region interface
> by trying to report possible ACS conflicts.  Are we going to update the
> reserved list on device hotplug?  Are we going to update the list when
> MMIO is enabled or disabled for each device?  What if the BARs are
> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
> should account for specific IOVA exclusions that apply to transactions
> that actually reach the IOMMU, not aliasing that might occur in the
> downstream topology.  Additionally, the IOMMU group composition must be
> such that these sorts of aliasing issues can only occur within an IOMMU
> group.  If a transaction can be affected by the MMIO programming of
> another group, then the groups are drawn incorrectly for the isolation
> capabilities of the hardware.  Thanks,

Agree that this is not a perfect thing to do covering all scenarios. As Robin
pointed out, the current code is playing safe by reserving the full windows.

My suggestion is to limit this reservation to non-ACS cases only.  This will
make sure that ACS-capable ARM hardware is not broken by this series, and
non-ACS ones have a chance to run once QEMU is updated to take care of the
reserved regions (at least in some user scenarios).

If you all are fine with this, I can include the ACS check I mentioned earlier in
iommu_dma_get_resv_regions() and send out the revised series.

Please let me know your thoughts.

Thanks,
Shameer

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-03-02 13:19                   ` Shameerali Kolothum Thodi
@ 2018-03-02 16:04                     ` Alex Williamson
  2018-03-02 17:16                       ` Robin Murphy
  0 siblings, 1 reply; 28+ messages in thread
From: Alex Williamson @ 2018-03-02 16:04 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry,
	xuwei (O),
	Robin Murphy

On Fri, 2 Mar 2018 13:19:58 +0000
Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com> wrote:

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Wednesday, February 28, 2018 3:24 PM
> > To: Auger Eric <eric.auger@redhat.com>
> > Cc: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@huawei.com>;
> > pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> > <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin Murphy
> > <robin.murphy@arm.com>
> > Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> > iova range
> > 
> > On Wed, 28 Feb 2018 12:53:27 +0100
> > Auger Eric <eric.auger@redhat.com> wrote:
> >   
> > > Hi Shameer,
> > >
> > > On 28/02/18 10:25, Shameerali Kolothum Thodi wrote:  
> > > >
> > > >  
> > > >> -----Original Message-----
> > > >> From: Auger Eric [mailto:eric.auger@redhat.com]
> > > >> Sent: Wednesday, February 28, 2018 9:02 AM
> > > >> To: Shameerali Kolothum Thodi  
> > <shameerali.kolothum.thodi@huawei.com>;  
> > > >> Alex Williamson <alex.williamson@redhat.com>
> > > >> Cc: pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > > >> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John Garry
> > > >> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  
> > Murphy  
> > > >> <robin.murphy@arm.com>
> > > >> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a  
> > valid  
> > > >> iova range
> > > >>
> > > >> Hi Shameer,
> > > >>
> > > >> On 27/02/18 10:57, Shameerali Kolothum Thodi wrote:  
> > > >>>
> > > >>>  
> > > >>>> -----Original Message-----
> > > >>>> From: Auger Eric [mailto:eric.auger@redhat.com]
> > > >>>> Sent: Tuesday, February 27, 2018 8:27 AM
> > > >>>> To: Alex Williamson <alex.williamson@redhat.com>
> > > >>>> Cc: Shameerali Kolothum Thodi  
> > <shameerali.kolothum.thodi@huawei.com>;  
> > > >>>> pmorel@linux.vnet.ibm.com; kvm@vger.kernel.org; linux-
> > > >>>> kernel@vger.kernel.org; Linuxarm <linuxarm@huawei.com>; John  
> > Garry  
> > > >>>> <john.garry@huawei.com>; xuwei (O) <xuwei5@huawei.com>; Robin  
> > > >> Murphy  
> > > >>>> <robin.murphy@arm.com>
> > > >>>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within  
> > a  
> > > >> valid  
> > > >>>> iova range
> > > >>>>
> > > >>>> Hi,
> > > >>>> On 27/02/18 00:13, Alex Williamson wrote:  
> > > >>>>> On Mon, 26 Feb 2018 23:05:43 +0100
> > > >>>>> Auger Eric <eric.auger@redhat.com> wrote:
> > > >>>>>  
> > > >>>>>> Hi Shameer,
> > > >>>>>>
> > > >>>>>> [Adding Robin in CC]
> > > >>>>>> On 21/02/18 13:22, Shameer Kolothum wrote:  
> > > >>>>>>> This checks and rejects any dma map request outside valid iova
> > > >>>>>>> range.
> > > >>>>>>>
> > > >>>>>>> Signed-off-by: Shameer Kolothum  
> > > >>>> <shameerali.kolothum.thodi@huawei.com>  
> > > >>>>>>> ---
> > > >>>>>>>  drivers/vfio/vfio_iommu_type1.c | 22 ++++++++++++++++++++++
> > > >>>>>>>  1 file changed, 22 insertions(+)
> > > >>>>>>>
> > > >>>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c  
> > > >>>> b/drivers/vfio/vfio_iommu_type1.c  
> > > >>>>>>> index a80884e..3049393 100644
> > > >>>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
> > > >>>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
> > > >>>>>>> @@ -970,6 +970,23 @@ static int vfio_pin_map_dma(struct  
> > vfio_iommu  
> > > >>>> *iommu, struct vfio_dma *dma,  
> > > >>>>>>>  	return ret;
> > > >>>>>>>  }
> > > >>>>>>>
> > > >>>>>>> +/*
> > > >>>>>>> + * Check dma map request is within a valid iova range
> > > >>>>>>> + */
> > > >>>>>>> +static bool vfio_iommu_iova_dma_valid(struct vfio_iommu  
> > *iommu,  
> > > >>>>>>> +				dma_addr_t start, dma_addr_t end)
> > > >>>>>>> +{
> > > >>>>>>> +	struct list_head *iova = &iommu->iova_list;
> > > >>>>>>> +	struct vfio_iova *node;
> > > >>>>>>> +
> > > >>>>>>> +	list_for_each_entry(node, iova, list) {
> > > >>>>>>> +		if ((start >= node->start) && (end <= node->end))
> > > >>>>>>> +			return true;  
> > > >>>>>> I am now confused by the fact this change will prevent existing QEMU
> > > >>>>>> from working with this series on some platforms. For instance QEMU  
> > virt  
> > > >>>>>> machine GPA space collides with Seattle PCI host bridge windows. On  
> > > >> ARM  
> > > >>>>>> the smmu and smmuv3 drivers report the PCI host bridge windows as
> > > >>>>>> reserved regions which does not seem to be the case on other  
> > platforms.  
> > > >>>>>> The change happened in commit  
> > > >>>> 273df9635385b2156851c7ee49f40658d7bcb29d  
> > > >>>>>> (iommu/dma: Make PCI window reservation generic).
> > > >>>>>>
> > > >>>>>> For background, we already discussed the topic after LPC 2016. See
> > > >>>>>> https://www.spinics.net/lists/kernel/msg2379607.html.
> > > >>>>>>
> > > >>>>>> So is it the right choice to expose PCI host bridge windows as reserved
> > > >>>>>> regions? If yes shouldn't we make a difference between those and  
> > MSI  
> > > >>>>>> windows in this series and do not reject any user space DMA_MAP  
> > > >> attempt  
> > > >>>>>> within PCI host bridge windows.  
> > > >>>>>
> > > >>>>> If the QEMU machine GPA collides with a reserved region today, then
> > > >>>>> either:
> > > >>>>>
> > > >>>>> a) The mapping through the IOMMU works and the reserved region is  
> > > >> wrong  
> > > >>>>>
> > > >>>>> or
> > > >>>>>
> > > >>>>> b) The mapping doesn't actually work, QEMU is at risk of data loss by
> > > >>>>> being told that it worked, and we're justified in changing that
> > > >>>>> behavior.
> > > >>>>>
> > > >>>>> Without knowing the specifics of SMMU, it doesn't particularly make
> > > >>>>> sense to me to mark the entire PCI hierarchy MMIO range as  
> > reserved,  
> > > >>>>> unless perhaps the IOMMU is incapable of translating those IOVAs.  
> > > >>>> to me the limitation does not come from the smmu itself, which is a
> > > >>>> separate HW block sitting between the root complex and the  
> > interconnect.  
> > > >>>> If ACS is not enforced by the PCIe subsystem, the transaction will never
> > > >>>> reach the IOMMU.  
> > > >>>
> > > >>> True. And we do have one such platform where ACS is not enforced but
> > > >>> reserving the regions and possibly creating holes while launching VM will
> > > >>> make it secure. But I do wonder how we will solve the device grouping
> > > >>> in such cases.
> > > >>>
> > > >>> The Seattle PCI host bridge windows case you mentioned has any pci  
> > quirk  
> > > >>> to claim that they support ACS?  
> > > >> No there is none to my knowledge. I am applying Alex' not upstream ACS
> > > >> overwrite patch.  
> > > >
> > > > Ok. But isn't that patch actually applicable to cases where ACS is really  
> > supported  
> > > > by hardware but the capability is not available?  
> > >
> > > My understanding is normally yes. If you apply the patch whereas the HW
> > > practically does not support ACS, then you fool the kernel pretending
> > > there is isolation whereas there is not. I don't know the exact
> > > capability of the HW on AMD Seattle and effectively I should have cared
> > > about it much earlier and if the HW capability were supported and not
> > > properly exposed we should have implemented a clean quirk for this  
> > platform.
> > 
> > Yes, there's a reason why the ACS override patch is not upstream, and
> > any downstream distribution that actually intends to support their
> > customers should not include it.  If we can verify with the hardware
> > vendor that ACS equivalent isolation exists, we should be putting
> > quirks into the kernel to enable it automatically.  Supporting users
> > using the ACS override patch is a non-goal.
> >   
> > >   I am just trying to see whether  
> > > > the argument that we should allow DMA MAP requests for this(non-ACS  
> > case)  
> > > > even if the Guest GPA conflict with reserved region holds good. The fact  
> > that may  
> > > > be it was working before is that the Guest never actually allocated any GPA  
> > from  
> > > > the reserved region or maybe I am missing something here.  
> > >
> > > If my understanding is correct, in ideal world we would report the PCI
> > > host bridge window as reserved only in case ACS is not supported. If you
> > > apply the patch and overrides the ACS, then the DMA_MAP  would be
> > > allowed. In case the HW does not support ACS, then you could face
> > > situations where one EP tries to access GPA that never reaches the IOMMU
> > > (because it corresponds to the BAR of another downstream EP). Same can
> > > happen at the moment.  
> > 
> > I still think you're overstretching the IOMMU reserved region interface
> > by trying to report possible ACS conflicts.  Are we going to update the
> > reserved list on device hotplug?  Are we going to update the list when
> > MMIO is enabled or disabled for each device?  What if the BARs are
> > reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
> > should account for specific IOVA exclusions that apply to transactions
> > that actually reach the IOMMU, not aliasing that might occur in the
> > downstream topology.  Additionally, the IOMMU group composition must be
> > such that these sorts of aliasing issues can only occur within an IOMMU
> > group.  If a transaction can be affected by the MMIO programming of
> > another group, then the groups are drawn incorrectly for the isolation
> > capabilities of the hardware.  Thanks,  
> 
> Agree that this is not a perfect thing to do covering all scenarios. As Robin
> pointed out, the current code is playing safe by reserving the full windows.
> 
> My suggestion is to limit this reservation to non-ACS cases only.  This will
> make sure that ACS-capable ARM hardware is not broken by this series, and
> non-ACS ones have a chance to run once QEMU is updated to take care of the
> reserved regions (at least in some user scenarios).
> 
> If you all are fine with this, I can include the ACS check I mentioned earlier in
> iommu_dma_get_resv_regions() and send out the revised series.
> 
> Please let me know your thoughts.

IMO, the IOMMU layer should concern itself with ACS only insofar as
isolation is concerned, report the reserved ranges that are imposed at the
IOMMU itself, and leave any aliasing or routing issues in the downstream
topology to other layers, or perhaps to the user.  Unfortunately, enforcing
the iova list in vfio is gated on some movement here, since we can't break
existing users.  Thanks,

Alex


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-03-02 16:04                     ` Alex Williamson
@ 2018-03-02 17:16                       ` Robin Murphy
  2018-03-05 11:44                         ` Shameerali Kolothum Thodi
  2018-03-08  9:35                         ` Shameerali Kolothum Thodi
  0 siblings, 2 replies; 28+ messages in thread
From: Robin Murphy @ 2018-03-02 17:16 UTC (permalink / raw)
  To: Alex Williamson, Shameerali Kolothum Thodi
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

On 02/03/18 16:04, Alex Williamson wrote:
[...]
>>> I still think you're overstretching the IOMMU reserved region interface
>>> by trying to report possible ACS conflicts.  Are we going to update the
>>> reserved list on device hotplug?  Are we going to update the list when
>>> MMIO is enabled or disabled for each device?  What if the BARs are
>>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
>>> should account for specific IOVA exclusions that apply to transactions
>>> that actually reach the IOMMU, not aliasing that might occur in the
>>> downstream topology.  Additionally, the IOMMU group composition must be
>>> such that these sorts of aliasing issues can only occur within an IOMMU
>>> group.  If a transaction can be affected by the MMIO programming of
>>> another group, then the groups are drawn incorrectly for the isolation
>>> capabilities of the hardware.  Thanks,
>>
>> Agree that this is not a perfect thing to do covering all scenarios. As Robin
>> pointed out, the current code is playing safe by reserving the full windows.
>>
>> My suggestion is to limit this reservation to non-ACS cases only.  This will
>> make sure that ACS-capable ARM hardware is not broken by this series, and
>> non-ACS ones have a chance to run once QEMU is updated to take care of the
>> reserved regions (at least in some user scenarios).
>>
>> If you all are fine with this, I can include the ACS check I mentioned earlier in
>> iommu_dma_get_resv_regions() and send out the revised series.
>>
>> Please let me know your thoughts.
> 
> IMO, the IOMMU should be concerned with ACS as far as isolation is
> concerned and reporting reserved ranges that are imposed at the IOMMU
> and leave any aliasing or routing issues in the downstream topology to
> other layers, or perhaps to the user.  Unfortunately, enforcing the
> iova list in vfio is gated by some movement here since we can't break
> existing users.  Thanks,

FWIW, given the discussion we've had here I wouldn't object to pushing 
the PCI window reservation back into the DMA-specific path, such that it 
doesn't get exposed via the general IOMMU API interface. We *do* want to 
do it there where we are in total control of our own address space and 
it just avoids a whole class of problems (even with ACS I'm not sure the 
root complex can be guaranteed to send a "bad" IOVA out to the SMMU 
instead of just terminating it for looking like a peer-to-peer attempt).

I do agree that it's not scalable for the IOMMU layer to attempt to 
detect and describe upstream PCI limitations to userspace by itself - 
they are "reserved regions" rather than "may or may not work regions" 
after all. If there is a genuine integration issue between an IOMMU and 
an upstream PCI RC which restricts usable addresses on a given system, 
that probably needs to be explicitly communicated from firmware to the 
IOMMU driver anyway, at which point that driver can report the relevant 
region(s) directly from its own callback.

I suppose there's an in-between option of keeping generic window 
reservations but giving them a new "only reserve this if you're being 
super-cautious or don't know better" region type which we hide from 
userspace and ignore in VFIO, but maybe that leaves the lines a bit too 
blurred still.

Robin.


* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-03-02 17:16                       ` Robin Murphy
@ 2018-03-05 11:44                         ` Shameerali Kolothum Thodi
  2018-03-14 18:12                           ` Robin Murphy
  2018-03-08  9:35                         ` Shameerali Kolothum Thodi
  1 sibling, 1 reply; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-03-05 11:44 UTC (permalink / raw)
  To: Robin Murphy, Alex Williamson
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Robin,

> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy@arm.com]
> Sent: Friday, March 02, 2018 5:17 PM
> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
> Thodi <shameerali.kolothum.thodi@huawei.com>
> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> iova range
> 
> On 02/03/18 16:04, Alex Williamson wrote:
> [...]
> >>> I still think you're overstretching the IOMMU reserved region interface
> >>> by trying to report possible ACS conflicts.  Are we going to update the
> >>> reserved list on device hotplug?  Are we going to update the list when
> >>> MMIO is enabled or disabled for each device?  What if the BARs are
> >>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved
> list
> >>> should account for specific IOVA exclusions that apply to transactions
> >>> that actually reach the IOMMU, not aliasing that might occur in the
> >>> downstream topology.  Additionally, the IOMMU group composition must
> be
> >>> such that these sorts of aliasing issues can only occur within an IOMMU
> >>> group.  If a transaction can be affected by the MMIO programming of
> >>> another group, then the groups are drawn incorrectly for the isolation
> >>> capabilities of the hardware.  Thanks,
> >>
> >> Agree that this is not a perfect thing to do covering all scenarios. As Robin
> >> pointed out, the current code is playing safe by reserving the full windows.
> >>
> >> My suggestion is to limit this reservation to non-ACS cases only.  This
> will
> >> make sure that ACS-capable ARM hardware is not broken by this series, and
> >> non-ACS ones have a chance to run once QEMU is updated to take care of the
> >> reserved regions (at least in some user scenarios).
> >>
> >> If you all are fine with this, I can include the ACS check I mentioned earlier
> in
> >> iommu_dma_get_resv_regions() and send out the revised series.
> >>
> >> Please let me know your thoughts.
> >
> > IMO, the IOMMU should be concerned with ACS as far as isolation is
> > concerned and reporting reserved ranges that are imposed at the IOMMU
> > and leave any aliasing or routing issues in the downstream topology to
> > other layers, or perhaps to the user.  Unfortunately, enforcing the
> > iova list in vfio is gated by some movement here since we can't break
> > existing users.  Thanks,
> 
> FWIW, given the discussion we've had here I wouldn't object to pushing
> the PCI window reservation back into the DMA-specific path, such that it
> doesn't get exposed via the general IOMMU API interface. We *do* want to
> do it there where we are in total control of our own address space and
> it just avoids a whole class of problems (even with ACS I'm not sure the
> root complex can be guaranteed to send a "bad" IOVA out to the SMMU
> instead of just terminating it for looking like a peer-to-peer attempt).
> 
> I do agree that it's not scalable for the IOMMU layer to attempt to
> detect and describe upstream PCI limitations to userspace by itself -
> they are "reserved regions" rather than "may or may not work regions"
> after all. If there is a genuine integration issue between an IOMMU and
> an upstream PCI RC which restricts usable addresses on a given system,
> that probably needs to be explicitly communicated from firmware to the
> IOMMU driver anyway, at which point that driver can report the relevant
> region(s) directly from its own callback.
> 
> I suppose there's an in-between option of keeping generic window
> reservations but giving them a new "only reserve this if you're being
> super-cautious or don't know better" region type which we hide from
> userspace and ignore in VFIO, but maybe that leaves the lines a bit too
> blurred still.

Thanks for your reply and the details. I have made an attempt to move the PCI
window reservation back into the DMA-mapping path. Could you please take a look
at the changes below and let me know what you think?
(This is on top of the HW MSI reserve patches, which are now part of linux-next.)

Thanks,
Shameer

-->8--
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..ddcbbdb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
  * @list: Reserved region list from iommu_get_resv_regions()
  *
  * IOMMU drivers can use this to implement their .get_resv_regions callback
- * for general non-IOMMU-specific reservations. Currently, this covers host
- * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
- * based ARM platforms that may require HW MSI reservation.
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
  */
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
-	struct pci_host_bridge *bridge;
-	struct resource_entry *window;
-
-	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
-		iort_iommu_msi_get_resv_regions(dev, list) < 0)
-		return;
-
-	if (!dev_is_pci(dev))
-		return;
-
-	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
-	resource_list_for_each_entry(window, &bridge->windows) {
-		struct iommu_resv_region *region;
-		phys_addr_t start;
-		size_t length;
-
-		if (resource_type(window->res) != IORESOURCE_MEM)
-			continue;
 
-		start = window->res->start - window->offset;
-		length = window->res->end - window->res->start + 1;
-		region = iommu_alloc_resv_region(start, length, 0,
-				IOMMU_RESV_RESERVED);
-		if (!region)
-			return;
+	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+		iort_iommu_msi_get_resv_regions(dev, list);
 
-		list_add_tail(&region->list, list);
-	}
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
 
@@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
 	return 0;
 }
 
+static void iova_reserve_pci_windows(struct pci_dev *dev,
+		struct iova_domain *iovad)
+{
+	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+	struct resource_entry *window;
+	unsigned long lo, hi;
+
+	resource_list_for_each_entry(window, &bridge->windows) {
+		if (resource_type(window->res) != IORESOURCE_MEM)
+			continue;
+
+		lo = iova_pfn(iovad, window->res->start - window->offset);
+		hi = iova_pfn(iovad, window->res->end - window->offset);
+		reserve_iova(iovad, lo, hi);
+	}
+}
+
 static int iova_reserve_iommu_regions(struct device *dev,
 		struct iommu_domain *domain)
 {
@@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,
 	LIST_HEAD(resv_regions);
 	int ret = 0;
 
+	if (dev_is_pci(dev))
+		iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+
 	iommu_get_resv_regions(dev, &resv_regions);
 	list_for_each_entry(region, &resv_regions, list) {
 		unsigned long lo, hi;


* RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-03-02 17:16                       ` Robin Murphy
  2018-03-05 11:44                         ` Shameerali Kolothum Thodi
@ 2018-03-08  9:35                         ` Shameerali Kolothum Thodi
  1 sibling, 0 replies; 28+ messages in thread
From: Shameerali Kolothum Thodi @ 2018-03-08  9:35 UTC (permalink / raw)
  To: Robin Murphy, Alex Williamson
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)



> -----Original Message-----
> From: Shameerali Kolothum Thodi
> Sent: Monday, March 05, 2018 11:44 AM
> To: 'Robin Murphy' <robin.murphy@arm.com>; Alex Williamson
> <alex.williamson@redhat.com>
> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;
> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
> <xuwei5@huawei.com>
> Subject: RE: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
> iova range

 [...]

> > >>> I still think you're overstretching the IOMMU reserved region interface
> > >>> by trying to report possible ACS conflicts.  Are we going to update the
> > >>> reserved list on device hotplug?  Are we going to update the list when
> > >>> MMIO is enabled or disabled for each device?  What if the BARs are
> > >>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
> > >>> should account for specific IOVA exclusions that apply to transactions
> > >>> that actually reach the IOMMU, not aliasing that might occur in the
> > >>> downstream topology.  Additionally, the IOMMU group composition must be
> > >>> such that these sorts of aliasing issues can only occur within an IOMMU
> > >>> group.  If a transaction can be affected by the MMIO programming of
> > >>> another group, then the groups are drawn incorrectly for the isolation
> > >>> capabilities of the hardware.  Thanks,
> > >>
> > >> Agree that this is not a perfect thing to do covering all scenarios. As Robin
> > >> pointed out, the current code is playing safe by reserving the full windows.
> > >>
> > >> My suggestion will be to limit this reservation to non-ACS cases only.  This will
> > >> make sure that ACS ARM hardware is not broken by this series and non-ACS
> > >> ones have a chance to run once Qemu is updated to take care of the reserved
> > >> regions (at least in some user scenarios).
> > >>
> > >> If you all are fine with this, I can include the ACS check I mentioned earlier in
> > >> iommu_dma_get_resv_regions() and send out the revised series.
> > >>
> > >> Please let me know your thoughts.
> > >
> > > IMO, the IOMMU should be concerned with ACS as far as isolation is
> > > concerned and reporting reserved ranges that are imposed at the IOMMU
> > > and leave any aliasing or routing issues in the downstream topology to
> > > other layers, or perhaps to the user.  Unfortunately, enforcing the
> > > iova list in vfio is gated by some movement here since we can't break
> > > existing users.  Thanks,
> >
> > FWIW, given the discussion we've had here I wouldn't object to pushing
> > the PCI window reservation back into the DMA-specific path, such that it
> > doesn't get exposed via the general IOMMU API interface. We *do* want to
> > do it there where we are in total control of our own address space and
> > it just avoids a whole class of problems (even with ACS I'm not sure the
> > root complex can be guaranteed to send a "bad" IOVA out to the SMMU
> > instead of just terminating it for looking like a peer-to-peer attempt).
> >
> > I do agree that it's not scalable for the IOMMU layer to attempt to
> > detect and describe upstream PCI limitations to userspace by itself -
> > they are "reserved regions" rather than "may or may not work regions"
> > after all. If there is a genuine integration issue between an IOMMU and
> > an upstream PCI RC which restricts usable addresses on a given system,
> > that probably needs to be explicitly communicated from firmware to the
> > IOMMU driver anyway, at which point that driver can report the relevant
> > region(s) directly from its own callback.
> >
> > I suppose there's an in-between option of keeping generic window
> > reservations but giving them a new "only reserve this if you're being
> > super-cautious or don't know better" region type which we hide from
> > userspace and ignore in VFIO, but maybe that leaves the lines a bit too
> > blurred still.
> 
> Thanks for your reply and details. I have made an attempt to revert the PCI
> window reservation back into the DMA path. Could you please take a look
> at the below changes and let me know.
> (This is on top of the HW MSI reserve patches which are now part of linux-next)

Hi Robin,

Please take a look at the below changes if you get some time and let me know.
Not sure this is what you had in mind.

Thanks,
Shameer

> 
> -->8--
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index f05f3cf..ddcbbdb 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
>   * @list: Reserved region list from iommu_get_resv_regions()
>   *
>   * IOMMU drivers can use this to implement their .get_resv_regions callback
> - * for general non-IOMMU-specific reservations. Currently, this covers host
> - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
> - * based ARM platforms that may require HW MSI reservation.
> + * for general non-IOMMU-specific reservations. Currently, this covers GICv3
> + * ITS region reservation on ACPI based ARM platforms that may require HW
> MSI
> + * reservation.
>   */
>  void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
>  {
> -	struct pci_host_bridge *bridge;
> -	struct resource_entry *window;
> -
> -	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
> -		iort_iommu_msi_get_resv_regions(dev, list) < 0)
> -		return;
> -
> -	if (!dev_is_pci(dev))
> -		return;
> -
> -	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
> -	resource_list_for_each_entry(window, &bridge->windows) {
> -		struct iommu_resv_region *region;
> -		phys_addr_t start;
> -		size_t length;
> -
> -		if (resource_type(window->res) != IORESOURCE_MEM)
> -			continue;
> 
> -		start = window->res->start - window->offset;
> -		length = window->res->end - window->res->start + 1;
> -		region = iommu_alloc_resv_region(start, length, 0,
> -				IOMMU_RESV_RESERVED);
> -		if (!region)
> -			return;
> +	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
> +		iort_iommu_msi_get_resv_regions(dev, list);
> 
> -		list_add_tail(&region->list, list);
> -	}
>  }
>  EXPORT_SYMBOL(iommu_dma_get_resv_regions);
> 
> @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct
> iommu_dma_cookie *cookie,
>  	return 0;
>  }
> 
> +static void iova_reserve_pci_windows(struct pci_dev *dev,
> +		struct iova_domain *iovad)
> +{
> +	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
> +	struct resource_entry *window;
> +	unsigned long lo, hi;
> +
> +	resource_list_for_each_entry(window, &bridge->windows) {
> +		if (resource_type(window->res) != IORESOURCE_MEM)
> +			continue;
> +
> +		lo = iova_pfn(iovad, window->res->start - window->offset);
> +		hi = iova_pfn(iovad, window->res->end - window->offset);
> +		reserve_iova(iovad, lo, hi);
> +	}
> +}
> +
>  static int iova_reserve_iommu_regions(struct device *dev,
>  		struct iommu_domain *domain)
>  {
> @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device
> *dev,
>  	LIST_HEAD(resv_regions);
>  	int ret = 0;
> 
> +	if (dev_is_pci(dev))
> +		iova_reserve_pci_windows(to_pci_dev(dev), iovad);
> +
>  	iommu_get_resv_regions(dev, &resv_regions);
>  	list_for_each_entry(region, &resv_regions, list) {
>  		unsigned long lo, hi;
> 
> 


* Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range
  2018-03-05 11:44                         ` Shameerali Kolothum Thodi
@ 2018-03-14 18:12                           ` Robin Murphy
  0 siblings, 0 replies; 28+ messages in thread
From: Robin Murphy @ 2018-03-14 18:12 UTC (permalink / raw)
  To: Shameerali Kolothum Thodi, Alex Williamson
  Cc: Auger Eric, pmorel, kvm, linux-kernel, Linuxarm, John Garry, xuwei (O)

Hi Shameer,

On 05/03/18 11:44, Shameerali Kolothum Thodi wrote:
> Hi Robin,
> 
>> -----Original Message-----
>> From: Robin Murphy [mailto:robin.murphy@arm.com]
>> Sent: Friday, March 02, 2018 5:17 PM
>> To: Alex Williamson <alex.williamson@redhat.com>; Shameerali Kolothum
>> Thodi <shameerali.kolothum.thodi@huawei.com>
>> Cc: Auger Eric <eric.auger@redhat.com>; pmorel@linux.vnet.ibm.com;
>> kvm@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
>> <linuxarm@huawei.com>; John Garry <john.garry@huawei.com>; xuwei (O)
>> <xuwei5@huawei.com>
>> Subject: Re: [PATCH v4 4/6] vfio/type1: check dma map request is within a valid
>> iova range
>>
>> On 02/03/18 16:04, Alex Williamson wrote:
>> [...]
>>>>> I still think you're overstretching the IOMMU reserved region interface
>>>>> by trying to report possible ACS conflicts.  Are we going to update the
>>>>> reserved list on device hotplug?  Are we going to update the list when
>>>>> MMIO is enabled or disabled for each device?  What if the BARs are
>>>>> reprogrammed or bridge apertures changed?  IMO, the IOMMU reserved list
>>>>> should account for specific IOVA exclusions that apply to transactions
>>>>> that actually reach the IOMMU, not aliasing that might occur in the
>>>>> downstream topology.  Additionally, the IOMMU group composition must be
>>>>> such that these sorts of aliasing issues can only occur within an IOMMU
>>>>> group.  If a transaction can be affected by the MMIO programming of
>>>>> another group, then the groups are drawn incorrectly for the isolation
>>>>> capabilities of the hardware.  Thanks,
>>>>
>>>> Agree that this is not a perfect thing to do covering all scenarios. As Robin
>>>> pointed out, the current code is playing safe by reserving the full windows.
>>>>
>>>> My suggestion will be to limit this reservation to non-ACS cases only.  This will
>>>> make sure that ACS ARM hardware is not broken by this series and non-ACS
>>>> ones have a chance to run once Qemu is updated to take care of the reserved
>>>> regions (at least in some user scenarios).
>>>>
>>>> If you all are fine with this, I can include the ACS check I mentioned earlier in
>>>> iommu_dma_get_resv_regions() and send out the revised series.
>>>>
>>>> Please let me know your thoughts.
>>>
>>> IMO, the IOMMU should be concerned with ACS as far as isolation is
>>> concerned and reporting reserved ranges that are imposed at the IOMMU
>>> and leave any aliasing or routing issues in the downstream topology to
>>> other layers, or perhaps to the user.  Unfortunately, enforcing the
>>> iova list in vfio is gated by some movement here since we can't break
>>> existing users.  Thanks,
>>
>> FWIW, given the discussion we've had here I wouldn't object to pushing
>> the PCI window reservation back into the DMA-specific path, such that it
>> doesn't get exposed via the general IOMMU API interface. We *do* want to
>> do it there where we are in total control of our own address space and
>> it just avoids a whole class of problems (even with ACS I'm not sure the
>> root complex can be guaranteed to send a "bad" IOVA out to the SMMU
>> instead of just terminating it for looking like a peer-to-peer attempt).
>>
>> I do agree that it's not scalable for the IOMMU layer to attempt to
>> detect and describe upstream PCI limitations to userspace by itself -
>> they are "reserved regions" rather than "may or may not work regions"
>> after all. If there is a genuine integration issue between an IOMMU and
>> an upstream PCI RC which restricts usable addresses on a given system,
>> that probably needs to be explicitly communicated from firmware to the
>> IOMMU driver anyway, at which point that driver can report the relevant
>> region(s) directly from its own callback.
>>
>> I suppose there's an in-between option of keeping generic window
>> reservations but giving them a new "only reserve this if you're being
>> super-cautious or don't know better" region type which we hide from
>> userspace and ignore in VFIO, but maybe that leaves the lines a bit too
>> blurred still.
> 
> Thanks for your reply and details. I have made an attempt to revert the PCI
> window reservation back into the DMA path. Could you please take a look
> at the below changes and let me know.
> (This is on top of the HW MSI reserve patches which are now part of linux-next)

Thanks for putting this together - seeing what's left after the patch 
makes me feel half-tempted to effectively revert 273df9635385 altogether 
and just call iort_iommu_msi_get_resv_regions() directly from SMMUv3, 
but there are other things like dma-ranges constraints which we may also 
want to handle generically somewhere so it's probably worth keeping 
iommu_dma_get_resv_regions() around as a common hook for now. In future 
it might ultimately make sense to move it out to something like 
iommu_get_fw_resv_regions(), but that can be a discussion for another 
day; right now it seems this ought to be enough to resolve everyone's 
immediate concerns.

Cheers,
Robin.

> 
> Thanks,
> Shameer
> 
> -->8--
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index f05f3cf..ddcbbdb 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
>    * @list: Reserved region list from iommu_get_resv_regions()
>    *
>    * IOMMU drivers can use this to implement their .get_resv_regions callback
> - * for general non-IOMMU-specific reservations. Currently, this covers host
> - * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
> - * based ARM platforms that may require HW MSI reservation.
> + * for general non-IOMMU-specific reservations. Currently, this covers GICv3
> + * ITS region reservation on ACPI based ARM platforms that may require HW MSI
> + * reservation.
>    */
>   void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
>   {
> -	struct pci_host_bridge *bridge;
> -	struct resource_entry *window;
> -
> -	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
> -		iort_iommu_msi_get_resv_regions(dev, list) < 0)
> -		return;
> -
> -	if (!dev_is_pci(dev))
> -		return;
> -
> -	bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
> -	resource_list_for_each_entry(window, &bridge->windows) {
> -		struct iommu_resv_region *region;
> -		phys_addr_t start;
> -		size_t length;
> -
> -		if (resource_type(window->res) != IORESOURCE_MEM)
> -			continue;
>   
> -		start = window->res->start - window->offset;
> -		length = window->res->end - window->res->start + 1;
> -		region = iommu_alloc_resv_region(start, length, 0,
> -				IOMMU_RESV_RESERVED);
> -		if (!region)
> -			return;
> +	if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
> +		iort_iommu_msi_get_resv_regions(dev, list);
>   
> -		list_add_tail(&region->list, list);
> -	}
>   }
>   EXPORT_SYMBOL(iommu_dma_get_resv_regions);
>   
> @@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie,
>   	return 0;
>   }
>   
> +static void iova_reserve_pci_windows(struct pci_dev *dev,
> +		struct iova_domain *iovad)
> +{
> +	struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
> +	struct resource_entry *window;
> +	unsigned long lo, hi;
> +
> +	resource_list_for_each_entry(window, &bridge->windows) {
> +		if (resource_type(window->res) != IORESOURCE_MEM)
> +			continue;
> +
> +		lo = iova_pfn(iovad, window->res->start - window->offset);
> +		hi = iova_pfn(iovad, window->res->end - window->offset);
> +		reserve_iova(iovad, lo, hi);
> +	}
> +}
> +
>   static int iova_reserve_iommu_regions(struct device *dev,
>   		struct iommu_domain *domain)
>   {
> @@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,
>   	LIST_HEAD(resv_regions);
>   	int ret = 0;
>   
> +	if (dev_is_pci(dev))
> +		iova_reserve_pci_windows(to_pci_dev(dev), iovad);
> +
>   	iommu_get_resv_regions(dev, &resv_regions);
>   	list_for_each_entry(region, &resv_regions, list) {
>   		unsigned long lo, hi;
> 
> 
> 


end of thread, other threads:[~2018-03-14 18:12 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-21 12:22 [PATCH v4 0/6] vfio/type1: Add support for valid iova list management Shameer Kolothum
2018-02-21 12:22 ` [PATCH v4 1/6] vfio/type1: Introduce iova list and add iommu aperture validity check Shameer Kolothum
2018-02-21 12:22 ` [PATCH v4 2/6] vfio/type1: Check reserve region conflict and update iova list Shameer Kolothum
2018-02-21 12:22 ` [PATCH v4 3/6] vfio/type1: Update iova list on detach Shameer Kolothum
2018-02-21 12:22 ` [PATCH v4 4/6] vfio/type1: check dma map request is within a valid iova range Shameer Kolothum
2018-02-26 22:05   ` Auger Eric
2018-02-26 23:13     ` Alex Williamson
2018-02-27  8:26       ` Auger Eric
2018-02-27  9:57         ` Shameerali Kolothum Thodi
2018-02-27 17:13           ` Alex Williamson
2018-02-28  9:02           ` Auger Eric
2018-02-28  9:25             ` Shameerali Kolothum Thodi
2018-02-28 11:53               ` Auger Eric
2018-02-28 13:39                 ` Shameerali Kolothum Thodi
2018-02-28 15:32                   ` Auger Eric
2018-02-28 15:24                 ` Alex Williamson
2018-03-02 13:19                   ` Shameerali Kolothum Thodi
2018-03-02 16:04                     ` Alex Williamson
2018-03-02 17:16                       ` Robin Murphy
2018-03-05 11:44                         ` Shameerali Kolothum Thodi
2018-03-14 18:12                           ` Robin Murphy
2018-03-08  9:35                         ` Shameerali Kolothum Thodi
2018-02-27 16:57         ` Alex Williamson
2018-02-27 12:40       ` Robin Murphy
2018-02-21 12:22 ` [PATCH v4 5/6] vfio/type1: Add IOVA range capability support Shameer Kolothum
2018-02-22 22:54   ` Alex Williamson
2018-02-23 10:56     ` Shameerali Kolothum Thodi
2018-02-21 12:22 ` [PATCH v4 6/6] vfio/type1: remove duplicate retrieval of reserved regions Shameer Kolothum
